# Exploratory Data Analysis

In [1]:
import re
import sys
from xml.etree import ElementTree

import folium
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import progressbar
import seaborn as sns
from mpl_toolkits.axes_grid.anchored_artists import AnchoredText

from anonymizingdata import AnonymizingData

%matplotlib inline

In [2]:
# Global matplotlib typography: a 20pt base font, with title, axis-label,
# tick-label and legend sizes expressed relative to it.
mpl.rcParams['font.size'] = 20.0
mpl.rcParams['axes.titlesize'] = 'large'
mpl.rcParams['axes.labelsize'] = 'medium'
for tick_axis in ('xtick', 'ytick'):
    mpl.rcParams[tick_axis + '.labelsize'] = 'small'
mpl.rcParams['legend.fontsize'] = 'small'

In [3]:
# Force pandas & numpy to display all data (no row/column/array truncation).
# Option names are fully qualified so they cannot become ambiguous.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_info_columns', 100000)
pd.set_option('display.max_seq_items', None)
# The threshold has to be a real number: NaN is rejected by current NumPy,
# and sys.maxsize is the documented way to switch summarisation off.
np.set_printoptions(threshold=sys.maxsize)

In [4]:
# Importing hidden anonymous functions
# `hidden` is used further down through hidden.licenses(),
# hidden.column_names() and hidden.test_emails(); their implementations live
# in the anonymizingdata module and are not shown in this notebook.
hidden = AnonymizingData()

## Meta Data Cleaning

In [None]:
# Load the raw EDD meta-data export. error_bad_lines=False tells pandas to
# skip rows it cannot parse instead of raising.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 in favour of
# on_bad_lines='skip' -- confirm the installed pandas version before changing.
meta_df = pd.read_csv('../data/EDD/meta-data.csv', error_bad_lines=False)

In [6]:
# Cutting rows that are random code: keep a row only when its meta_id is an
# integer or an all-digit string.
def _is_numeric_id(value):
    """Return True for integer values and all-digit strings, False otherwise.

    Anything without an ``isdigit`` method (for example a float NaN from a
    missing cell) is treated as non-numeric instead of raising.
    """
    if isinstance(value, (int, np.integer)):
        return True
    return hasattr(value, 'isdigit') and value.isdigit()


# An explicit boolean array keeps this a row mask even when the frame is
# empty (an empty plain list would be read as a column selection).
numeric_id_mask = np.array([_is_numeric_id(x) for x in meta_df['meta_id']], dtype=bool)
meta_df = meta_df[numeric_id_mask]

In [7]:
# Post ids of payments flagged as renewals, i.e. every meta row whose key is
# '_edd_sl_is_renewal'.
renewal_rows = meta_df['meta_key'] == '_edd_sl_is_renewal'
renewals = list(meta_df.loc[renewal_rows, 'post_id'])

In [8]:
# Creating df of active/inactive statuses: one row per '_edd_sl_status' meta
# record, with the meta value exposed as 'license_status'.
# TODO (from the original author): come back to this problem.
# Selecting both columns in one step avoids filtering twice and does not rely
# on the DataFrame constructor accepting a zip iterator.
status_rows = meta_df['meta_key'] == '_edd_sl_status'
status_df = (
    meta_df.loc[status_rows, ['post_id', 'meta_value']]
    .rename(columns={'meta_value': 'license_status'})
    .reset_index(drop=True)
)

## Customer Revenue Data Cleaning

In [9]:
# Load the EDD customer revenue export; this frame is the left-hand side of
# every join in the combined dataset built later on.
revenue_df = pd.read_csv('../data/EDD/customer-revenue.csv')

In [10]:
# Tag every column with its data source and normalise the label: trailing
# whitespace removed, lower-cased, spaces turned into underscores.
revenue_df.columns = [
    ('revenue:' + label).strip().lower().replace(" ", "_")
    for label in revenue_df.columns
]

In [11]:
# Parsing email to get domain: everything after the first '@' (an address
# without '@' gives an empty string, as before).
# Cells that are not strings (e.g. NaN for a missing email) now become NaN
# instead of raising AttributeError.
revenue_df['revenue:domain'] = [
    email.partition('@')[2] if hasattr(email, 'partition') else np.nan
    for email in revenue_df['revenue:email']
]

## Transaction Data Cleaning

In [12]:
# Load the pickled transaction table (file name: all_transactions_google_EDD).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
EDD_df = pd.read_pickle('../data/google-analytics/all_transactions_google_EDD')

In [13]:
# Restrict the transactions to those whose status is exactly 'complete'.
is_complete = EDD_df['EDD:Status'].eq('complete')
EDD_df = EDD_df[is_complete]

In [14]:
# Keeping payments that are not renewals only.
# isin() replaces the per-row scan of the `renewals` list, and filtering with
# the mask directly avoids adding and then dropping a helper column on a
# filtered slice of the frame.
# NOTE(review): this assumes 'EDD:Payment ID' and the meta 'post_id' values
# share a dtype (both int or both str) -- confirm, otherwise nothing matches.
is_renewal = EDD_df['EDD:Payment ID'].isin(renewals)
EDD_df = EDD_df[~is_renewal]

In [15]:
# Remove rows that were paid with the 'Test Payment' method.
is_real_payment = EDD_df['EDD:Payment Method'].ne('Test Payment')
EDD_df = EDD_df[is_real_payment]

In [16]:
# Create licenses variable: for every order, the set of tokens that directly
# precede the word "License" in the verbose product description.
# Raw string so \S and \s are regex escapes, not (invalid) string escapes.
license_pattern = re.compile(r"(\S+)\s*License")
licenses = []
for item in EDD_df['EDD:Products (Verbose)']:
    try:
        licenses.append(set(license_pattern.findall(item)))
    except TypeError:
        # Non-string cell (e.g. NaN for a missing description): no licenses.
        licenses.append(set())
EDD_df['EDD:licenses'] = licenses

# Hidden anonymizing function for different licenses
EDD_df = hidden.licenses(EDD_df)

In [17]:
# Boolean flag: True whenever the discount code differs from the literal
# string 'none'.
# NOTE(review): a missing (NaN) discount code also compares unequal to 'none'
# and is therefore flagged True -- confirm that this is intended.
EDD_df['EDD:used_code'] = EDD_df['EDD:Discount Code'].ne('none')

In [18]:
# Grouping by customer (using email) & summing amounts spent.
# Only the amount column is aggregated (summing every column fails or
# misbehaves on text columns), and as_index=False keeps 'EDD:Email' as an
# ordinary column so the merge key is never both an index level and a column.
total_df = (
    EDD_df.groupby('EDD:Email', as_index=False)['EDD:Amount ($)']
    .sum()
    .rename(columns={'EDD:Amount ($)': 'EDD:total_spent'})
)
# Attach the per-customer total to every transaction row
EDD_df = pd.merge(EDD_df, total_df, how='left', on='EDD:Email')
# Dropping the amount per transaction and only having the total spent per customer variable
EDD_df = EDD_df.drop('EDD:Amount ($)', axis=1)

In [19]:
# Making dummy variables for licenses and payment methods
temp_df = EDD_df[['EDD:Email', 'EDD:licenses', 'EDD:Payment Method']]
temp_df = pd.get_dummies(temp_df, dummy_na=True,
                         columns=['EDD:licenses', 'EDD:Payment Method'])
# Consolidating back to one row per customer email: max() marks a dummy as 1
# if any of the customer's rows had it. as_index=False keeps 'EDD:Email' as a
# plain column, so the merge key is not simultaneously an index level.
temp_df = temp_df.groupby('EDD:Email', as_index=False).max()
# Merging data back in
EDD_df = pd.merge(EDD_df, temp_df, how='left', on='EDD:Email')
# Dropping original columns
EDD_df = EDD_df.drop(['EDD:licenses', 'EDD:Payment Method'], axis=1)

In [20]:
# Normalise the column labels: surrounding whitespace removed, lower-cased,
# spaces turned into underscores.
EDD_df.columns = [
    label.strip().lower().replace(" ", "_")
    for label in EDD_df.columns
]

In [21]:
# Collapse to one row per email, taking first() within each group.
# TODO (from the original author): figure out a better way of doing this.
EDD_df = EDD_df.groupby('edd:email', as_index=False).first()

## Intercom Data Cleaning

In [22]:
# Load the Intercom export (file intercom-9-1-17.csv).
intercom_df = pd.read_csv('../data/intercom/intercom-9-1-17.csv')

In [23]:
# Removing duplicate records: order by 'First Seen (MDT)' and collapse to one
# row per email with first().
# sort_values() returns a new frame, so the result has to be assigned --
# otherwise the ordering is silently discarded.
# NOTE(review): the column is read from CSV without date parsing, so this is
# presumably a string sort -- confirm the timestamp format sorts correctly.
intercom_df = intercom_df.sort_values('First Seen (MDT)')
intercom_df = intercom_df.groupby('Email').first().reset_index()

In [24]:
def _intercom_label(label):
    """Prefix a column label with 'intercom:' and normalise it."""
    return ('intercom:' + label).strip().lower().replace(" ", "_")


# Tag every column with its data source
intercom_df.rename(columns=_intercom_label, inplace=True)

## Drip Data Cleaning

In [25]:
# Load the Drip subscribers export.
drip_df = pd.read_csv('../data/drip/drip-subscribers.csv')

In [26]:
# Tag every column with its data source and normalise the label: trailing
# whitespace removed, lower-cased, spaces turned into underscores.
drip_df.columns = [
    ('drip:' + label).strip().lower().replace(" ", "_")
    for label in drip_df.columns
]

## HubSpot Customer Data Cleaning

In [27]:
# Load the HubSpot contacts export (all-contacts view, file dated 2017-09-02).
hub_cust_df = pd.read_csv('../data/hubspot/hubspot-crm-view-contacts-all-contacts-2017-09-02.csv')

In [28]:
def _hubcust_label(label):
    """Prefix a column label with 'hubcust:' and normalise it."""
    return ('hubcust:' + label).strip().lower().replace(" ", "_")


# Tag every column with its data source
hub_cust_df.rename(columns=_hubcust_label, inplace=True)

## HubSpot Company Data Cleaning

In [29]:
# Load the HubSpot companies export (all-companies view, file dated 2017-09-02).
hub_comp_df = pd.read_csv('../data/hubspot/hubspot-crm-view-companies-all-companies-2017-09-02.csv')

In [30]:
# Tag every column with its data source and normalise the label: trailing
# whitespace removed, lower-cased, spaces turned into underscores.
hub_comp_df.columns = [
    ('hubcomp:' + label).strip().lower().replace(" ", "_")
    for label in hub_comp_df.columns
]

## Mechanical Turk Data Cleaning

In [31]:
# Load the Mechanical Turk survey results.
turk_df = pd.read_csv('../data/mechanical-turk/MechanicalTurkData.csv')

In [32]:
# Keep only respondents who answered 'yes' to both the access question and
# the page-load question.
could_access = turk_df['Answer.can_access'] == 'yes'
page_loaded = turk_df['Answer.does_it_load'] == 'yes'
turk_df = turk_df[could_access & page_loaded]

In [33]:
# Pulling out the only three variables that are useful.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not act on a slice of the filtered data
# (SettingWithCopyWarning).
turk_df = turk_df[['Input.website_url', 'Answer.industry', 'Answer.well-made']].copy()

In [34]:
# Pulling the website domain out of the website URL for use in joining later
def _url_to_domain(url):
    """Return the host part of ``url`` without a leading 'www.'.

    - Missing values are passed through unchanged instead of raising.
    - A URL without a '//' scheme separator is treated as starting with the
      host (previously it produced an empty domain).
    - Only a leading 'www.' is removed; the same text elsewhere in the host
      is left alone.
    """
    if pd.isnull(url):
        return url
    _, separator, remainder = url.partition('//')
    host = remainder if separator else url
    host = host.partition('/')[0]
    if host.startswith('www.'):
        host = host[len('www.'):]
    return host


turk_df['turk_domain'] = [_url_to_domain(item) for item in turk_df['Input.website_url']]

In [35]:
# Making an average rating variable: mean 'Answer.well-made' per website URL.
# Only the rating column is averaged (a frame-wide mean() trips over the text
# columns), and as_index=False keeps the URL as a plain column so the merge
# key is not also an index level.
avg_df = turk_df.groupby('Input.website_url', as_index=False)['Answer.well-made'].mean()
# Both frames carry 'Answer.well-made', so the merge suffixes them: _x is the
# individual rating, _y the per-URL average.
turk_df = pd.merge(turk_df, avg_df, how='left', on='Input.website_url')
# Keep only the average
turk_df = turk_df.drop('Answer.well-made_x', axis=1)

In [36]:
# One-hot encode the industry answer: the first level is dropped and an extra
# indicator column is added for missing values.
turk_df = pd.get_dummies(
    data=turk_df,
    columns=['Answer.industry'],
    dummy_na=True,
    drop_first=True,
)

In [37]:
def _turk_label(label):
    """Prefix a column label with 'turk:' and normalise it."""
    return ('turk:' + label).strip().lower().replace(" ", "_")


# Tag every column with its data source
turk_df.rename(columns=_turk_label, inplace=True)

## HelpScout Data Cleaning

In [38]:
# Load the pickled HelpScout table (file name: helpscout_simplified).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
help_scout_df = pd.read_pickle('../data/help-scout/helpscout_simplified')

In [39]:
# Tag every column with its data source and normalise the label: trailing
# whitespace removed, lower-cased, spaces turned into underscores.
help_scout_df.columns = [
    ('helpscout:' + label).strip().lower().replace(" ", "_")
    for label in help_scout_df.columns
]

In [40]:
# Taking emails out of list form
def _first_email(value):
    """Return the first element of a non-empty list-like cell.

    Everything else -- empty containers, missing values, and plain strings --
    is returned unchanged. Strings are deliberately not indexed: doing so
    would reduce an address to its first character.
    """
    if isinstance(value, (list, tuple, np.ndarray)) and len(value) > 0:
        return value[0]
    return value


help_scout_df['helpscout:emails'] = [_first_email(x) for x in help_scout_df['helpscout:emails']]
# Fixing unicode problem
help_scout_df['helpscout:emails'] = help_scout_df['helpscout:emails'].astype(str)

## Joined Single Dataset Cleaning & EDA

In [76]:
# Joining all datasets: revenue_df is the base table and every other source
# is left-joined onto it, so no revenue row is lost.
df = revenue_df.merge(EDD_df, how='left', left_on='revenue:id', right_on='edd:customer_id')

# (right-hand frame, key on the revenue side, key on the right-hand side)
left_joins = [
    (intercom_df, 'revenue:email', 'intercom:email'),
    (drip_df, 'revenue:email', 'drip:email'),
    (hub_cust_df, 'revenue:email', 'hubcust:email'),                 # 41% null
    (hub_comp_df, 'revenue:domain', 'hubcomp:company_domain_name'),  # 53% null
    (turk_df, 'revenue:domain', 'turk:turk_domain'),                 # 88% null
    (help_scout_df, 'revenue:email', 'helpscout:emails'),            # 21% null
]
for right_frame, revenue_key, right_key in left_joins:
    df = df.merge(right_frame, how='left', left_on=revenue_key, right_on=right_key)

In [77]:
# Collapse rows duplicated by the joins: one row per revenue email, taking
# first() within each group.
df = df.groupby('revenue:email', as_index=False).first()

In [78]:
# Renaming column names for anonymity
# The mapping is applied by the hidden helper and is not visible here; later
# cells still address columns such as 'revenue:email' after this call.
df = hidden.column_names(df)

In [79]:
# Pulling out test rows
# `tests` is used in the next cell as a container for membership checks
# against 'revenue:email'.
# NOTE(review): its exact type is decided by the hidden helper -- confirm.
tests = hidden.test_emails(df)

In [80]:
# Dropping all test rows from the df.
# The membership mask is applied directly instead of being written to a
# helper column, compared with `== False`, and dropped in place from a
# filtered slice (which raises SettingWithCopyWarning).
is_test_row = np.array([email in tests for email in df['revenue:email']], dtype=bool)
df = df[~is_test_row].copy()

In [None]:
# Mapping customers
# Rows with a missing latitude OR longitude are removed (the old check only
# looked at the latitude, so a missing longitude reached folium.Marker), and
# rows whose latitude is exactly 0.0 are skipped as before.
locations = df[['ga:latitude', 'ga:longitude']].dropna()
locations = locations[locations['ga:latitude'] != 0.0]
locationlist = locations.values.tolist()
cust_map = folium.Map()
for point in locationlist:
    folium.Marker(point).add_to(cust_map)
cust_map

In [160]:
# Splitting into train df and test df
TEST_FRACTION = 0.3   # share of rows held out for testing
SPLIT_SEED = 42       # fixed seed so the split (and everything derived from it) is reproducible

# A dedicated RandomState keeps the shuffle independent of the global NumPy
# random state.
# NOTE: `df` itself is replaced by its shuffled version, so re-running this
# cell without re-running the cells above shuffles already shuffled data.
split_rng = np.random.RandomState(SPLIT_SEED)
df = df.iloc[split_rng.permutation(df.shape[0])]
split_index = int(round(TEST_FRACTION * df.shape[0]))
test_df = df.iloc[:split_index].copy()
train_df = df.iloc[split_index:].copy()

In [161]:
# (rows, columns) of the training split before any columns are removed
train_df.shape

(3062, 689)

In [None]:
# Share of missing values per feature, drawn as one horizontal bar per column
null_share = train_df.isnull().sum() / float(train_df.shape[0])

# Labels and bar lengths for the plot (the single frame column is named 0)
nulls = null_share.to_frame()
categories = list(nulls.index)
values = list(nulls[0])

# Very tall canvas so each feature gets a readable bar
fig, ax = plt.subplots(figsize=(15, 100))
ax.set_xlabel('Proportion of Null Values')
ax.set_title('Proportion of Null Values by Feature')
sns.barplot(y=categories, x=values, orient='h', ax=ax, palette='pastel')
plt.savefig('../images/null_values.png', dpi=300)

In [None]:
# Candidate identifier columns: any column whose name contains 'id', 'email'
# or 'name' (plain substring match, so this list needs manual review).
identifier_tokens = ('id', 'email', 'name')
lst = [
    column
    for column in train_df.columns
    if any(token in column for token in identifier_tokens)
]
lst

In [162]:
# Identifier-like columns (names, usernames, ids, emails, addresses, ...)
# removed so they are not used as features. The list is kept in its original
# order, repeated entries included.
identifier_columns = [
    'edd:first_name',
    'edd:last_name',
    'unnamed:_0_x',
    'unnamed:_0_y',
    'intercom:first_name',
    'intercom:last_name',
    'intercom:name',
    'intercom:anonymous_company_username',
    'intercom:username',
    'intercom:firstname',
    'intercom:lastname',
    'intercom:company_name',
    'drip:first_name',
    'drip:firstname',
    'drip:firstname',
    'drip:last_name',
    'drip:lastname',
    'drip:lastname',
    'drip:name',
    'drip:username',
    'hubcust:first_name',
    'hubcust:last_name',
    'hubcust:last_email_name',
    'hubcust:twitter_username',
    'hubcust:company_name',
    'hubcust:skype_username',
    'hubcomp:company_domain_name',
    'hubcomp:name',
    'edd:payment_id',
    'edd:customer_id',
    'edd:transaction_id',
    'ga:transactionid',
    'intercom:user_id',
    'intercom:stripe_id',
    'intercom:affiliate_id',
    'intercom:article_id',
    'intercom:company_id',
    'drip:id',
    'drip:id.1',
    'hubcust:contact_id',
    'hubcomp:company_id',
    'edd:payment_number',
    'edd:address',
    'edd:address_(line_2)',
    'edd:products_(verbose)',
    'edd:products_(raw)',
    'edd:tax_($)',
    'edd:purchase_key',
    'edd:user',
    'edd:ip_address',
    'ga:screenname',
    'intercom:email',
    'drip:email',
    'hubcust:email',
    'helpscout:firstname_x',
    'helpscout:fullname_x',
    'helpscout:lastname_x',
    'helpscout:fullname_y',
    'helpscout:firstname_y',
    'helpscout:lastname_y',
    'revenue:id',
    'revenue:user_id',
    'revenue:email',
    'revenue:purchase_count',
    'revenue:payment_ids',
    'revenue:domain',
    'helpscout:id',
    'helpscout:emails',
]
train_df.drop(labels=identifier_columns, axis=1, inplace=True)

In [163]:
# Drop any columns with a significant proportion of null values (store column names to do the same for test data)
NULL_DROP_THRESHOLD = 0.80  # columns with at least this share of nulls are removed

# The null share is computed for all columns at once and applied by position.
# This replaces a per-column loop that dropped columns while iterating and
# used a bare `except` to skip anything that raised.
null_fraction = train_df.isnull().mean()
too_sparse = (null_fraction >= NULL_DROP_THRESHOLD).values
null_dropped_cols = list(train_df.columns[too_sparse])
train_df = train_df.loc[:, ~too_sparse].copy()
null_dropped_cols

['revenue:notes',
 'ga:addestinationurl',
 'ga:addisplayurl',
 'ga:addistributionnetwork',
 'ga:adformat',
 'ga:adgroup',
 'ga:adkeywordmatchtype',
 'ga:admatchtype',
 'ga:admatchedquery',
 'ga:adplacementdomain',
 'ga:adplacementurl',
 'ga:adquerywordcount',
 'ga:adtargetingoption',
 'ga:adtargetingtype',
 'ga:adwordsadgroupid',
 'ga:adwordscampaignid',
 'ga:adwordscreativeid',
 'ga:adwordscriteriaid',
 'ga:adwordscustomerid',
 'ga:affiliation',
 'ga:browser',
 'ga:browsersize',
 'ga:browserversion',
 'ga:campaign',
 'ga:channelgrouping_x',
 'ga:checkoutoptions_x',
 'ga:city',
 'ga:cityid',
 'ga:continent',
 'ga:continentid',
 'ga:country',
 'ga:countryisocode',
 'ga:datasource',
 'ga:date_x',
 'ga:datehour_x',
 'ga:datehourminute_x',
 'ga:day_x',
 'ga:dayofweek_x',
 'ga:dayofweekname_x',
 'ga:dayssincelastsession',
 'ga:daystotransaction',
 'ga:devicecategory',
 'ga:exitpagepath',
 'ga:exitscreenname',
 'ga:flashversion_x',
 'ga:fullreferrer',
 'ga:hostname',
 'ga:hour_x',
 'ga:isowe

In [164]:
# (rows, columns) after removing the mostly-null columns
train_df.shape

(3062, 137)

In [165]:
# Identify and drop columns with only one value OR one value with a nan,
# i.e. columns with at most one distinct non-null value.
one_value = []
for col in list(train_df.columns):
    column = train_df[col]
    if not isinstance(column, pd.Series):
        # A duplicated label selects several columns at once; those were
        # skipped before (via the bare except) and still are.
        continue
    if len(column) == 0:
        continue
    try:
        distinct_non_null = column.nunique(dropna=True)
    except TypeError:
        # Raised when the cells cannot be hashed (e.g. list values); such
        # columns are left untouched.
        continue
    if distinct_non_null <= 1:
        one_value.append(col)
        train_df.drop(col, inplace=True, axis=1)
one_value

['edd:mode_(live|test)',
 'edd:status',
 'edd:licenses_nan',
 'edd:payment_method_nan',
 'intercom:unsubscribed_from_emails',
 'intercom:marked_email_as_spam',
 'intercom:has_hard_bounced',
 'drip:status',
 'hubcust:broadcast_clicks',
 'hubcust:number_of_pageviews',
 'hubcust:facebook_clicks',
 'hubcust:google_plus_clicks',
 'hubcust:original_source',
 'hubcust:number_of_form_submissions',
 'hubcust:number_of_event_completions',
 'hubcust:average_pageviews',
 'hubcust:number_of_unique_forms_submitted',
 'hubcust:number_of_visits',
 'hubcust:linkedin_clicks',
 'hubcust:event_revenue',
 'hubcust:lifecycle_stage',
 'hubcust:twitter_clicks',
 'hubcomp:number_of_pageviews',
 'hubcomp:original_source_type',
 'hubcomp:number_of_visits']

In [166]:
# (rows, columns) after removing the single-valued columns
train_df.shape

(3062, 112)

In [181]:
# Dropping other columns such as date columns or columns with leakage
leakage_and_date_cols = [
    'ga:totalvalue_x', 'ga:totalvalue_y', 'ga:isoyear_x', 'ga:month_x', 'ga:year_x', 'drip:anonymous_company_key',
    'hubcomp:facebook_company_page', 'hubcomp:linkedin_bio', 'hubcust:associated_company', 'hubcomp:street_address',
    'hubcomp:description', 'helpscout:photourl_x', 'helpscout:photourl_y', 'helpscout:websites', 'edd:city',
    'edd:zip_/_postal_code', 'hubcomp:linkedin_company_page', 'hubcomp:web_technologies', 'helpscout:location_x',
    'ga:yearmonth_x', 'ga:isoyear_y', 'ga:month_y', 'ga:year_y', 'ga:yearmonth_y', 'drip:lifetime_value',
    'helpscout:location_y', 'helpscout:socialprofiles_links', 'helpscout:phones', 'helpscout:socialprofiles',
    'hubcomp:phone_number', 'edd:email', 'intercom:order_total', 'drip:purchase_count', 'edd:currency',
    'ga:continentid', 'ga:countryisocode', 'ga:hostname', 'ga:subcontientcode', 'edd:total_spent',
    'intercom:first_seen_(mdt)', 'intercom:signed_up_(mdt)', 'revenue:date_created', 'edd:date',
    'intercom:last_seen_(mdt)', 'drip:created_at', 'drip:confirmed_at', 'hubcust:create_date', 'hubcust:last_modified_date',
    'hubcust:became_a_subscriber_date', 'hubcust:time_first_seen', 'hubcomp:last_modified_date', 'hubcomp:create_date',
    'hubcomp:first_contact_create_date', 'hubcomp:time_first_seen', 'helpscout:createdat_x', 'helpscout:modifiedat_x',
    'helpscout:createdat_y', 'helpscout:modifiedat_y', 'drip:landing_url', 'drip:referrer', 'hubcomp:website_url',
    'edd:licenses_license1', 'edd:licenses_license2', 'edd:licenses_license3', 'edd:licenses_no_license', 'revenue:name',
    'ga:fullreferrer', 'intercom:city', 'drip:ip_address', 'drip:anonymous_company_expiration',
    'hubcust:postal_code', 'hubcust:city', 'hubcomp:postal_code', 'hubcomp:city',
    'hubcomp:twitter_handle', 'drip:tags', 'helpscout:email_types',
]

# Some of these columns may already have been removed by the null-share or
# single-value filters above (which columns those filters hit depends on the
# train/test split), so only drop what is still present and report the rest
# instead of failing with a KeyError.
wanted = set(leakage_and_date_cols)
already_gone = sorted(wanted - set(train_df.columns))
if already_gone:
    print('Columns not present (already dropped or misspelled): {}'.format(already_gone))
train_df.drop([col for col in train_df.columns if col in wanted], inplace=True, axis=1)

In [168]:
# Partition the training columns by dtype for later use: object/bool columns
# are treated as categorical; everything else except datetimes as numerical.
categorical_kinds = ['object', 'bool']
categorical_vals = train_df.select_dtypes(include=categorical_kinds)
numerical_vals = train_df.select_dtypes(exclude=categorical_kinds + ['datetime64'])

In [95]:
# Review numeric columns that might really be categorical: print the name and
# distinct values of any column with fewer than 10 distinct values, and the
# bare name of any other column containing 'id'.
for name in numerical_vals.columns:
    distinct_values = train_df[name].unique()
    if len(distinct_values) < 10:
        print(name)
        print(distinct_values)
        print('\n\n')
    elif 'id' in name:
        print(name)

ga:metroid
edd:licenses_license1
[0.0 1.0 nan]



edd:licenses_license2
[1.0 0.0 nan]



edd:licenses_license3
[0.0 1.0 nan]



edd:licenses_no_license
[0.0 1.0 nan]



edd:payment_method_amazon
[0.0 nan 1.0]



edd:payment_method_paypal_express
[0.0 nan 1.0]



edd:payment_method_paypal_pro
[1.0 0.0 nan]



edd:payment_method_paypal_standard
[0.0 1.0 nan]



edd:payment_method_manual_purchases
[0.0 nan 1.0]



edd:payment_method_stripe
[0.0 1.0 nan]



drip:anonymous_company_activation_limit
[   3.  100.   nan    1.   10.    2.]



hubcomp:associated_contacts
[ nan   1.   2.   3.   0.]



hubcomp:annual_revenue
[             nan   1.00000000e+09   1.00000000e+06   1.00000000e+07
   5.00000000e+07   1.00000000e+08   2.50000000e+08   5.00000000e+08
   2.00000000e+08]





In [172]:
# Resetting numerical columns to objects as applicable: the payment-method
# indicator columns are categorical, not continuous.
# Each column is listed once and converted on its own; repeating labels in a
# multi-column assignment selects duplicated columns and is fragile.
payment_method_cols = [
    'edd:payment_method_paypal_pro',
    'edd:payment_method_manual_purchases',
    'edd:payment_method_stripe',
    'edd:payment_method_paypal_express',
    'edd:payment_method_paypal_standard',
    'edd:payment_method_amazon',
]
for col in payment_method_cols:
    train_df[col] = train_df[col].astype(object)

In [173]:
# Recompute the numerical / categorical partition of the training columns
# (object and bool count as categorical; datetimes belong to neither).
categorical_vals = train_df.select_dtypes(include=['object', 'bool'])
numerical_vals = train_df.select_dtypes(exclude=['object', 'bool', 'datetime64'])

In [174]:
# Quick imputation: replace missing values in every numerical column with
# that column's mean.
for name in numerical_vals.columns:
    series = train_df[name]
    train_df[name] = series.fillna(series.mean())

In [None]:
# Violinplots of individual columns: one figure per numerical column, with
# the quartiles shown in a text box, saved under ../images/.
bar = progressbar.ProgressBar()
for col in bar(numerical_vals):
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    ax.set_title(col)
    sns.violinplot(x=train_df[col], orient='v', ax=ax, palette='pastel')
    text = '75th Percentile: {}\nMedian: {}\n25th Percentile: {}'.format(
        np.percentile(train_df[col], 75),
        np.median(train_df[col]),
        np.percentile(train_df[col], 25))
    at = AnchoredText(text, prop=dict(size=15), frameon=True, loc=1)
    ax.add_artist(at)
    fig.savefig('../images/violinplot_{}'.format(col))
    # Render now and release the figure: otherwise every figure of the loop
    # stays open until the cell finishes and memory grows with the number of
    # columns.
    plt.show()
    plt.close(fig)

In [None]:
# Bar graphs of individual columns: one count plot per categorical column
# with at most 100 distinct values, saved under ../images/.
bar = progressbar.ProgressBar()
for col in bar(categorical_vals):
    try:
        n_levels = len(train_df[col].unique())
    except (TypeError, AttributeError) as err:
        # unique() could not be computed for this column; report and move on
        print('Skipping bar graph for {}: {}'.format(col, err))
        continue
    if n_levels > 100:
        continue
    # Figure height grows with the number of bars
    fig = plt.figure(figsize=(20, n_levels + 10))
    try:
        ax = fig.add_subplot(111)
        ax.set_title(col)
        sns.countplot(y=train_df[col], ax=ax, palette='pastel')
        fig.savefig('../images/bargraph_{}'.format(col))
        plt.show()
    except Exception as err:
        # Best effort: one column that cannot be plotted must not stop the
        # others, but the failure is reported instead of silently swallowed.
        print('Skipping bar graph for {}: {}'.format(col, err))
    finally:
        # Always release the figure so open figures do not pile up
        plt.close(fig)

In [None]:
# Violinplots of target by all object columns values: distribution of the
# target within each level of every categorical column that has at most 100
# distinct values, saved under ../images/.
target = 'revenue:purchase_value'

bar = progressbar.ProgressBar()
for col in bar(categorical_vals):
    try:
        n_levels = len(train_df[col].unique())
    except (TypeError, AttributeError) as err:
        # unique() could not be computed for this column; report and move on
        print('Skipping violin plot for {}: {}'.format(col, err))
        continue
    if n_levels > 100:
        continue
    # Figure height grows with the number of levels
    fig = plt.figure(figsize=(20, n_levels + 10))
    try:
        ax = fig.add_subplot(111)
        ax.set_title(col)
        sns.violinplot(y=train_df[col], x=train_df[target], orient='h', ax=ax, palette='pastel')
        fig.savefig('../images/{}_violinplot_{}'.format(target, col))
        plt.show()
    except Exception as err:
        # Best effort: one column that cannot be plotted must not stop the
        # others, but the failure is reported instead of silently swallowed.
        print('Skipping violin plot for {}: {}'.format(col, err))
    finally:
        # Always release the figure so open figures do not pile up
        plt.close(fig)

In [175]:
# Quick fill categorical null values
# fillna() is applied to the whole frame, so any null still present in ANY
# column is replaced by the string 'missing'.
# NOTE(review): a numerical column that still holds NaN here (e.g. one whose
# mean was NaN) would receive the string as well -- confirm none remain.
train_df = train_df.fillna('missing')

In [176]:
# Cast every categorical column to plain strings so stray unicode values do
# not cause trouble later on.
for name in list(categorical_vals.columns):
    as_text = train_df[name].astype(str)
    train_df[name] = as_text

In [182]:
# Checkpoint: write the full, training and test frames to disk ...
for frame, path in ((df, '../data/total_df_pickle'),
                    (train_df, '../data/train_df_pickle'),
                    (test_df, '../data/test_df_pickle')):
    frame.to_pickle(path)

# ... and reload the training frame from the file that was just written.
train_df = pd.read_pickle('../data/train_df_pickle')

In [183]:
# Create dummy variables for the categorical columns.
# `categorical_vals` may name columns that were dropped from train_df after
# it was computed; those are skipped explicitly instead of relying on a bare
# `except: pass`, and all remaining columns are encoded in a single call.
dummy_columns = [col for col in categorical_vals.columns if col in train_df.columns]
if dummy_columns:
    train_df = pd.get_dummies(train_df, drop_first=True, dummy_na=True, columns=dummy_columns)

In [185]:
# Persist the full, training (now dummy-encoded) and test frames
pickle_jobs = [
    ('../data/total_df_pickle', df),
    ('../data/train_df_pickle', train_df),
    ('../data/test_df_pickle', test_df),
]
for path, frame in pickle_jobs:
    frame.to_pickle(path)

In [None]:
# Checking for correlated features
TOP_N_PAIRS = 50  # how many of the strongest pairs to display

c = train_df.corr().abs()
# Keep each pair once and drop the trivial self-correlations: only the part
# of the matrix strictly above the diagonal survives the mask.
above_diagonal = np.triu(np.ones(c.shape, dtype=bool), k=1)
s = c.where(above_diagonal).unstack().dropna()
# Display limits are disabled at the top of the notebook, so cap the output
# explicitly instead of printing every pair.
s.sort_values(ascending=False).head(TOP_N_PAIRS)