In [28]:
import gc
import re
import json
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

In [29]:
IN_TRAIN = 'in/train-grouped.csv'
IN_TEST = 'in/test-pruned.csv'

OUT_TRAIN = 'in/train-wrangled.csv'
OUT_TEST = 'in/test-wrangled.csv'

In [30]:
train_df = pd.read_csv(IN_TRAIN, dtype={'fullVisitorId': 'str'}, low_memory=False)
train_df.head()

Unnamed: 0,socialEngagementType,device.browser,totals.transactionRevenue,visitNumber,date,totals.hits,device.operatingSystem,totals.pageviews,totals.visits,sessionId,...,geoNetwork.continent,visitStartTime,device.isMobile,device.deviceCategory,geoNetwork.subContinent,visitId,totals.newVisits,geoNetwork.country,totals.bounces,trafficSource.medium
0,Not Socially Engaged,Chrome,0.0,1,20161222,5,Windows,5.0,1,0000945784447887116_1482427692,...,Americas,1482427692,False,desktop,Northern America,1482427692,1.0,United States,,organic
1,Not Socially Engaged,Chrome,0.0,1,20161218,2,Windows,2.0,1,0001841185112953535_1482064870,...,Europe,1482064870,False,desktop,Eastern Europe,1482064870,1.0,Romania,,organic
2,Not Socially Engaged,Chrome,0.0,16,20170524,1,Windows,1.0,1,0002793999826216383_1495661015,...,Europe,1495661015,False,desktop,Eastern Europe,1495661015,,Hungary,1.0,organic
3,Not Socially Engaged,Chrome,0.0,2,20170628,4,Linux,4.0,1,0003758445103975072_1498682107,...,Americas,1498682107,False,desktop,Northern America,1498682107,,United States,,referral
4,Not Socially Engaged,Safari,0.0,1,20160928,1,iOS,1.0,1,0004251086898555900_1475121793,...,Americas,1475121793,True,mobile,Northern America,1475121793,1.0,United States,1.0,organic


In [31]:
test_df = pd.read_csv(IN_TEST, dtype={'fullVisitorId': 'str'}, low_memory=False).head()
test_df.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,...,geoNetwork.subContinent,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.visits,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.isTrueDirect,trafficSource.medium,trafficSource.source
0,Organic Search,20171016,6167871330617112363,6167871330617112363_1508151024,Not Socially Engaged,1508151024,2,1508151024,Chrome,desktop,...,Southeast Asia,,4,,4.0,1,,True,organic,google
1,Organic Search,20171016,643697640977915618,0643697640977915618_1508175522,Not Socially Engaged,1508175522,1,1508175522,Chrome,desktop,...,Southern Europe,,5,1.0,5.0,1,,,organic,google
2,Organic Search,20171016,6059383810968229466,6059383810968229466_1508143220,Not Socially Engaged,1508143220,1,1508143220,Chrome,desktop,...,Western Europe,,7,1.0,7.0,1,,,organic,google
3,Organic Search,20171016,2376720078563423631,2376720078563423631_1508193530,Not Socially Engaged,1508193530,1,1508193530,Safari,mobile,...,Northern America,,8,1.0,4.0,1,,,organic,google
4,Organic Search,20171016,2314544520795440038,2314544520795440038_1508217442,Not Socially Engaged,1508217442,1,1508217442,Safari,desktop,...,Northern America,,9,1.0,4.0,1,,,organic,google


In [32]:
LABEL = 'totals.transactionRevenue'

In [33]:
def true_false_cols(df, cols):
    for col in cols:
        df[col] = df[col].apply((lambda x: 1 if x == 'TRUE' else 0))
    return df

cols_to_true_false = ['device.isMobile', 'trafficSource.isTrueDirect',
                     'trafficSource.adwordsClickInfo.isVideoAd']

train_df_true_false = true_false_cols(train_df, cols_to_true_false)
train_df_true_false.head()

Unnamed: 0,socialEngagementType,device.browser,totals.transactionRevenue,visitNumber,date,totals.hits,device.operatingSystem,totals.pageviews,totals.visits,sessionId,...,geoNetwork.continent,visitStartTime,device.isMobile,device.deviceCategory,geoNetwork.subContinent,visitId,totals.newVisits,geoNetwork.country,totals.bounces,trafficSource.medium
0,Not Socially Engaged,Chrome,0.0,1,20161222,5,Windows,5.0,1,0000945784447887116_1482427692,...,Americas,1482427692,0,desktop,Northern America,1482427692,1.0,United States,,organic
1,Not Socially Engaged,Chrome,0.0,1,20161218,2,Windows,2.0,1,0001841185112953535_1482064870,...,Europe,1482064870,0,desktop,Eastern Europe,1482064870,1.0,Romania,,organic
2,Not Socially Engaged,Chrome,0.0,16,20170524,1,Windows,1.0,1,0002793999826216383_1495661015,...,Europe,1495661015,0,desktop,Eastern Europe,1495661015,,Hungary,1.0,organic
3,Not Socially Engaged,Chrome,0.0,2,20170628,4,Linux,4.0,1,0003758445103975072_1498682107,...,Americas,1498682107,0,desktop,Northern America,1498682107,,United States,,referral
4,Not Socially Engaged,Safari,0.0,1,20160928,1,iOS,1.0,1,0004251086898555900_1475121793,...,Americas,1475121793,0,mobile,Northern America,1475121793,1.0,United States,1.0,organic


In [34]:
test_df_true_false = true_false_cols(test_df, cols_to_true_false)
test_df_true_false.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.deviceCategory,...,geoNetwork.subContinent,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.visits,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.isTrueDirect,trafficSource.medium,trafficSource.source
0,Organic Search,20171016,6167871330617112363,6167871330617112363_1508151024,Not Socially Engaged,1508151024,2,1508151024,Chrome,desktop,...,Southeast Asia,,4,,4.0,1,0,0,organic,google
1,Organic Search,20171016,643697640977915618,0643697640977915618_1508175522,Not Socially Engaged,1508175522,1,1508175522,Chrome,desktop,...,Southern Europe,,5,1.0,5.0,1,0,0,organic,google
2,Organic Search,20171016,6059383810968229466,6059383810968229466_1508143220,Not Socially Engaged,1508143220,1,1508143220,Chrome,desktop,...,Western Europe,,7,1.0,7.0,1,0,0,organic,google
3,Organic Search,20171016,2376720078563423631,2376720078563423631_1508193530,Not Socially Engaged,1508193530,1,1508193530,Safari,mobile,...,Northern America,,8,1.0,4.0,1,0,0,organic,google
4,Organic Search,20171016,2314544520795440038,2314544520795440038_1508217442,Not Socially Engaged,1508217442,1,1508217442,Safari,desktop,...,Northern America,,9,1.0,4.0,1,0,0,organic,google


In [35]:
def one_hot_encode_col(df, col):
    one_hot_encoded_cols = pd.get_dummies(df[col], prefix=col, prefix_sep=' ')
    df_without_original_col = df.drop(col, axis=1)
    return pd.concat([df_without_original_col, one_hot_encoded_cols], axis=1, sort=False)

def one_hot_encode_cols(df, cols):
    for col in cols:
        df = one_hot_encode_col(df, col)
    return df

cols_to_one_hot_encode = ['channelGrouping', 'device.deviceCategory', 'device.operatingSystem',
                         'geoNetwork.continent', 'socialEngagementType']

In [36]:
train_df_onehot = one_hot_encode_cols(train_df_true_false, cols_to_one_hot_encode)
train_df_onehot.head()

Unnamed: 0,device.browser,totals.transactionRevenue,visitNumber,date,totals.hits,totals.pageviews,totals.visits,sessionId,fullVisitorId,trafficSource.isTrueDirect,...,device.operatingSystem Windows Phone,device.operatingSystem Xbox,device.operatingSystem iOS,geoNetwork.continent (not set),geoNetwork.continent Africa,geoNetwork.continent Americas,geoNetwork.continent Asia,geoNetwork.continent Europe,geoNetwork.continent Oceania,socialEngagementType Not Socially Engaged
0,Chrome,0.0,1,20161222,5,5.0,1,0000945784447887116_1482427692,945784447887116,0,...,0,0,0,0,0,1,0,0,0,1
1,Chrome,0.0,1,20161218,2,2.0,1,0001841185112953535_1482064870,1841185112953535,0,...,0,0,0,0,0,0,0,1,0,1
2,Chrome,0.0,16,20170524,1,1.0,1,0002793999826216383_1495661015,2793999826216383,0,...,0,0,0,0,0,0,0,1,0,1
3,Chrome,0.0,2,20170628,4,4.0,1,0003758445103975072_1498682107,3758445103975072,0,...,0,0,0,0,0,1,0,0,0,1
4,Safari,0.0,1,20160928,1,1.0,1,0004251086898555900_1475121793,4251086898555900,0,...,0,0,1,0,0,1,0,0,0,1


In [37]:
test_df_onehot = one_hot_encode_cols(test_df_true_false, cols_to_one_hot_encode)
test_df_onehot.head()

Unnamed: 0,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device.browser,device.isMobile,geoNetwork.city,geoNetwork.country,...,channelGrouping Organic Search,device.deviceCategory desktop,device.deviceCategory mobile,device.operatingSystem Macintosh,device.operatingSystem Windows,device.operatingSystem iOS,geoNetwork.continent Americas,geoNetwork.continent Asia,geoNetwork.continent Europe,socialEngagementType Not Socially Engaged
0,20171016,6167871330617112363,6167871330617112363_1508151024,1508151024,2,1508151024,Chrome,0,(not set),Singapore,...,1,1,0,1,0,0,0,1,0,1
1,20171016,643697640977915618,0643697640977915618_1508175522,1508175522,1,1508175522,Chrome,0,Zaragoza,Spain,...,1,1,0,0,1,0,0,0,1,1
2,20171016,6059383810968229466,6059383810968229466_1508143220,1508143220,1,1508143220,Chrome,0,not available in demo dataset,France,...,1,1,0,1,0,0,0,0,1,1
3,20171016,2376720078563423631,2376720078563423631_1508193530,1508193530,1,1508193530,Safari,0,Mountain View,United States,...,1,0,1,0,0,1,1,0,0,1
4,20171016,2314544520795440038,2314544520795440038_1508217442,1508217442,1,1508217442,Safari,0,San Jose,United States,...,1,1,0,1,0,0,1,0,0,1


In [39]:
cols_to_minmax_scale = ['date', 'visitId', 'visitNumber',
                        'visitStartTime', 'totals.bounces',
                       'totals.hits', 'totals.newVisits',
                       'totals.pageviews', 'totals.visits']


scaler = MinMaxScaler()

train_df_onehot[cols_to_minmax_scale] = scaler.fit_transform(train_df_onehot[cols_to_minmax_scale])
train_df_onehot.head()

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [12]:
test_df_onehot[cols_to_minmax_scale] = scaler.fit_transform(test_df_onehot[cols_to_minmax_scale])
test_df_onehot.head()

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [13]:
def one_hot_encode_col_by_distribution(df, col):
    THRESHOLD = 0.01

    value_counts = df[col].value_counts()
    max_value_count = value_counts.max()
    percentages = (value_counts / value_counts.max())
    bigger_than_threshold = percentages > THRESHOLD
    values_to_keep = (bigger_than_threshold[bigger_than_threshold == True]).keys()
    
    df[col] = df[col].apply(lambda x: x if x in values_to_keep else 'Other')
    
    one_hot_encoded_cols = pd.get_dummies(df[col], prefix=col, prefix_sep=' ')
    df_without_original_col = df.drop(col, axis=1)
    return pd.concat([df_without_original_col, one_hot_encoded_cols], axis=1, sort=False)

def one_hot_encode_cols_by_distribution(df, cols):
    for col in cols:
        df = one_hot_encode_col_by_distribution(df, col)
    return df

cols_to_embed = ['device.browser', 'geoNetwork.city',
                'geoNetwork.country', 'geoNetwork.networkDomain',
                'trafficSource.medium', 'geoNetwork.subContinent', 'trafficSource.source']

train_df_wrangled = one_hot_encode_cols_by_distribution(train_df_onehot, cols_to_embed)
train_df_wrangled.head()

Unnamed: 0,visitNumber,trafficSource.isTrueDirect,geoNetwork.metro,trafficSource.adwordsClickInfo.isVideoAd,geoNetwork.region,fullVisitorId,trafficSource.adContent,visitId,totals.transactionRevenue,trafficSource.adwordsClickInfo.gclId,...,geoNetwork.subContinent Western Europe,trafficSource.source (direct),trafficSource.source Other,trafficSource.source Partners,trafficSource.source analytics.google.com,trafficSource.source dfa,trafficSource.source google,trafficSource.source google.com,trafficSource.source mall.googleplex.com,trafficSource.source youtube.com
0,0.0,0,not available in demo dataset,0,not available in demo dataset,945784447887116,,0.391926,0.0,,...,0,0,0,0,0,0,1,0,0,0
1,0.0,0,not available in demo dataset,0,not available in demo dataset,1841185112953535,,0.380451,0.0,,...,0,0,0,0,0,0,1,0,0,0
2,0.029183,0,not available in demo dataset,0,not available in demo dataset,2793999826216383,,0.81044,0.0,,...,0,0,0,0,0,0,1,0,0,0
3,0.001946,0,San Francisco-Oakland-San Jose CA,0,California,3758445103975072,,0.905984,0.0,,...,0,0,0,0,0,0,0,0,1,0
4,0.0,0,not available in demo dataset,0,not available in demo dataset,4251086898555900,,0.160871,0.0,,...,0,0,0,0,0,0,1,0,0,0


In [14]:
test_df_wrangled = one_hot_encode_cols_by_distribution(test_df_onehot, cols_to_embed)
test_df_wrangled.head()

Unnamed: 0,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device.isMobile,geoNetwork.metro,geoNetwork.region,totals.bounces,...,geoNetwork.networkDomain (not set),geoNetwork.networkDomain myrepublic.com.sg,geoNetwork.networkDomain rima-tde.net,geoNetwork.networkDomain sfr.net,trafficSource.medium organic,geoNetwork.subContinent Northern America,geoNetwork.subContinent Southeast Asia,geoNetwork.subContinent Southern Europe,geoNetwork.subContinent Western Europe,trafficSource.source google
0,20171016,6167871330617112363,6167871330617112363_1508151024,1508151024,2,1508151024,0,(not set),(not set),,...,0,1,0,0,1,0,1,0,0,1
1,20171016,643697640977915618,0643697640977915618_1508175522,1508175522,1,1508175522,0,(not set),Aragon,,...,0,0,1,0,1,0,0,1,0,1
2,20171016,6059383810968229466,6059383810968229466_1508143220,1508143220,1,1508143220,0,not available in demo dataset,not available in demo dataset,,...,0,0,0,1,1,0,0,0,1,1
3,20171016,2376720078563423631,2376720078563423631_1508193530,1508193530,1,1508193530,0,San Francisco-Oakland-San Jose CA,California,,...,1,0,0,0,1,1,0,0,0,1
4,20171016,2314544520795440038,2314544520795440038_1508217442,1508217442,1,1508217442,0,San Francisco-Oakland-San Jose CA,California,,...,1,0,0,0,1,1,0,0,0,1


In [15]:
train_columns_except_label = [col for col in train_df_wrangled.columns.values if col != LABEL]
test_columns = test_df_wrangled.columns.values

common_cols = list(set(train_columns_except_label) & set(test_columns))
cols_to_remove_train = [col for col in train_columns_except_label if col not in common_cols]
cols_to_remove_test = [col for col in test_columns if col not in common_cols]

train_df_uniform = train_df_wrangled.drop(cols_to_remove_train, axis=1)
train_df_uniform = train_df_uniform[[c for c in train_df_uniform if c not in [LABEL]] + [LABEL]]
train_df_uniform.head()

Unnamed: 0,visitNumber,trafficSource.isTrueDirect,geoNetwork.metro,trafficSource.adwordsClickInfo.isVideoAd,geoNetwork.region,fullVisitorId,trafficSource.adContent,visitId,trafficSource.adwordsClickInfo.gclId,trafficSource.campaign,...,geoNetwork.country United States,geoNetwork.networkDomain (not set),geoNetwork.networkDomain rima-tde.net,trafficSource.medium organic,geoNetwork.subContinent Northern America,geoNetwork.subContinent Southeast Asia,geoNetwork.subContinent Southern Europe,geoNetwork.subContinent Western Europe,trafficSource.source google,totals.transactionRevenue
0,0.0,0,not available in demo dataset,0,not available in demo dataset,945784447887116,,0.391926,,(not set),...,1,0,0,1,1,0,0,0,1,0.0
1,0.0,0,not available in demo dataset,0,not available in demo dataset,1841185112953535,,0.380451,,(not set),...,0,0,0,1,0,0,0,0,1,0.0
2,0.029183,0,not available in demo dataset,0,not available in demo dataset,2793999826216383,,0.81044,,(not set),...,0,0,0,1,0,0,0,0,1,0.0
3,0.001946,0,San Francisco-Oakland-San Jose CA,0,California,3758445103975072,,0.905984,,(not set),...,1,1,0,0,1,0,0,0,0,0.0
4,0.0,0,not available in demo dataset,0,not available in demo dataset,4251086898555900,,0.160871,,(not set),...,1,1,0,1,1,0,0,0,1,0.0


In [16]:
test_df_uniform = test_df_wrangled.drop(cols_to_remove_test, axis=1)
test_df_uniform.head()

Unnamed: 0,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device.isMobile,geoNetwork.metro,geoNetwork.region,totals.bounces,...,geoNetwork.country Spain,geoNetwork.country United States,geoNetwork.networkDomain (not set),geoNetwork.networkDomain rima-tde.net,trafficSource.medium organic,geoNetwork.subContinent Northern America,geoNetwork.subContinent Southeast Asia,geoNetwork.subContinent Southern Europe,geoNetwork.subContinent Western Europe,trafficSource.source google
0,20171016,6167871330617112363,6167871330617112363_1508151024,1508151024,2,1508151024,0,(not set),(not set),,...,0,0,0,0,1,0,1,0,0,1
1,20171016,643697640977915618,0643697640977915618_1508175522,1508175522,1,1508175522,0,(not set),Aragon,,...,1,0,0,1,1,0,0,1,0,1
2,20171016,6059383810968229466,6059383810968229466_1508143220,1508143220,1,1508143220,0,not available in demo dataset,not available in demo dataset,,...,0,0,0,0,1,0,0,0,1,1
3,20171016,2376720078563423631,2376720078563423631_1508193530,1508193530,1,1508193530,0,San Francisco-Oakland-San Jose CA,California,,...,0,1,1,0,1,1,0,0,0,1
4,20171016,2314544520795440038,2314544520795440038_1508217442,1508217442,1,1508217442,0,San Francisco-Oakland-San Jose CA,California,,...,0,1,1,0,1,1,0,0,0,1


In [17]:
%%time
train_df_uniform.to_csv(OUT_TRAIN, index=False)
test_df_uniform.to_csv(OUT_TEST, index=False)

CPU times: user 680 ms, sys: 24 ms, total: 704 ms
Wall time: 738 ms
