In [37]:
import gc
import re
import json
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

In [38]:
IN_TRAIN = 'in/train-grouped.csv'
IN_TEST = 'in/test-pruned.csv'

OUT_TRAIN = 'in/train-wrangled.csv'

In [39]:
train_df = pd.read_csv(IN_TRAIN, dtype={'fullVisitorId': 'str'}, low_memory=False)
test_df = pd.read_csv(IN_TEST, dtype={'fullVisitorId': 'str'}, low_memory=False).head()

In [40]:
LABEL = 'totals.transactionRevenue'

In [41]:
def true_false_cols(df, cols):
    for col in cols:
        df[col] = df[col].apply((lambda x: 1 if x == 'TRUE' else 0))
    return df

cols_to_true_false = ['device.isMobile', 'trafficSource.isTrueDirect',
                     'trafficSource.adwordsClickInfo.isVideoAd']

train_df_true_false = true_false_cols(train_df, cols_to_true_false)
test_df_true_false = true_false_cols(test_df, cols_to_true_false)

In [42]:
def one_hot_encode_col(df, col):
    one_hot_encoded_cols = pd.get_dummies(df[col], prefix=col, prefix_sep=' ')
    df_without_original_col = df.drop(col, axis=1)
    return pd.concat([df_without_original_col, one_hot_encoded_cols], axis=1, sort=False)

def one_hot_encode_cols(df, cols):
    for col in cols:
        df = one_hot_encode_col(df, col)
    return df

cols_to_one_hot_encode = ['channelGrouping', 'device.deviceCategory', 'device.operatingSystem',
                         'geoNetwork.continent', 'socialEngagementType']

In [43]:
train_df_onehot = one_hot_encode_cols(train_df_true_false, cols_to_one_hot_encode)
test_df_onehot = one_hot_encode_cols(test_df_true_false, cols_to_one_hot_encode)
train_df_onehot.head()

Unnamed: 0,visitNumber,geoNetwork.networkDomain,geoNetwork.city,trafficSource.isTrueDirect,geoNetwork.metro,trafficSource.adwordsClickInfo.isVideoAd,geoNetwork.region,fullVisitorId,trafficSource.adContent,visitId,...,device.operatingSystem Windows Phone,device.operatingSystem Xbox,device.operatingSystem iOS,geoNetwork.continent (not set),geoNetwork.continent Africa,geoNetwork.continent Americas,geoNetwork.continent Asia,geoNetwork.continent Europe,geoNetwork.continent Oceania,socialEngagementType Not Socially Engaged
0,1,comcast.net,not available in demo dataset,0,not available in demo dataset,0,not available in demo dataset,945784447887116,,1482427692,...,0,0,0,0,0,1,0,0,0,1
1,1,rdsnet.ro,not available in demo dataset,0,not available in demo dataset,0,not available in demo dataset,1841185112953535,,1482064870,...,0,0,0,0,0,0,0,1,0,1
2,16,telekom.hu,not available in demo dataset,0,not available in demo dataset,0,not available in demo dataset,2793999826216383,,1495661015,...,0,0,0,0,0,0,0,1,0,1
3,2,(not set),Mountain View,0,San Francisco-Oakland-San Jose CA,0,California,3758445103975072,,1498682107,...,0,0,0,0,0,1,0,0,0,1
4,1,(not set),not available in demo dataset,0,not available in demo dataset,0,not available in demo dataset,4251086898555900,,1475121793,...,0,0,1,0,0,1,0,0,0,1


In [44]:
cols_to_minmax_scale = ['date', 'visitId', 'visitNumber',
                        'visitStartTime', 'totals.bounces',
                       'totals.hits', 'totals.newVisits',
                       'totals.pageviews', 'totals.visits']
scaler = MinMaxScaler()

train_df_true_false[cols_to_minmax_scale] = scaler.fit_transform(train_df_true_false[cols_to_minmax_scale])
test_df_onehot[cols_to_minmax_scale] = scaler.fit_transform(test_df_onehot[cols_to_minmax_scale])
train_df_true_false.head()

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [45]:
def one_hot_encode_col_by_distribution(df, col):
    THRESHOLD = 0.01

    value_counts = df[col].value_counts()
    max_value_count = value_counts.max()
    percentages = (value_counts / value_counts.max())
    bigger_than_threshold = percentages > THRESHOLD
    values_to_keep = (bigger_than_threshold[bigger_than_threshold == True]).keys()
    
    df[col] = df[col].apply(lambda x: x if x in values_to_keep else 'Other')
    
    one_hot_encoded_cols = pd.get_dummies(df[col], prefix=col, prefix_sep=' ')
    df_without_original_col = df.drop(col, axis=1)
    return pd.concat([df_without_original_col, one_hot_encoded_cols], axis=1, sort=False)

def one_hot_encode_cols_by_distribution(df, cols):
    for col in cols:
        df = one_hot_encode_col_by_distribution(df, col)
    return df

cols_to_embed = ['device.browser', 'geoNetwork.city',
                'geoNetwork.country', 'geoNetwork.networkDomain',
                'trafficSource.medium', 'geoNetwork.subContinent', 'trafficSource.source']

train_df_wrangled = one_hot_encode_cols_by_distribution(train_df_onehot, cols_to_embed)
test_df_wrangled = one_hot_encode_cols_by_distribution(test_df_onehot, cols_to_embed)
train_df_wrangled.head()

Unnamed: 0,visitNumber,trafficSource.isTrueDirect,geoNetwork.metro,trafficSource.adwordsClickInfo.isVideoAd,geoNetwork.region,fullVisitorId,trafficSource.adContent,visitId,totals.transactionRevenue,trafficSource.adwordsClickInfo.gclId,...,geoNetwork.subContinent Western Europe,trafficSource.source (direct),trafficSource.source Other,trafficSource.source Partners,trafficSource.source analytics.google.com,trafficSource.source dfa,trafficSource.source google,trafficSource.source google.com,trafficSource.source mall.googleplex.com,trafficSource.source youtube.com
0,1,0,not available in demo dataset,0,not available in demo dataset,945784447887116,,1482427692,0.0,,...,0,0,0,0,0,0,1,0,0,0
1,1,0,not available in demo dataset,0,not available in demo dataset,1841185112953535,,1482064870,0.0,,...,0,0,0,0,0,0,1,0,0,0
2,16,0,not available in demo dataset,0,not available in demo dataset,2793999826216383,,1495661015,0.0,,...,0,0,0,0,0,0,1,0,0,0
3,2,0,San Francisco-Oakland-San Jose CA,0,California,3758445103975072,,1498682107,0.0,,...,0,0,0,0,0,0,0,0,1,0
4,1,0,not available in demo dataset,0,not available in demo dataset,4251086898555900,,1475121793,0.0,,...,0,0,0,0,0,0,1,0,0,0


In [46]:
train_columns_except_label = [col for col in train_df_wrangled.columns.values if col != LABEL]
test_columns = test_df_wrangled.columns.values

common_cols = list(set(train_columns_except_label) & set(test_columns))
cols_to_remove_train = [col for col in train_columns_except_label if col not in common_cols]
cols_to_remove_test = [col for col in test_columns if col not in common_cols]

train_df_uniform = train_df_wrangled.drop(cols_to_remove_train, axis=1)
train_df_uniform = train_df_uniform[[c for c in train_df_uniform if c not in [LABEL]] + [LABEL]]
train_df_uniform.head()

Unnamed: 0,visitNumber,trafficSource.isTrueDirect,geoNetwork.metro,trafficSource.adwordsClickInfo.isVideoAd,geoNetwork.region,fullVisitorId,trafficSource.adContent,visitId,trafficSource.adwordsClickInfo.gclId,trafficSource.campaign,...,geoNetwork.country United States,geoNetwork.networkDomain (not set),geoNetwork.networkDomain rima-tde.net,trafficSource.medium organic,geoNetwork.subContinent Northern America,geoNetwork.subContinent Southeast Asia,geoNetwork.subContinent Southern Europe,geoNetwork.subContinent Western Europe,trafficSource.source google,totals.transactionRevenue
0,1,0,not available in demo dataset,0,not available in demo dataset,945784447887116,,1482427692,,(not set),...,1,0,0,1,1,0,0,0,1,0.0
1,1,0,not available in demo dataset,0,not available in demo dataset,1841185112953535,,1482064870,,(not set),...,0,0,0,1,0,0,0,0,1,0.0
2,16,0,not available in demo dataset,0,not available in demo dataset,2793999826216383,,1495661015,,(not set),...,0,0,0,1,0,0,0,0,1,0.0
3,2,0,San Francisco-Oakland-San Jose CA,0,California,3758445103975072,,1498682107,,(not set),...,1,1,0,0,1,0,0,0,0,0.0
4,1,0,not available in demo dataset,0,not available in demo dataset,4251086898555900,,1475121793,,(not set),...,1,1,0,1,1,0,0,0,1,0.0


In [47]:
%%time
train_df_uniform.to_csv(OUT_TRAIN, index=False)

CPU times: user 476 ms, sys: 4 ms, total: 480 ms
Wall time: 480 ms
