https://www.analyticsvidhya.com/blog/2017/03/imbalanced-classification-problem/

https://www.kaggle.com/qianchao/smote-with-imbalance-data

Try cluster over sampling

Replicating over sampling

Informed over sampling

Modified synthetic minority oversampling technique (MSMOTE)

Informed over sampling

Advantages

- Mitigates the problem of overfitting caused by random oversampling as synthetic examples are generated rather than replication of instances
- No loss of useful information


Disadvantages

- While generating synthetic examples SMOTE does not take into consideration neighboring examples from other classes. This can result in increase in overlapping of classes and can introduce additional noise

- SMOTE is not very effective for high dimensional data

In [31]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

# for LR
import statsmodels.api as sm
from sklearn.model_selection import KFold
import random

# for logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# for Evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# for feature eng
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import nltk
from nltk.corpus import stopwords

In [32]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Read Clean Data

In [33]:
%%time
clean_train_data = pd.read_csv('/Users/josephcolaco/customer_revenue_prediction/data/cleaned_feat_eng_train_data.csv')
print('Shape of train data is',clean_train_data.shape)



Shape of train data is (902755, 178)
CPU times: user 21.3 s, sys: 2.7 s, total: 24 s
Wall time: 23.7 s


# Minimal Prepping for Modeling

In [34]:
clean_train_data.isnull().sum().sum()

0

In [35]:
id_col = clean_train_data['fullVisitorId']
clean_train_data_v1 = clean_train_data.drop(columns = ['fullVisitorId', 'sessionId', 'Unnamed: 0'])

In [36]:
clean_train_data['fullVisitorId'].nunique()

716877

In [19]:
id_col.nunique()

716877

In [8]:
clean_train_data_v1.head()

Unnamed: 0,channelGrouping,visitNumber,deviceCategory,isMobile,continent,bounces,hits,newVisits,pageviews,campaignCode,adNetworkType,isVideoAd,page,slot,visitHour,dayNameDate,monthDate,yearDate,dayDate,domain_(not_set),domain_.us,domain_.net,domain_.com,domain_.edu,domain_.ca,domain_.org,domain_.mx,os_Chrome OS,os_Macintosh,os_Linux,os_iOS,os_Windows,adContent_Google Merchandise Collection,adContent_(not_set),browser_Chrome,browser_Firefox,browser_Internet Explorer,browser_Edge,country_United States,country_Venezuela,country_Puerto Rico,country_Canada,city_Maracaibo,city_Ann Arbor,city_Cambridge,city_San Bruno,city_Chicago,city_Austin,city_Irvine,city_New York,city_Nashville,city_Jersey City,city_Boulder,city_Kirkland,city_Seattle,city_Oakland,city_Denver,city_Sunnyvale,city_San Francisco,city_Pittsburgh,city_Washington,city_Atlanta,city_Los Angeles,city_Mountain View,city_Minneapolis,city_San Antonio,city_Lake Oswego,city_Santa Clara,city_Cupertino,city_Salem,city_San Mateo,city_San Diego,city_Palo Alto,city_Fremont,city_Houston,city_Milpitas,city_Boston,city_Charlotte,city_San Jose,city_Philadelphia,city_Redwood City,city_Portland,city_Phoenix,city_(not_set),city_Toronto,city_Dallas,subContinent_Northern America,subContinent_Caribbean,campaign_AW - Dynamic Search Ads Whole Site,campaign_AW - Accessories,campaign_(not_set),region_Zulia,region_Nebraska,region_Michigan,region_Pichincha,region_Tennessee,region_Illinois,region_New York,region_Washington,region_Massachusetts,region_Colorado,region_South Carolina,region_Texas,region_Georgia,region_Missouri,region_Iowa,region_District of Columbia,region_California,region_Minnesota,region_Utah,region_Arizona,region_Pennsylvania,region_New Jersey,region_Indiana,region_Florida,region_Maryland,region_Connecticut,region_North Carolina,region_Virginia,region_Nevada,region_Ohio,region_Alberta,region_(not_set),region_Ontario,region_Oregon,source_mall.googleplex.com,source_dealspotr.com,source_mail.google.com,source_groups.google.com,source_phandroid.com,source_gdeals.googleplex.com,source_dfa,source_l.facebook.com,source_yahoo,source_google,source_bing,source_sites.google.com,source_(direct),source_facebook.com,referralPath_deal,referralPath_sign,referralPath_google,referralPath_merchandise,referralPath_store,referralPath_emails,referralPath_special,referralPath_coup,referralPath_stor,referralPath_mail,referralPath_com,referralPath_forum,referralPath_merch,referralPath_new,referralPath_url,referralPath_site,referralPath_mountain,referralPath_view,referralPath_php,referralPath_offer,referralPath_googletopia,referralPath_free,referralPath_stuff,referralPath_alphabet,referralPath_discount,keyword_qehscssdk,keyword_googl,keyword_merchandis,keyword_store,keyword_hzbaqlcbjwfgoh,keyword_remarket,keyword_content,keyword_target,keyword_zknv
0,Organic Search,1,desktop,False,Asia,1,1,1,1,(not_set),(not_set),True,0,(not_set),15,Friday,9,2016,2,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Organic Search,1,desktop,False,Oceania,1,1,1,1,(not_set),(not_set),True,0,(not_set),5,Friday,9,2016,2,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Organic Search,1,desktop,False,Europe,1,1,1,1,(not_set),(not_set),True,0,(not_set),1,Friday,9,2016,2,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Organic Search,1,desktop,False,Asia,1,1,1,1,(not_set),(not_set),True,0,(not_set),5,Friday,9,2016,2,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,Organic Search,2,mobile,True,Europe,1,1,0,1,(not_set),(not_set),True,0,(not_set),13,Friday,9,2016,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
clean_train_data_v2 = pd.get_dummies(clean_train_data_v1)

In [10]:
clean_train_data_v2['isMobile'] = clean_train_data_v2['isMobile'].astype('int64')
clean_train_data_v2['isVideoAd'] = clean_train_data_v2['isVideoAd'].astype('int64')

In [12]:
clean_train_data_v3 = pd.concat([id_col, clean_train_data_v2], axis = 1)

In [24]:
clean_train_data_v3.shape

(902755, 199)

In [15]:
clean_train_data_v3.head()

Unnamed: 0,fullVisitorId,visitNumber,isMobile,bounces,hits,newVisits,pageviews,isVideoAd,page,visitHour,monthDate,yearDate,dayDate,domain_(not_set),domain_.us,domain_.net,domain_.com,domain_.edu,domain_.ca,domain_.org,domain_.mx,os_Chrome OS,os_Macintosh,os_Linux,os_iOS,os_Windows,adContent_Google Merchandise Collection,adContent_(not_set),browser_Chrome,browser_Firefox,browser_Internet Explorer,browser_Edge,country_United States,country_Venezuela,country_Puerto Rico,country_Canada,city_Maracaibo,city_Ann Arbor,city_Cambridge,city_San Bruno,city_Chicago,city_Austin,city_Irvine,city_New York,city_Nashville,city_Jersey City,city_Boulder,city_Kirkland,city_Seattle,city_Oakland,city_Denver,city_Sunnyvale,city_San Francisco,city_Pittsburgh,city_Washington,city_Atlanta,city_Los Angeles,city_Mountain View,city_Minneapolis,city_San Antonio,city_Lake Oswego,city_Santa Clara,city_Cupertino,city_Salem,city_San Mateo,city_San Diego,city_Palo Alto,city_Fremont,city_Houston,city_Milpitas,city_Boston,city_Charlotte,city_San Jose,city_Philadelphia,city_Redwood City,city_Portland,city_Phoenix,city_(not_set),city_Toronto,city_Dallas,subContinent_Northern America,subContinent_Caribbean,campaign_AW - Dynamic Search Ads Whole Site,campaign_AW - Accessories,campaign_(not_set),region_Zulia,region_Nebraska,region_Michigan,region_Pichincha,region_Tennessee,region_Illinois,region_New York,region_Washington,region_Massachusetts,region_Colorado,region_South Carolina,region_Texas,region_Georgia,region_Missouri,region_Iowa,region_District of Columbia,region_California,region_Minnesota,region_Utah,region_Arizona,region_Pennsylvania,region_New Jersey,region_Indiana,region_Florida,region_Maryland,region_Connecticut,region_North Carolina,region_Virginia,region_Nevada,region_Ohio,region_Alberta,region_(not_set),region_Ontario,region_Oregon,source_mall.googleplex.com,source_dealspotr.com,source_mail.google.com,source_groups.google.com,source_phandroid.com,source_gdeals.googleplex.com,source_dfa,source_l.facebook.com,source_yahoo,source_google,source_bing,source_sites.google.com,source_(direct),source_facebook.com,referralPath_deal,referralPath_sign,referralPath_google,referralPath_merchandise,referralPath_store,referralPath_emails,referralPath_special,referralPath_coup,referralPath_stor,referralPath_mail,referralPath_com,referralPath_forum,referralPath_merch,referralPath_new,referralPath_url,referralPath_site,referralPath_mountain,referralPath_view,referralPath_php,referralPath_offer,referralPath_googletopia,referralPath_free,referralPath_stuff,referralPath_alphabet,referralPath_discount,keyword_qehscssdk,keyword_googl,keyword_merchandis,keyword_store,keyword_hzbaqlcbjwfgoh,keyword_remarket,keyword_content,keyword_target,keyword_zknv,channelGrouping_(Other),channelGrouping_Affiliates,channelGrouping_Direct,channelGrouping_Display,channelGrouping_Organic Search,channelGrouping_Paid Search,channelGrouping_Referral,channelGrouping_Social,deviceCategory_desktop,deviceCategory_mobile,deviceCategory_tablet,continent_(not_set),continent_Africa,continent_Americas,continent_Asia,continent_Europe,continent_Oceania,campaignCode_(not_set),campaignCode_11251kjhkvahf,adNetworkType_(not_set),adNetworkType_Google Search,adNetworkType_Search partners,slot_(not_set),slot_RHS,slot_Top,dayNameDate_Friday,dayNameDate_Monday,dayNameDate_Saturday,dayNameDate_Sunday,dayNameDate_Thursday,dayNameDate_Tuesday,dayNameDate_Wednesday
0,1131660440785968503,1,0,1,1,1,1,1,0,15,9,2016,2,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0
1,377306020877927890,1,0,1,1,1,1,1,0,5,9,2016,2,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0
2,3895546263509774583,1,0,1,1,1,1,1,0,1,9,2016,2,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0
3,4763447161404445595,1,0,1,1,1,1,1,0,5,9,2016,2,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0
4,27294437909732085,2,1,1,1,0,1,1,0,13,9,2016,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0


In [27]:
cols = clean_train_data_v3.columns
clean_train_data_agg = clean_train_data_v3.groupby(['fullVisitorId'])[cols].sum()

In [None]:
max_cols = ['visitNumber']
mean_cols = ['visitHour', 'monthDate', 'yearDate', 'dayDate']

In [28]:
clean_train_data_agg.head()

Unnamed: 0_level_0,visitNumber,isMobile,bounces,hits,newVisits,pageviews,isVideoAd,page,visitHour,monthDate,yearDate,dayDate,domain_(not_set),domain_.us,domain_.net,domain_.com,domain_.edu,domain_.ca,domain_.org,domain_.mx,os_Chrome OS,os_Macintosh,os_Linux,os_iOS,os_Windows,adContent_Google Merchandise Collection,adContent_(not_set),browser_Chrome,browser_Firefox,browser_Internet Explorer,browser_Edge,country_United States,country_Venezuela,country_Puerto Rico,country_Canada,city_Maracaibo,city_Ann Arbor,city_Cambridge,city_San Bruno,city_Chicago,city_Austin,city_Irvine,city_New York,city_Nashville,city_Jersey City,city_Boulder,city_Kirkland,city_Seattle,city_Oakland,city_Denver,city_Sunnyvale,city_San Francisco,city_Pittsburgh,city_Washington,city_Atlanta,city_Los Angeles,city_Mountain View,city_Minneapolis,city_San Antonio,city_Lake Oswego,city_Santa Clara,city_Cupertino,city_Salem,city_San Mateo,city_San Diego,city_Palo Alto,city_Fremont,city_Houston,city_Milpitas,city_Boston,city_Charlotte,city_San Jose,city_Philadelphia,city_Redwood City,city_Portland,city_Phoenix,city_(not_set),city_Toronto,city_Dallas,subContinent_Northern America,subContinent_Caribbean,campaign_AW - Dynamic Search Ads Whole Site,campaign_AW - Accessories,campaign_(not_set),region_Zulia,region_Nebraska,region_Michigan,region_Pichincha,region_Tennessee,region_Illinois,region_New York,region_Washington,region_Massachusetts,region_Colorado,region_South Carolina,region_Texas,region_Georgia,region_Missouri,region_Iowa,region_District of Columbia,region_California,region_Minnesota,region_Utah,region_Arizona,region_Pennsylvania,region_New Jersey,region_Indiana,region_Florida,region_Maryland,region_Connecticut,region_North Carolina,region_Virginia,region_Nevada,region_Ohio,region_Alberta,region_(not_set),region_Ontario,region_Oregon,source_mall.googleplex.com,source_dealspotr.com,source_mail.google.com,source_groups.google.com,source_phandroid.com,source_gdeals.googleplex.com,source_dfa,source_l.facebook.com,source_yahoo,source_google,source_bing,source_sites.google.com,source_(direct),source_facebook.com,referralPath_deal,referralPath_sign,referralPath_google,referralPath_merchandise,referralPath_store,referralPath_emails,referralPath_special,referralPath_coup,referralPath_stor,referralPath_mail,referralPath_com,referralPath_forum,referralPath_merch,referralPath_new,referralPath_url,referralPath_site,referralPath_mountain,referralPath_view,referralPath_php,referralPath_offer,referralPath_googletopia,referralPath_free,referralPath_stuff,referralPath_alphabet,referralPath_discount,keyword_qehscssdk,keyword_googl,keyword_merchandis,keyword_store,keyword_hzbaqlcbjwfgoh,keyword_remarket,keyword_content,keyword_target,keyword_zknv,channelGrouping_(Other),channelGrouping_Affiliates,channelGrouping_Direct,channelGrouping_Display,channelGrouping_Organic Search,channelGrouping_Paid Search,channelGrouping_Referral,channelGrouping_Social,deviceCategory_desktop,deviceCategory_mobile,deviceCategory_tablet,continent_(not_set),continent_Africa,continent_Americas,continent_Asia,continent_Europe,continent_Oceania,campaignCode_(not_set),campaignCode_11251kjhkvahf,adNetworkType_(not_set),adNetworkType_Google Search,adNetworkType_Search partners,slot_(not_set),slot_RHS,slot_Top,dayNameDate_Friday,dayNameDate_Monday,dayNameDate_Saturday,dayNameDate_Sunday,dayNameDate_Thursday,dayNameDate_Tuesday,dayNameDate_Wednesday
fullVisitorId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1
4823595352351,1,0,1,1,1,1,1,0,14,11,2016,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5103959234087,1,1,0,10,1,8,1,0,22,8,2016,21,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10278554503158,1,0,0,11,1,8,1,0,5,10,2016,20,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
20424342248747,1,0,0,17,1,13,1,0,7,11,2016,30,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
26722803385797,1,0,0,3,1,2,1,0,10,6,2017,5,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [22]:
clean_train_data_agg = (clean_train_data_v3
                        .groupby(['fullVisitorId'])
                        .agg({'visitNumber': 'sum',
                             'isMobile': 'sum',
                             'bounces': 'saum',
                             'hits': 'sum',
                             'newVisits': 'sum'}))

In [23]:
clean_train_data_agg.head()

Unnamed: 0_level_0,visitNumber,isMobile,bounces
fullVisitorId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4823595352351,1,0,1
5103959234087,1,1,0
10278554503158,1,0,0
20424342248747,1,0,0
26722803385797,1,0,0


In [17]:
clean_train_data_agg.shape

(716877, 1)

In [18]:
clean_train_data_agg.head()

Unnamed: 0_level_0,isMobile
fullVisitorId,Unnamed: 1_level_1
4823595352351,0
5103959234087,1
10278554503158,0
20424342248747,0
26722803385797,0


In [28]:
scaler = MinMaxScaler()
scaled_clean_train_data_v2 = scaler.fit_transform(clean_train_data_v2)
3pd.DataFrame(scaled_clean_train_data_v2).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197
0,0.0,0.0,1.0,0.0,1.0,0.002132,1.0,0.0,0.652174,0.727273,0.0,0.033333,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.002132,1.0,0.0,0.217391,0.727273,0.0,0.033333,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.002132,1.0,0.0,0.043478,0.727273,0.0,0.033333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.002132,1.0,0.0,0.217391,0.727273,0.0,0.033333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.002538,1.0,1.0,0.0,0.0,0.002132,1.0,0.0,0.565217,0.727273,0.0,0.033333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# SMOTE on Subset of Train Data