In [78]:
import matplotlib.pyplot as plt
import os
import json
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import random
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import classification_report
from scipy import stats


In [79]:
def load_df(csv_path='./data/train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    p=0.1
    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId': 'str'}, nrows=nrows, # Important!!
                      skiprows=lambda i: i > 0 and random.random() > p)

    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

train_df = load_df()
print("loaded")


Loaded train.csv. Shape: (90463, 54)
loaded


In [80]:
train_df["totals.transactionRevenue"] = train_df["totals.transactionRevenue"].astype('float')
train_df['shops or not'] = train_df['totals.transactionRevenue'].values > 0

print(pd.value_counts(train_df['shops or not']))

def date_format(df):
    df['date'] = pd.to_datetime(df['date'])
    df['vis_date'] = pd.to_datetime(df['visitStartTime'])
    df['sess_date_dow'] = df['vis_date'].dt.dayofweek
    df['sess_date_hours'] = df['vis_date'].dt.hour
    df['sess_date_dom'] = df['vis_date'].dt.day
ec
date_format(train_df)


categorical_features = [
    _f for _f in train_df.columns
    if (train_df[_f].dtype == 'object')
]

#print(categorical_features)

for f in categorical_features:
    train_df[f], indexer = pd.factorize(train_df[f])

A=train_df.fillna(0)
X=A.drop('shops or not',axis=1)
L=X.drop('date',axis=1)
Z=L.drop('vis_date',axis=1)
#M=Z.drop('totals.transactionRevenue',axis=1)
M=Z.drop(['totals.transactionRevenue','socialEngagementType','totals.visits','device.browserVersion','device.flashVersion',
          'device.language','device.browserSize','device.mobileDeviceInfo','device.mobileDeviceMarketingName',
          'device.mobileDeviceModel','device.mobileInputSelector','device.operatingSystemVersion',
          'device.screenColors','device.screenResolution','geoNetwork.cityId','geoNetwork.latitude',
          'geoNetwork.longitude','geoNetwork.networkLocation','trafficSource.adwordsClickInfo.criteriaParameters'],axis=1)

y=train_df['shops or not']


False    89303
True      1160
Name: shops or not, dtype: int64


In [68]:
#implementing over sampling
# from imblearn.over_sampling import SMOTE

# sm = SMOTE(random_state=2)
# M_new,y_new = sm.fit_sample(M, y)

# print(M.shape)
# print(y.shape)
# print(M_new.shape)
# print(y_new.shape)


In [81]:
# feature selection
from sklearn.ensemble import ExtraTreesClassifier
# here criterion='entropy'  for the information gain, 'gini' for the Gini impurity 
model = ExtraTreesClassifier(criterion='entropy')
model.fit(M,y)

imp_df = pd.DataFrame()
imp_df['feature'] = M.columns
imp_df['importance'] = model.feature_importances_

print(imp_df.sort_values('importance', ascending=False))

imp_df = imp_df[imp_df['importance'].astype('float') < 0.0005] 

dropFeatures = imp_df['feature'].values 
print(dropFeatures)

M=M.drop(dropFeatures,axis=1)

                                         feature  importance
19                                   totals.hits    0.245532
21                              totals.pageviews    0.172312
18                                totals.bounces    0.078479
3                                        visitId    0.043439
5                                 visitStartTime    0.042364
1                                  fullVisitorId    0.041529
2                                      sessionId    0.041465
15                      geoNetwork.networkDomain    0.031453
17                       geoNetwork.subContinent    0.031095
20                              totals.newVisits    0.026020
11                               geoNetwork.city    0.025116
10                        device.operatingSystem    0.024839
14                              geoNetwork.metro    0.024162
16                             geoNetwork.region    0.023061
29                    trafficSource.isTrueDirect    0.020598
4                       

In [86]:
print(M.shape)

(90463, 34)


In [96]:
from sklearn.model_selection import KFold, GroupKFold
from imblearn.over_sampling import SMOTE


folds = GroupKFold(n_splits=5)
oof_clf_preds = np.zeros(M.shape[0])

for fold_, (trn_, val_) in enumerate(folds.split(M, y, groups=M['fullVisitorId'])):
    trn_x, trn_y = M.iloc[trn_], y.iloc[trn_]
    val_x, val_y = M.iloc[val_], y.iloc[val_]

#using smote     
#     sm = SMOTE(random_state=2)
#     trn_xSampled,trn_ySampled = sm.fit_sample(trn_x, trn_y)
 
#using class weight for imbalanced data
    DecTreeModel = DecisionTreeClassifier(class_weight={0:1,1:7})
    DecTreeModel.fit(trn_x,trn_y)
    oof_clf_preds[val_] = DecTreeModel.predict_proba(val_x)[:,1]

print(classification_report(y, oof_clf_preds))    
print("prob")
print(M.shape)
print(oof_clf_preds.shape)


              precision    recall  f1-score   support

       False       0.99      0.99      0.99     89303
        True       0.32      0.32      0.32      1160

   micro avg       0.98      0.98      0.98     90463
   macro avg       0.66      0.66      0.66     90463
weighted avg       0.98      0.98      0.98     90463

prob
(90463, 34)
(90463,)


In [85]:
M['non_zero_proba'] = oof_clf_preds
print(M.shape)
from sklearn.tree import DecisionTreeRegressor

oof_reg_preds = np.zeros(M.shape[0])

y1 = train_df["totals.transactionRevenue"].fillna(0)

for fold_, (trn_, val_) in enumerate(folds.split(M, y1, groups=M['fullVisitorId'])):
    trn_x, trn_y = M.iloc[trn_], y1.iloc[trn_]
    val_x, val_y = M.iloc[val_], y1.iloc[val_]
    
    DecTreeReg = DecisionTreeRegressor()
    DecTreeReg.fit(trn_x,trn_y)
    oof_reg_preds[val_] = DecTreeReg.predict(val_x)


print(stats.describe(y1))    
print(stats.describe(oof_reg_preds))


from sklearn.metrics import mean_squared_error    
mean_squared_error(np.log1p(y1), np.log1p(oof_reg_preds)) ** .5 


(90463, 34)
DescribeResult(nobs=90463, minmax=(0.0, 6826960000.0), mean=1691131.0701612814, variance=1880908061376809.5, skewness=85.12569106511955, kurtosis=10180.543444369127)
DescribeResult(nobs=90463, minmax=(0.0, 4272000000.0), mean=2162940.6497684135, variance=1974098170369087.2, skewness=55.78807037519845, kurtosis=4333.594858901213)


2.531384996418512