In [20]:
import matplotlib.pyplot as plt
import os
import json
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import random
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import classification_report
from scipy import stats


In [16]:
def load_df(csv_path='./data/train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    p=0.1
    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId': 'str'}, nrows=nrows, # Important!!
                      skiprows=lambda i: i > 0 and random.random() > p)

    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

train_df = load_df()
print("loaded")

Loaded train.csv. Shape: (90004, 54)
loaded


In [17]:
pd.set_option('display.max_columns', None)

train_df["totals.transactionRevenue"] = train_df["totals.transactionRevenue"].astype('float')
train_df['shops or not'] = train_df['totals.transactionRevenue'].values > 0

print(pd.value_counts(train_df['shops or not']))

def date_format(df):
    df['date'] = pd.to_datetime(df['date'])
    df['vis_date'] = pd.to_datetime(df['visitStartTime'])
    df['sess_date_dow'] = df['vis_date'].dt.dayofweek
    df['sess_date_hours'] = df['vis_date'].dt.hour
    df['sess_date_dom'] = df['vis_date'].dt.day

date_format(train_df)

# excluded_features = [
#     'date', 'fullVisitorId', 'sessionId', 'totals.transactionRevenue',
#     'visitId', 'visitStartTime', 'vis_date'
# ]
categorical_features = [
    _f for _f in train_df.columns
    if (train_df[_f].dtype == 'object')
]

#print(categorical_features)

for f in categorical_features:
    train_df[f], indexer = pd.factorize(train_df[f])

A=train_df.fillna(0)
X=A.drop('shops or not',axis=1)
L=X.drop('date',axis=1)
Z=L.drop('vis_date',axis=1)
M=Z.drop('totals.transactionRevenue',axis=1)
y=train_df['shops or not']


False    88853
True      1151
Name: shops or not, dtype: int64


In [18]:
from sklearn.model_selection import KFold, GroupKFold

folds = GroupKFold(n_splits=5)
oof_clf_preds = np.zeros(M.shape[0])

for fold_, (trn_, val_) in enumerate(folds.split(M, y, groups=M['fullVisitorId'])):
    trn_x, trn_y = M.iloc[trn_], y.iloc[trn_]
    val_x, val_y = M.iloc[val_], y.iloc[val_]


    DecTreeModel = DecisionTreeClassifier()
    DecTreeModel.fit(trn_x,trn_y)
    oof_clf_preds[val_] = DecTreeModel.predict_proba(val_x)[:,1]

print(classification_report(y, oof_clf_preds))    
print("prob")
print(M.shape)
print(oof_clf_preds.shape)


             precision    recall  f1-score   support

      False       0.99      0.99      0.99     88853
       True       0.29      0.31      0.30      1151

avg / total       0.98      0.98      0.98     90004

prob
(90004, 55)
(90004,)


In [22]:
M['non_zero_proba'] = oof_clf_preds
print(M.shape)
from sklearn.tree import DecisionTreeRegressor

oof_reg_preds = np.zeros(M.shape[0])

y1 = train_df["totals.transactionRevenue"].fillna(0)

for fold_, (trn_, val_) in enumerate(folds.split(M, y1, groups=M['fullVisitorId'])):
    trn_x, trn_y = M.iloc[trn_], y1.iloc[trn_]
    val_x, val_y = M.iloc[val_], y1.iloc[val_]

    DecTreeReg = DecisionTreeRegressor()
    DecTreeReg.fit(trn_x,trn_y)
    oof_reg_preds[val_] = DecTreeReg.predict(val_x)


print(stats.describe(y1))    
print(stats.describe(oof_reg_preds))


from sklearn.metrics import mean_squared_error    
mean_squared_error(np.log1p(y1), np.log1p(oof_reg_preds)) ** .5 


(90004, 56)
DescribeResult(nobs=90004, minmax=(0.0, 5614440000.0), mean=1462976.2010577307, variance=997153931554115.2, skewness=86.08508734383462, kurtosis=12488.317430445491)
DescribeResult(nobs=90004, minmax=(0.0, 3022200000.0), mean=1715277.2099017822, variance=996332883577032.0, skewness=48.544392880663395, kurtosis=3501.1210039267703)


2.471496162184628