In [7]:
import matplotlib.pyplot as plt
import os
import json
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import random
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import classification_report




def load_df(csv_path='./data/train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    p=0.1
    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId': 'str'}, nrows=nrows, # Important!!
                      skiprows=lambda i: i > 0 and random.random() > p)

    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

train_df = load_df()
pd.set_option('display.max_columns', None)
# print(train_df.head())
# shops_or_not=lambda x : x.train_df.totals.transactionRevenue > 0
train_df["totals.transactionRevenue"] = train_df["totals.transactionRevenue"].astype('float')
train_df['shops or not'] = train_df['totals.transactionRevenue'].values > 0
# y_clf = (train_df['totals.transactionRevenue'].fillna(0) > 0).astype(np.uint8)
print(pd.value_counts(train_df['shops or not']))
# print(pd.value_counts(y_clf))

def date_format(df):
    df['date'] = pd.to_datetime(df['date'])
    df['vis_date'] = pd.to_datetime(df['visitStartTime'])
    df['sess_date_dow'] = df['vis_date'].dt.dayofweek
    df['sess_date_hours'] = df['vis_date'].dt.hour
    df['sess_date_dom'] = df['vis_date'].dt.day

date_format(train_df)

# excluded_features = [
#     'date', 'fullVisitorId', 'sessionId', 'totals.transactionRevenue',
#     'visitId', 'visitStartTime', 'vis_date'
# ]
categorical_features = [
    _f for _f in train_df.columns
    if (train_df[_f].dtype == 'object')
]

#print(categorical_features)

for f in categorical_features:
    train_df[f], indexer = pd.factorize(train_df[f])

A=train_df.fillna(0)
X=A.drop('shops or not',axis=1)
L=X.drop('date',axis=1)
Z=L.drop('vis_date',axis=1)


M=Z.drop('totals.transactionRevenue',axis=1)
y=train_df['shops or not']

# print(X)
X_train, X_test, y_train, y_test = train_test_split(M, y, test_size=0.20, random_state=101)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

DecTreeModel = DecisionTreeClassifier()
DecTreeModel.fit(X_train,y_train)

predictions = DecTreeModel.predict(X_test)

print(classification_report(y_test, predictions))

#from sklearn.model_selection import cross_val_score

#accuracy = cross_val_score(DecTreeModel, M, y, cv=5,scoring='accuracy')
#print('Accuracy : ', np.mean(accuracy))
#recall = cross_val_score(DecTreeModel, M, y, cv=5,scoring='recall')
##print(scores_final)
#print('Precision : ', np.mean(recall))

from sklearn.tree import DecisionTreeRegressor

y1 = train_df["totals.transactionRevenue"].fillna(0)
X_train1, X_test1, y_train1, y_test1 = train_test_split(M, y1, test_size=0.20, random_state=101)

#scaler = StandardScaler()
#scaler.fit(X_train1)
#X_train1 = scaler.transform(X_train1)
#X_test1 = scaler.transform(X_test1)

DecTreeReg = DecisionTreeRegressor()
DecTreeReg.fit(X_train1,y_train1)

predictions1 = DecTreeReg.predict(X_test1)

from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_test1, predictions1))


Loaded train.csv. Shape: (90405, 54)
False    89317
True      1088
Name: shops or not, dtype: int64
             precision    recall  f1-score   support

      False       0.99      0.99      0.99     17878
       True       0.26      0.31      0.28       203

avg / total       0.98      0.98      0.98     18081

1653574003235440.5


In [10]:


X_Reg=Z.loc[Z['totals.transactionRevenue']>0]
Y_Reg=X_Reg['totals.transactionRevenue']
X_Reg1=X_Reg.drop('totals.transactionRevenue',axis=1)


print(X_Reg.shape)

X_train1, X_test1, y_train1, y_test1 = train_test_split(X_Reg1, Y_Reg, test_size=0.20)

scaler = StandardScaler()
scaler.fit(X_train1)
X_train1 = scaler.transform(X_train1)
X_test1 = scaler.transform(X_test1)

DecTreeReg = DecisionTreeRegressor()
DecTreeReg.fit(X_train1,y_train1)

predictions1 = DecTreeReg.predict(X_test1)

from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_test1, predictions1))
print(y_test1,predictions1)



(1088, 56)
1.1633981804633026e+17
42346      1550000.0
81927    139000000.0
7737      69150000.0
15016     59990000.0
68565     79990000.0
10744     25820000.0
13546     16990000.0
62994    101940000.0
83271     44270000.0
54904     72270000.0
46799     44370000.0
63001    194930000.0
36159     11190000.0
36154     79990000.0
52900     57970000.0
72188    359930000.0
89107    195360000.0
10621     71700000.0
31834    491250000.0
75559     27190000.0
76769     19990000.0
73448     47970000.0
41146     30390000.0
15023     92410000.0
55498     32460000.0
30265    368900000.0
30976    121450000.0
53829    167400000.0
17862     47880000.0
49250     44790000.0
            ...     
80691    404940000.0
87651     16990000.0
45713     62150000.0
87653     53980000.0
82862     38160000.0
85792     60980000.0
32095     27160000.0
41690     18990000.0
73461     24990000.0
5708      47570000.0
58971     19190000.0
46536     33590000.0
60912     18990000.0
6979      17590000.0
76350    156820000.0
