In [5]:
import matplotlib.pyplot as plt
import os
import json
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import random
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')



def load_df(csv_path='./data/train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    p=0.1
    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId': 'str'}, nrows=nrows, # Important!!
                      skiprows=lambda i: i > 0 and random.random() > p)

    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

train_df = load_df()
pd.set_option('display.max_columns', None)
# print(train_df.head())
# shops_or_not=lambda x : x.train_df.totals.transactionRevenue > 0
train_df["totals.transactionRevenue"] = train_df["totals.transactionRevenue"].astype('float')
train_df['shops or not'] = train_df['totals.transactionRevenue'].values > 0
# y_clf = (train_df['totals.transactionRevenue'].fillna(0) > 0).astype(np.uint8)
print(pd.value_counts(train_df['shops or not']))
# print(pd.value_counts(y_clf))

def date_format(df):
    df['date'] = pd.to_datetime(df['date'])
    df['vis_date'] = pd.to_datetime(df['visitStartTime'])
    df['sess_date_dow'] = df['vis_date'].dt.dayofweek
    df['sess_date_hours'] = df['vis_date'].dt.hour
    df['sess_date_dom'] = df['vis_date'].dt.day

date_format(train_df)

# excluded_features = [
#     'date', 'fullVisitorId', 'sessionId', 'totals.transactionRevenue',
#     'visitId', 'visitStartTime', 'vis_date'
# ]
categorical_features = [
    _f for _f in train_df.columns
    if (train_df[_f].dtype == 'object')
]

#print(categorical_features)

for f in categorical_features:
    train_df[f], indexer = pd.factorize(train_df[f])

A=train_df.fillna(0)
X=A.drop('shops or not',axis=1)
L=X.drop('date',axis=1)
Z=L.drop('vis_date',axis=1)
M=Z.drop('totals.transactionRevenue',axis=1)
y=train_df['shops or not']

# print(X)
X_train, X_test, y_train, y_test = train_test_split(M, y, test_size=0.20, random_state=101)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

DecTreeModel = DecisionTreeClassifier()
DecTreeModel.fit(X_train,y_train)

predictions = DecTreeModel.predict(X_test)

#from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
#accuracy=accuracy_score(y_test,predictions)
#print(accuracy)
#kfoldacc=cross_val_score(logmodel,Z,y, cv=5)

accuracy = cross_val_score(DecTreeModel, M, y, cv=5,scoring='accuracy')
print('Accuracy : ', np.mean(accuracy))
recall = cross_val_score(DecTreeModel, M, y, cv=5,scoring='recall')
#print(scores_final)
print('Precision : ', np.mean(recall))


Loaded train.csv. Shape: (90432, 54)
False    89239
True      1193
Name: shops or not, dtype: int64
Accuracy :  0.915596159445732
Precision :  0.43575120424738933
