In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the dataset directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('../datasets/ecommerce-dataset'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. EDA (Exploratory Data Analysis)

In [None]:
# load event dataset.
e_df=pd.read_csv('../datasets/ecommerce-dataset/events.csv')
e_df.head()

In [None]:
# view the number of data
print("This data contains ",e_df.shape[0], "cases of user activities in e-commerce website")

In [None]:
# view missing values in each columns
e_df.isnull().sum()

### Except for 'transactionid', no column has missing values.

In [None]:
# view counts by event type
e_df.event.value_counts()

In [None]:
e_df.groupby('event')['transactionid'].count()

In [None]:
e_df.groupby('event')['visitorid'].count()

### All transaction events have transaction ID.

In [None]:
# Drop rows that are exact duplicates across every column,
# reporting the row count before and after the clean-up.
rows_before = e_df.shape[0]
print('Number of rows before removing duplicates: ', rows_before)
msk = e_df.duplicated()  # True for each repeat of an earlier, fully identical row
e_df = e_df.loc[~msk].reset_index(drop=True)
print('Number of rows after removing duplicates: ', len(e_df))

### The dates in this dataset are stored as Unix timestamps in milliseconds (the number of milliseconds since 1970-01-01 00:00 UTC), so I'll convert them into readable dates.

In [None]:
# Convert the Unix-epoch millisecond timestamps to readable GMT/UTC date strings.
#
# pd.to_datetime(..., unit='ms') is vectorised and interprets the values as UTC,
# whereas datetime.fromtimestamp() converts to the kernel's *local* timezone,
# which contradicts the "GMT" statement and makes the result machine dependent.
# It also avoids binding a global named `list`, which shadowed the builtin.
utc_times = pd.to_datetime(e_df['timestamp'], unit='ms')

# Keep the 'YYYY-MM-DD HH:MM:SS.mmm' string format (microseconds trimmed to
# milliseconds): later cells sort on and re-parse these strings.
e_df['date_time'] = utc_times.dt.strftime('%Y-%m-%d %H:%M:%S.%f').str[:-3]
e_df.head()


In [None]:
print('Start Date of Dataset: ', e_df['date_time'].min())
print('End Date of Dataset: ', e_df['date_time'].max())

### This dataset is recorded from 3rd May, 2015 to 18th September, 2015 (GMT).

In [None]:
# Distribution of events (left panel) and number of unique visitors per
# event type (right panel).
totalcases = e_df.shape[0]
sns.set_style('whitegrid')

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# --- left panel: number of events per type ---------------------------------
sns.histplot(x='event', data=e_df, bins=3, ax=ax1)
ax1.set_ylabel('Number of Events (Unit: million)')
ax1.set_title('Distribution by Event Type')
# Iterate over value_counts() so every event name is paired with its own count.
# Zipping unique() (order of appearance) with value_counts() (sorted by
# frequency) only lines up by coincidence.
for event, counts in e_df['event'].value_counts().items():
    text = str(round((counts / totalcases) * 100, 2)) + '%'
    ax1.text(event, counts, text, fontsize=12)

# --- right panel: unique number of visitors by event -----------------------
events = e_df['event'].unique().tolist()
total_visitors = e_df['visitorid'].nunique(dropna=False)  # loop-invariant, computed once
unique_num = []
ratios = []
for event in events:
    uni_visit = len(e_df.loc[e_df['event'] == event, 'visitorid'].unique())
    unique_num.append(uni_visit)
    ratios.append(uni_visit / total_visitors)

sns.barplot(x=events, y=unique_num, ax=ax2)

# Annotate each bar with its share of all unique visitors.
ratio_p = [str(round(r * 100, 2)) + '%' for r in ratios]
for position, (c, r) in enumerate(zip(unique_num, ratio_p)):
    ax2.text(position, c, r, fontsize=12)

ax2.set_title('Unique Number of Visitors')
plt.show()

### When analysing the distribution of events, 'View' occupies 96.67%, 'Add to cart' 2.52%, 'Transaction' 0.81%.
### The number of unique visitors was around half of the total number of events for all three event types. For example, there are about 2.6 million views in total, but the number of unique visitors who viewed is about half of that, which means that on average one visitor generated 2 events.

## Q1. How many times did a customer view an item before making purchase decision?

In [None]:
# Build one frame per event type holding visitor id, item id and event time.
# A single .loc selection followed by .copy() replaces the chained
# e_df[cols][mask] indexing: each frame is then an independent object, so the
# later in-place edits (adding 'purchase', converting 'date_time') do not raise
# SettingWithCopyWarning or depend on view/copy semantics.
_event_cols = ['visitorid', 'itemid', 'date_time']

# visitor id, item id and date time of 'transaction'
item_tra = e_df.loc[e_df['event'] == 'transaction', _event_cols].copy()
# visitor id, item id and date time of 'add to cart'
item_atc = e_df.loc[e_df['event'] == 'addtocart', _event_cols].copy()
# visitor id, item id and date time of 'view'
item_viw = e_df.loc[e_df['event'] == 'view', _event_cols].copy()

In [None]:
# Create a dataframe of (visitor, item) pairs found in all three event types.
#
# The add-to-cart frame is reduced to one row per (visitor, item) first (the
# earliest one; the ISO-formatted strings sort chronologically). It is only
# needed to restrict the pairs; without this step every extra add-to-cart event
# would duplicate all of that pair's view rows and inflate the view counts
# computed further down.
# NOTE(review): a visitor who bought the same item in several transactions
# still yields one copy of the views per transaction - confirm whether those
# should be treated as separate journeys.
pair_keys = ['visitorid', 'itemid']
first_atc = item_atc.sort_values('date_time').drop_duplicates(subset=pair_keys)

m = item_tra.merge(first_atc, how='inner', on=pair_keys,
                   suffixes=[' (transaction)', ' (add_to_cart)'])
m = m.merge(item_viw, how='inner', on=pair_keys)
m = m.rename(columns={'date_time': 'date_time (view)'})
m.head()

### 'date_time (transaction)'

In [None]:
# Parse the three timestamp columns of the merged frame into datetime64
# so that they can be subtracted from each other.
for _date_col in ('date_time (transaction)', 'date_time (add_to_cart)', 'date_time (view)'):
    m[_date_col] = pd.to_datetime(m[_date_col])

In [None]:
# Keep only the cases in which the visitor viewed the item *before* buying it.

# time elapsed between each view and the matching transaction
view_to_purchase = m['date_time (transaction)'] - m['date_time (view)']

# rows whose elapsed time is strictly greater than 0 minutes
msk = view_to_purchase > np.timedelta64(0, 'm')
m1 = m.loc[msk]

### In the dataframe above, multiple views are mixed in for one transaction, which means there are cases that a visitor checked the item multiple times. I'll check both timelines: one is from the first item view to transaction, and other is from the last item view to transaction.

In [None]:
# Separate multiple-view transactions from single-view transactions.

# A (visitor, item) pair that occurs more than once in m1 was viewed several times.
_pair = ['visitorid', 'itemid']
_viewed_repeatedly = m1.duplicated(subset=_pair, keep=False)
sig_viw = m1[~_viewed_repeatedly]
mul_viw = m1[_viewed_repeatedly]

# The last occurrence within each pair is treated as the view made at purchase
# time, so flag every row except that last occurrence.
notlast = mul_viw.duplicated(subset=_pair, keep='last')

# Number of flagged views per pair, and the mean of those counts.
_views_per_pair = mul_viw[notlast].groupby(_pair).count()['date_time (view)']
avg_viw = _views_per_pair.mean()
print('Average Number of Views Before Purchase: {0:.0f}'.format(avg_viw))

In [None]:
# view basic statistics to see outlier
mul_viw[notlast].groupby(['visitorid','itemid']).count()['date_time (view)'].describe()

### There are some outliers such as 844 views for buying a single item, and hence I will check the distribution of view counts to have a better idea about general purchase cases. 

In [None]:
# Number of earlier views per (visitor, item) pair, kept as a one-column frame.
n_viw = (
    mul_viw[notlast]
    .groupby(['visitorid', 'itemid'])[['date_time (view)']]
    .count()
)
n_viw.head()

### The first row in the dataframe above indicates that Visitor(id=172) had viewed the item(id=10034) four times before resolving to purchase. The second row is about another item the same visitor purchased and this time, the visitor viewed that item five times.  

In [None]:
# Tabulate how many purchases were preceded by each number of earlier views.
# Building the table from the Series' value_counts() with an explicit axis name
# and value name does not depend on how a given pandas version names the result
# of DataFrame.value_counts().
_view_col = 'date_time (view)'
n_viw2 = (
    n_viw[_view_col]
    .value_counts()
    .rename_axis(_view_col)
    .reset_index(name='count')
)

# Add a row for instant purchases: the single view at transaction time counts
# as 0 earlier views. An integer 0 (not the string '0') keeps the column
# numeric, so it can still be summed and sorted.
n_viw2.loc[len(n_viw2)] = [0, sig_viw.shape[0]]

# Add each row's share in percent, ordered from most to least frequent.
n_viw2 = n_viw2.sort_values('count', ascending=False)
n_viw2['share (%)'] = ((n_viw2['count'] / (n_viw2['count'].sum())) * 100).round(2)
n_viw2 = n_viw2.reset_index(drop=True)
n_viw2

In [None]:
# Keep the five most frequent view counts and fold everything else into 'others'.
_TOP_N = 5
_view_col = n_viw2.columns[0]  # first column holds the number of earlier views

n_viw3 = n_viw2.iloc[:_TOP_N].copy()
_rest = n_viw2.iloc[_TOP_N:]
n_viw3.loc[len(n_viw3)] = ['others', _rest['count'].sum(), _rest['share (%)'].sum()]


def _view_label(value):
    """Return the pie-slice label for one entry of the view-count column."""
    if value == 'others':
        return 'others'
    if str(value) == '0':  # the instant-purchase row may be stored as 0 or '0'
        return 'instant purchase'
    return 'view {}'.format(int(value))


# Derive the index labels from the data rather than hard-coding them, so each
# slice is always labelled with the view count it actually represents.
n_viw3['index'] = [_view_label(v) for v in n_viw3[_view_col]]
n_viw3 = n_viw3.set_index(['index'])

In [None]:
# Pie chart: share of transactions by number of item views before the purchase.
pie_ax = n_viw3.plot(
    y='share (%)',
    kind='pie',
    autopct='%1.1f%%',
    shadow=True,
    startangle=-90,
    legend=False,
    figsize=(8, 8),
    fontsize=20,
)
pie_ax.set_title('The Number of Item Views Before Purchase Decision', fontsize=20, pad=20)
pie_ax.set_ylabel('')
plt.show()

### When analysing the number of item views, I found that around 50% of transactions were made without viewing the item more than once: a visitor checked an item, added it to the cart and checked out. About 30% of transactions were made after the buyer had viewed the item once or twice. In summary, 80% of all transactions were made after fewer than three item views.

## Q2. How long does it take for a single transaction to be made?

In [None]:
def _split_timedelta(delta):
    """Break a Timedelta into whole (days, hours, minutes, seconds) components."""
    hours, remainder = divmod(delta.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return delta.days, hours, minutes, seconds


# single view
# average time a visitor takes from the (only) view to the purchase
diff_s = sig_viw['date_time (transaction)'] - sig_viw['date_time (view)']
avg_time = diff_s.mean()
days, hrs, mins, sec = _split_timedelta(avg_time)
# The message below has no "days" field, so fold whole days into the hours.
# Reading only `.seconds` would silently drop them.
hrs += days * 24
print('Time Period From Single View to Transaction: %s hour %s mininutes %s seconds \n' % (hrs, mins, sec))

# multiple view
# average time a visitor takes from a view to the purchase

# Sort chronologically within each (visitor, item) pair so that 'first' and
# 'last' below really mean the earliest and the latest view.
mul_viw = mul_viw.sort_values(['visitorid', 'itemid', 'date_time (view)'])
mul_viw = mul_viw.reset_index(drop=True)

# initial view to transaction
notinitial = mul_viw.duplicated(subset=['visitorid', 'itemid'], keep='first')  # all but the first occurrence
mul_viw1 = mul_viw[~notinitial]  # only the first occurrence

diff_m1 = mul_viw1['date_time (transaction)'] - mul_viw1['date_time (view)']
avg_time1 = diff_m1.mean()
totaldays1, hrs1, mins1, sec1 = _split_timedelta(avg_time1)
print('Time Period From Multiple View (initial) to Transaction: %s days %s hours %s mininutes %s seconds' % (totaldays1, hrs1, mins1, sec1))

# last view to transaction
notlast = mul_viw.duplicated(subset=['visitorid', 'itemid'], keep='last')  # all but the last occurrence
mul_viw2 = mul_viw[~notlast]  # only the last occurrence

diff_m2 = mul_viw2['date_time (transaction)'] - mul_viw2['date_time (view)']
avg_time2 = diff_m2.mean()
totaldays2, hrs2, mins2, sec2 = _split_timedelta(avg_time2)
print('Time Period From Multiple View (last) to Transaction: %s days %s hours %s mininutes %s seconds' % (totaldays2, hrs2, mins2, sec2))

In [None]:
def quan_list(percentile):
    """Return the given quantile of the three view-to-purchase delays, in seconds.

    Parameters
    ----------
    percentile : float
        Quantile to evaluate, between 0 and 1 (e.g. 0.25).

    Returns
    -------
    list of int
        Whole seconds for, in order: single view (``diff_s``), multiple views
        measured from the initial view (``diff_m1``) and multiple views
        measured from the last view (``diff_m2``). These three Series are read
        from the notebook's global scope.
    """
    # total_seconds() includes whole days; `.seconds` only holds the
    # sub-day remainder and would wrap around for delays of a day or more.
    return [
        int(diff.quantile(percentile).total_seconds())
        for diff in (diff_s, diff_m1, diff_m2)
    ]

In [None]:
def time_cal(column):
    """Format a Series of durations in seconds as 'XhYmZs' strings.

    Parameters
    ----------
    column : pandas.Series
        Durations in seconds; the index labels are irrelevant.

    Returns
    -------
    list of str
        One 'XhYmZs' string per element, in the Series' order.
    """
    hr, remainder = divmod(column, 3600)
    mins, sec = divmod(remainder, 60)
    # Iterate over the values themselves: `hr[i]` with an integer i on a
    # label-indexed Series relies on pandas' deprecated positional fallback.
    return ['%sh%sm%ss' % parts for parts in zip(hr, mins, sec)]

In [None]:
# Quartiles (in seconds) of the view-to-transaction delay per journey type.
q1 = quan_list(.25)
q2 = quan_list(.50)
q3 = quan_list(.75)

data = {'Transaction Type': ['Single View', 'Multiple Views(initial)', 'Multiple Views(last)'],
        '25th percentile': q1,
        '50th percentile': q2,
        '75th percentile': q3}
data = pd.DataFrame(data)
data = data.set_index('Transaction Type')
t_data = data.transpose()

# Readable 'XhYmZs' labels, one list per journey type
# (each list is ordered 25th / 50th / 75th percentile).
single_labels = time_cal(t_data['Single View'])
initial_labels = time_cal(t_data['Multiple Views(initial)'])
last_labels = time_cal(t_data['Multiple Views(last)'])

f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(10, 8))

# plot the same data on both axes
data.plot(ax=ax1, kind='bar')
data.plot(ax=ax2, kind='bar')

# zoom-in / limit the view to different portions of the data
ax1.set_ylim(30000, 35000)  # outliers only
ax2.set_ylim(0, 2000)  # most of the data

# Hide the spines *between* ax1 (top panel) and ax2 (bottom panel): that is the
# bottom edge of ax1 and the top edge of ax2, where the axis break is drawn.
ax1.spines['bottom'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax1.xaxis.tick_top()
ax1.tick_params(labeltop=False)  # no tick labels at the top ('off' is a truthy string)
ax2.xaxis.tick_bottom()
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, ha="right", fontsize=15)
ax2.legend('')

# Cut-out diagonal marks at the break. In axes coordinates (always 0-1) the
# spine endpoints sit at (0,0), (0,1), (1,0) and (1,1), so the marks are drawn
# at the lower corners of ax1 and the upper corners of ax2 with clipping off.
d = .015  # how big to make the diagonal lines in axes coordinates
# arguments to pass to plot, just so we don't keep repeating them
kwargs = dict(transform=ax1.transAxes, color='k', clip_on=False)
ax1.plot((-d, +d), (-d, +d), **kwargs)        # top panel, left diagonal
ax1.plot((1 - d, 1 + d), (-d, +d), **kwargs)  # top panel, right diagonal

kwargs.update(transform=ax2.transAxes)  # switch to the bottom axes
ax2.plot((-d, +d), (1 - d, 1 + d), **kwargs)  # bottom panel, left diagonal
ax2.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)  # bottom panel, right diagonal

# Annotate every bar with its readable duration. The x offsets place the text
# over the 25th / 50th / 75th percentile bar of each group.
for x_pos, y, qt in zip([-0.4, -0.1, 0], t_data['Single View'], single_labels):
    plt.text(x_pos, y + 500, qt, fontsize=12)
for x_pos, y, qt in zip([0.6, 0.9, 1], t_data['Multiple Views(initial)'], initial_labels):
    if x_pos < 1:
        plt.text(x_pos, y + 200, qt, fontsize=12)
    else:
        # this bar runs past the lower panel's limit; pin its label at a fixed height
        plt.text(x_pos, 4000, qt, fontsize=12)
for x_pos, y, qt in zip([1.6, 1.9, 2], t_data['Multiple Views(last)'], last_labels):
    plt.text(x_pos, y + 500, qt, fontsize=12)
ax1.set_ylabel('time period (second)')
ax2.set_ylabel('time period (second)')
ax1.set_title('Time taken to make purchase decision', pad=50, fontsize=20)
plt.show()

### In case of single-view transactions, they generally take around 3 to 11 minutes to reach the end of buying journey (view to transaction). On the other hand, multiple-view transactions showed more dispersed time range, about 9 minutes to 9.5 hours with 30 minutes as a median. However, when multiple-view buyers checked the product for the last time before purchase, they took similar short amount of time to single-view buyers.

## Q3. Is there any relationship between Item Category and Transaction?

In [None]:
# load item-related datasets
cate=pd.read_csv('../datasets/ecommerce-dataset/category_tree.csv')
item1=pd.read_csv('../datasets/ecommerce-dataset/item_properties_part1.csv')
item2=pd.read_csv('../datasets/ecommerce-dataset/item_properties_part2.csv')

In [None]:
# view category tree dataset
cate.head()

In [None]:
len(cate.categoryid.unique())

In [None]:
len(cate.parentid.unique())

### 'Category ID' is a subset or child of 'Parent Id'.

In [None]:
cate.info()

In [None]:
# view item dataset 1
item1.head()

In [None]:
item1.property.value_counts()

### Only 'categoryid' and 'available' are given in plain text in the 'property' column, while the rest are hashed for confidentiality purposes. I will use only 'categoryid' and hence keep only the rows with that property.

In [None]:
# Keep only the rows of item1 that assign a category to an item.
_has_category = item1['property'] == 'categoryid'
item1 = item1.loc[_has_category].reset_index(drop=True)
item1.shape

In [None]:
# Same filter for item2: keep only the rows that assign a category to an item.
_has_category = item2['property'] == 'categoryid'
item2 = item2.loc[_has_category].reset_index(drop=True)
item2.shape

In [None]:
# drop column, timestamp and property
item1=item1.drop(columns=['timestamp','property'])
item2=item2.drop(columns=['timestamp','property'])
item1.head()

In [None]:
# stack two item dataframes
item=pd.concat([item1, item2], ignore_index=True)
item.shape

In [None]:
# Renumber the rows and rename the 'value' column, which holds the category id
# after the filtering above, to 'categoryid'.
item = item.reset_index(drop=True).rename(columns={'value': 'categoryid'})
item.head()

In [None]:
# check duplitcates
item.duplicated().value_counts()

In [None]:
# drop duplicates
msk=item.duplicated()
item=item[~msk]

In [None]:
# count the number of category id by item id
(item.groupby('itemid').count().categoryid>1).value_counts()

In [None]:
# Number of category ids recorded for each item, as a flat dataframe.
num_cate = (
    item.groupby('itemid')
    .count()
    .reset_index()
    .rename(columns={'categoryid': 'num_categoryid'})
)
num_cate.head()

In [None]:
cate.info()

In [None]:
item.info()

In [None]:
item.categoryid=item.categoryid.astype(int)

In [None]:
# merge item dataframe with category tree to link item id with parent id
item=item.merge(cate, how='inner', on='categoryid')
item.head()

In [None]:
# create a dataframe with 'itemid' and 'parent id'
ip=item[['itemid','parentid']]

# check duplicates
ip.duplicated().value_counts()

In [None]:
# remove duplicates
msk=ip.duplicated()
ip=ip[~msk]

# view the number of parent ids by item id
ip.groupby('itemid').count()

In [None]:
# Number of parent ids recorded for each item, as a flat dataframe.
num_pare = (
    ip.groupby('itemid')
    .count()
    .reset_index()
    .rename(columns={'parentid': 'num_parentid'})
)

In [None]:
num_pare

In [None]:
# call dataframe that includes 'transaction' information. 
item_tra.shape

In [None]:
item_viw.reset_index(drop=True, inplace=True)

In [None]:
item_viw

In [None]:
# Keep only the first row (in current row order) of every (visitor, item) pair.
# NOTE(review): this overwrites item_viw, so from here on it holds at most one
# view per pair. Any later cell that needs *all* views (e.g. counting a
# visitor's earlier views) must not rely on item_viw.
msk=item_viw.duplicated(['visitorid','itemid'])
item_viw=item_viw[~msk]

In [None]:
# (visitor, item) pairs that were viewed but never led to a transaction.
#
# This is an anti-join: keep the view rows whose pair does not occur among the
# transactions. Stacking both frames and dropping every duplicated pair also
# kept transaction rows that had no matching view, which then ended up
# labelled as "no purchase".
_pair = ['visitorid', 'itemid']
_bought_pairs = item_tra[_pair].drop_duplicates()

_flagged = item_viw.merge(_bought_pairs, how='left', on=_pair, indicator=True)
nta = (
    _flagged.loc[_flagged['_merge'] == 'left_only']
    .drop(columns='_merge')
    .reset_index(drop=True)
)
nta

In [None]:
# Randomly select 25000 observations from the nta (no-transaction) dataframe.
# DataFrame.sample draws *without* replacement, so no row is picked twice, and
# the fixed random_state makes the sample reproducible on Restart & Run All.
# (np.random.choice defaults to sampling with replacement and was unseeded.)
_N_NON_PURCHASE = 25000
nta_r = (
    nta.sample(n=min(_N_NON_PURCHASE, len(nta)), random_state=5)
    .reset_index(drop=True)
)
nta_r.head()

In [None]:
# Label every sampled non-transaction row with purchase = 0.
nta_r['purchase'] = 0
nta_r.head()

In [None]:
nta_r.shape

In [None]:
# Label every transaction row with purchase = 1.
item_tra['purchase'] = 1
item_tra.head()

In [None]:
item_tra.reset_index(drop=True, inplace=True)

In [None]:
item_tra.shape

In [None]:
# concatenate transaction data and non-transaction data (the cases that a visitor after all didn't buy the item viewed during this data collection period)
data=pd.concat([nta_r,item_tra], ignore_index=True).sort_values('date_time').reset_index(drop=True)
data.head()

In [None]:
# merge with category id, parent id dataframe
data=data.merge(num_cate, how='inner', on='itemid')
data=data.merge(num_pare, how='inner', on='itemid')

In [None]:
# Total purchases by number of category ids. Selecting 'purchase' before
# summing avoids aggregating the id and date columns, which is wasteful for
# strings and raises for datetimes.
data.groupby('num_categoryid')['purchase'].sum()

In [None]:
# Total purchases by number of parent ids. Selecting 'purchase' before
# summing avoids aggregating the id and date columns, which is wasteful for
# strings and raises for datetimes.
data.groupby('num_parentid')['purchase'].sum()

### I had assumed that the more category IDs or parent IDs an item has, the more transactions it might have, as the item could have been exposed on multiple category pages. However, when I checked the distribution of the data, there were not many items with multiple categories.

In [None]:
# Pick the items that belong to exactly one category and total their sales
# per category.
cate1 = data[data.num_categoryid == 1]
# One row per (item, category), so the merge cannot multiply observations.
_item_category = item[['itemid', 'categoryid']].drop_duplicates()
cate1 = cate1.merge(_item_category, how='inner', on=['itemid'])
# Sum only 'purchase'; summing the whole frame would also aggregate the id and
# date columns.
cate1_sales = cate1.groupby('categoryid')['purchase'].sum()

# plot the sales number along the category id.
pd.DataFrame(cate1_sales).plot()
plt.ylabel('purchase')
plt.show()

### There is no general trend in the number of sales along category id.

In [None]:
# Pick the items that belong to exactly one parent id and total their sales
# per parent id.
pare1 = data[data.num_parentid == 1]
# `item` has one row per (item, category). An item whose categories share a
# parent would appear several times and its purchases would be counted more
# than once, so reduce to unique (item, parent) pairs before merging.
_item_parent = item[['itemid', 'parentid']].drop_duplicates()
pare1 = pare1.merge(_item_parent, how='inner', on=['itemid'])
# Sum only 'purchase'; summing the whole frame would also aggregate the id and
# date columns.
pare1_sales = pare1.groupby('parentid')['purchase'].sum()

# plot the sales number along the parent id.
pd.DataFrame(pare1_sales).plot()
plt.ylabel('purchase')
plt.show()

### There is no general trend in the number of sales along parent id.

In [None]:
# the item distribution by cateogory id
item.categoryid.hist()
plt.show()

In [None]:
# the item distribution by parent id
item.parentid.hist()
plt.show()

### There was no clear relationship between category and transaction (sales). 

## Q4. Is there any relationship between View Time and Transaction?

In [None]:
# sort data by date time an in ascending order. 
data=data.sort_values('date_time').reset_index(drop=True)
data.head()

In [None]:
# convert data type of the column, date_time to use date_time functions.
data.date_time=pd.to_datetime(data.date_time)
data.info()

In [None]:
# extract days of week of each date and add them into a new column.
data['dayofweek']=data.date_time.dt.dayofweek

In [None]:
# count the number of date by day of week.
data.groupby('dayofweek').count()

In [None]:
# view how many purchase there are by day of week.
data[data.purchase==1].groupby('dayofweek').count()

In [None]:
# Compare the number of purchase rows with the number of non-purchase rows by
# day of week. .size() gives exactly one count per day; plotting the whole
# .count() frame drew one line per column, and the two legend entries were
# then attached to the first two of those lines.
no_purchase_by_day = data[data.purchase == 0].groupby('dayofweek').size()
purchase_by_day = data[data.purchase == 1].groupby('dayofweek').size()

plt.plot(no_purchase_by_day, color='red')
plt.plot(purchase_by_day, color='blue')
plt.legend(['No purchase', 'Purchase'], labelcolor=['red', 'blue'])
plt.show()

In [None]:
# extract hour and week number and add them to a new column, respectively
data['hour']=data.date_time.dt.hour
data['week']=data.date_time.dt.isocalendar().week 

In [None]:
# convert data type
item_viw['date_time']=pd.to_datetime(item_viw['date_time'])
item_viw.info()

In [None]:
# convert data type
item_tra['date_time']=pd.to_datetime(item_tra['date_time'])
item_tra.info()

In [None]:
# Count, for every row of `data`, how many times that visitor had viewed that
# item strictly before the row's timestamp.
#
# The views are taken from e_df again because item_viw was reduced to one row
# per (visitor, item) pair in an earlier cell, which capped this count at 1.
# A merge plus a bincount replaces the per-row Python loop that scanned the
# whole view table on every iteration.
# NOTE(review): non-purchase rows carry a view timestamp while purchase rows
# carry the transaction timestamp, so this feature is measured at different
# points of the journey for the two classes - confirm this is intended.
_pair = ['visitorid', 'itemid']
_all_views = (
    e_df.loc[e_df['event'] == 'view', _pair + ['date_time']]
    .rename(columns={'date_time': 'view_time'})
)

_targets = data[_pair + ['date_time']].copy()
_targets['date_time'] = pd.to_datetime(_targets['date_time'])
_targets['_row'] = np.arange(len(_targets))  # positional id of each data row

# Every (data row, view) combination of the same visitor and item.
_candidates = _targets.merge(_all_views, how='inner', on=_pair)
_candidates['view_time'] = pd.to_datetime(_candidates['view_time'])
_is_earlier = _candidates['view_time'] < _candidates['date_time']

# Number of earlier views per data row; rows without any get 0.
# The name `list` is kept because the next cell reads it (it shadows the builtin).
list = np.bincount(
    _candidates.loc[_is_earlier, '_row'].to_numpy(),
    minlength=len(_targets),
).tolist()
list[0:10]

In [None]:
# Add the per-row view counts from the previous cell as the column 'previous_view'.
# The counts are wrapped in a DataFrame, so the assignment aligns on the index:
# the 0..n-1 index of the new frame is matched against data's index.
data['previous_view']=pd.DataFrame({'view_count':list})
data.head()

In [None]:
# Count, for every row of `data`, how many transactions that visitor had made
# strictly before the row's timestamp.
# A merge plus a bincount replaces the per-row Python loop (which filtered the
# whole transaction table on every iteration) and its redundant
# `len(row) == 0` branch.
_targets = data[['visitorid', 'date_time']].copy()
_targets['date_time'] = pd.to_datetime(_targets['date_time'])
_targets['_row'] = np.arange(len(_targets))  # positional id of each data row

_transactions = item_tra[['visitorid', 'date_time']].rename(columns={'date_time': 'tra_time'})

# Every (data row, transaction) combination of the same visitor.
_candidates = _targets.merge(_transactions, how='inner', on='visitorid')
_candidates['tra_time'] = pd.to_datetime(_candidates['tra_time'])
_is_earlier = _candidates['tra_time'] < _candidates['date_time']

# Number of earlier transactions per data row; rows without any get 0.
list2 = np.bincount(
    _candidates.loc[_is_earlier, '_row'].to_numpy(),
    minlength=len(_targets),
).tolist()
list2[0:10]

In [None]:
# Add the per-row counts from the previous cell as the column 'previous_transaction'.
# The counts are wrapped in a DataFrame, so the assignment aligns on the index:
# the 0..n-1 index of the new frame is matched against data's index.
data['previous_transaction']=pd.DataFrame({'previous_transaction':list2})

In [None]:
plt.plot(data[['dayofweek','purchase']].groupby('dayofweek').sum())
plt.xticks(np.arange(7),['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], rotation=45)
plt.ylabel('Purchase')
plt.show()

In [None]:
# Number of purchases by hour of day.
plt.plot(data[['hour', 'purchase']].groupby('hour').sum())
plt.xlabel('Hour')  # was set via ylabel and immediately overwritten by the next line
plt.ylabel('Purchase')
plt.show()

In [None]:
plt.plot(data[['week','purchase']].groupby('week').sum())
plt.xlabel('Week')
plt.ylabel('Purchase')
plt.show()

In [None]:
plt.plot(data[['previous_view','purchase']].groupby('previous_view').sum())
plt.xlabel('Number of previous item view (current)')
plt.ylabel('Purchase')
plt.show()

In [None]:
plt.plot(data[['previous_transaction','purchase']].groupby('previous_transaction').sum())
plt.xlim(0,5)
plt.xlabel("Number of a visitor's previous transaction (total)")
plt.ylabel('Purchase')
plt.show()

### Chosen features: dayofweek, hour, previous view, previous transaction

## 2. Machine Learning Classification Modeling

In [None]:
from sklearn import preprocessing 

In [None]:
# convert features to numpy array
X=data[['dayofweek','hour','previous_view','previous_transaction']].values
X[0:5]

In [None]:
# convert a target values to numpy array
y=data['purchase'].values
y[0:5]

## Step 01. Normalize data

In [None]:
# Standardise every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split in the next step, so test-set statistics influence the scaling.
# Consider fitting on the training rows only and applying that fit to the test rows.
X=preprocessing.StandardScaler().fit(X).transform(X.astype(float))
X[0:5]

## Step 02. Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)
print('Train Set: ', X_train.shape, y_train.shape)
print('Test Set: ', X_test.shape, y_test.shape)

## Step 03-1. K-Nearest Neighbors Algorithm

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
# Fit one K-nearest-neighbours model for every K in 1..Ks-1 and record its
# test accuracy and the standard error of that accuracy.
Ks = 10
mean_acc = np.zeros(Ks - 1)  # accuracy per K (nine entries)
std_acc = np.zeros(Ks - 1)   # standard error per K

for n in range(1, Ks):
    slot = n - 1

    # train the model with n neighbours and predict the test set
    clf = KNeighborsClassifier(n_neighbors=n)
    clf = clf.fit(X_train, y_train)
    yhat = clf.predict(X_test)

    hits = (yhat == y_test)  # True where the prediction is correct
    mean_acc[slot] = metrics.accuracy_score(y_test, yhat)
    std_acc[slot] = np.std(hits) / np.sqrt(yhat.shape[0])

In [None]:
# Accuracy for K = 1 .. Ks-1 (green line) with a band of one standard error.
plt.plot(range(1, Ks), mean_acc, 'g')
plt.fill_between(range(1, Ks), (mean_acc) - (1 * std_acc), (mean_acc) + (1 * std_acc), alpha=0.10)
# The band drawn above is +/- 1 x std, so the legend must say so.
plt.legend(('Accuracy', '+/- 1xstd'))
plt.ylabel('Accuracy')
plt.xlabel('Number of Neighbors(K)')
plt.tight_layout()
plt.show()

In [None]:
n=7
KNN_7=KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)

### The best model is when k value is 7 with 92% of model accuracy.

## Step 03-2. Decision Tree Algorithm

In [None]:
from sklearn.tree import DecisionTreeClassifier
PurchaseTree=DecisionTreeClassifier(criterion='entropy', max_depth=4)
PurchaseTree.fit(X_train, y_train)
predTree=PurchaseTree.predict(X_test)
print(predTree[0:5])
print(y_test[0:5])

In [None]:
# evaluation

print("Decision Tree's Accuracy: {0:.2f}%".format(metrics.accuracy_score(y_test, predTree)*100))

## Step 03-3. Logistic Regression Algorithm

In [None]:
from sklearn.linear_model import LogisticRegression

# 'c' parameter = inverse of regularization (the smaller, the stronger regularization)
LR=LogisticRegression(C=0.01, solver='liblinear').fit(X_train, y_train)
yhat=LR.predict(X_test)
yhat_proba=LR.predict_proba(X_test)
yhat_proba[0:10]

In [None]:
# evaluation

# visualize 'Confusion Matrix'
import itertools

def plot_confusion_matrix (cm, classes,
                         normalize=False,
                         title='Confusion matrix',
                         cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        : 2-D array of counts, indexed [true label, predicted label].
    classes   : tick labels, in the same order as the rows/columns of cm.
    normalize : when True, divide every row by its sum before printing/plotting.
    title     : title of the plot.
    cmap      : matplotlib colormap used for the cells.
    """

    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print('Normalized confusion matrix')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Write each cell's value on top of it, in white on dark cells and
    # black on light cells.
    cell_format = '.2f' if normalize else 'd'
    midpoint = cm.max() / 2.
    for row, col in np.ndindex(cm.shape[0], cm.shape[1]):
        value = cm[row, col]
        text_colour = 'white' if value > midpoint else 'black'
        plt.text(col, row, format(value, cell_format),
                 horizontalalignment='center',
                 color=text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# compute confusion matrix
cnf_matrix=metrics.confusion_matrix(y_test, yhat, labels=[1,0])
np.set_printoptions(precision=2)

# plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['Purchase=1','No Purchase=0'], normalize=False, title='Confusion matrix')


# classification report
print('\nClassification Report\n',metrics.classification_report(y_test, yhat))


# jaccard score
print('Jaccard Score: ', metrics.jaccard_score(y_test,yhat))

# logloss
print('\nLogloss: ',metrics.log_loss(y_test, yhat_proba))
# more ideal classifiers have progressively smaller values of log loss.
