# Machine Learning using Random Forest

### 1) install and import libraries

In [4]:
!pip install scikit-learn
!pip install --upgrade pixiedust

Collecting pixiedust
  Downloading pixiedust-1.1.19.tar.gz (197 kB)
[K     |████████████████████████████████| 197 kB 25.8 MB/s eta 0:00:01
[?25hCollecting geojson
  Downloading geojson-2.5.0-py2.py3-none-any.whl (14 kB)
Collecting colour
  Downloading colour-0.1.5-py2.py3-none-any.whl (23 kB)
Building wheels for collected packages: pixiedust
  Building wheel for pixiedust (setup.py) ... [?25ldone
[?25h  Created wheel for pixiedust: filename=pixiedust-1.1.19-py3-none-any.whl size=321803 sha256=799a90aac2095c42e5745f59d81111056dc7ef265d5841df6aa842ae4efee232
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/a1/8f/a3/25f8e4f230bbbcc575afff7e6f13a90dd01c84cf03781af8f8
Successfully built pixiedust
Installing collected packages: geojson, colour, pixiedust
Successfully installed colour-0.1.5 geojson-2.5.0 pixiedust-1.1.19


In [5]:
import pixiedust
import sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.io import arff

Pixiedust database opened successfully
Table VERSION_TRACKER created successfully
Table METRICS_TRACKER created successfully

Share anonymous install statistics? (opt-out instructions)

PixieDust will record metadata on its environment the next time the package is installed or updated. The data is anonymized and aggregated to help plan for future releases, and records only the following values:

{
   "data_sent": currentDate,
   "runtime": "python",
   "application_version": currentPixiedustVersion,
   "space_id": nonIdentifyingUniqueId,
   "config": {
       "repository_id": "https://github.com/ibm-watson-data-lab/pixiedust",
       "target_runtimes": ["Data Science Experience"],
       "event_id": "web",
       "event_organizer": "dev-journeys"
   }
}
You can opt out by calling pixiedust.optOut() in a new cell.


[31mPixiedust runtime updated. Please restart kernel[0m
Table USER_PREFERENCES created successfully
Table service_connections created successfully




### 2) Read, visualise and understand CSV data

In [6]:
GITHUB_DATA_CSV = 'https://github.com/ibm-best-team/shopper-intention/raw/master/refined_cleansed_data.csv'

raw_df = pixiedust.sampleData(GITHUB_DATA_CSV)

Downloading 'https://github.com/ibm-best-team/shopper-intention/raw/master/refined_cleansed_data.csv' from https://github.com/ibm-best-team/shopper-intention/raw/master/refined_cleansed_data.csv
Downloaded 1617956 bytes
Creating pandas DataFrame for 'https://github.com/ibm-best-team/shopper-intention/raw/master/refined_cleansed_data.csv'. Please wait...
Loading file using 'pandas'
Successfully created pandas DataFrame for 'https://github.com/ibm-best-team/shopper-intention/raw/master/refined_cleansed_data.csv'


In [8]:
display(raw_df)

Admin_Pages_Visited,Admin_Pages_Visited_Duration,Site_Info_Pages_Visited,Site_Info_Pages_Visited_Duration,Product_Pages_Visited,Product_Pages_Visited_Duration,Average_Bounce_Rate,Average_Exit_Rates,Average_Page_Value,Proximity_To_Special_Day,Month,OS_Type,Browser_Type,Region,Traffic_Type,Visitor_Type,Weekend,Sales_Conversion,Special_Day,New_Returning_Others
0,0.0,0,0.0,2,54.0,0.05,0.15,0.0,0.0,May,1,1,1,1,Returning_Visitor,0,0,0,1
0,0.0,0,0.0,2,61.0,0.0,0.05,0.0,0.0,May,3,2,3,15,Returning_Visitor,0,0,0,1
5,85.66666667,0,0.0,25,1194.5,0.0,0.007692308,11.51923077,0.0,May,2,2,1,2,New_Visitor,0,1,0,0
0,0.0,0,0.0,10,127.5,0.0,0.02,0.0,0.0,Mar,3,2,1,10,Returning_Visitor,0,0,0,1
0,0.0,0,0.0,10,421.6,0.0,0.011111111,0.0,0.0,Nov,1,1,1,4,Returning_Visitor,0,0,0,1
6,378.4666667,0,0.0,13,950.6666667,0.022222222,0.054444444,0.0,0.0,Oct,1,1,1,3,Returning_Visitor,0,0,0,1
2,79.0,0,0.0,7,823.0,0.0,0.028571429,0.0,0.0,Dec,1,1,3,2,Returning_Visitor,1,0,0,1
1,14.0,0,0.0,14,988.0625,0.014285714,0.028571429,68.84905671,0.0,Dec,2,5,9,1,Returning_Visitor,0,1,0,1
0,0.0,0,0.0,62,1875.103333,0.003278689,0.027868852,12.71367924,0.0,Sep,2,4,1,1,Returning_Visitor,0,0,0,1
3,11.66666667,0,0.0,39,557.2452381,0.028947368,0.052076023,0.0,0.6,May,3,2,3,13,Returning_Visitor,0,0,0,1


In [10]:
raw_df.shape

(12330, 20)

In [11]:
raw_df.describe()

Unnamed: 0,Admin_Pages_Visited,Admin_Pages_Visited_Duration,Site_Info_Pages_Visited,Site_Info_Pages_Visited_Duration,Product_Pages_Visited,Product_Pages_Visited_Duration,Average_Bounce_Rate,Average_Exit_Rates,Average_Page_Value,Proximity_To_Special_Day,OS_Type,Browser_Type,Region,Traffic_Type,Weekend,Sales_Conversion,Special_Day,New_Returning_Others
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,2.124006,2.357097,3.147364,4.069586,0.232603,0.154745,0.01249,0.869505
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,0.911325,1.717277,2.401591,4.025169,0.422509,0.361676,0.111062,0.35674
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,2.0,2.0,1.0,2.0,0.0,0.0,0.0,1.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,2.0,2.0,3.0,2.0,0.0,0.0,0.0,1.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157214,0.016813,0.05,0.0,0.0,3.0,2.0,4.0,4.0,0.0,0.0,0.0,1.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,13.0,9.0,20.0,1.0,1.0,1.0,2.0


In [97]:
list(raw_df.columns)

['Admin_Pages_Visited',
 'Admin_Pages_Visited_Duration',
 'Site_Info_Pages_Visited',
 'Site_Info_Pages_Visited_Duration',
 'Product_Pages_Visited',
 'Product_Pages_Visited_Duration',
 'Average_Bounce_Rate',
 'Average_Exit_Rates',
 'Average_Page_Value',
 'Proximity_To_Special_Day',
 'Month',
 'OS_Type',
 'Browser_Type',
 'Region',
 'Traffic_Type',
 'Visitor_Type',
 'Weekend',
 'Sales_Conversion',
 'Special_Day',
 'New_Returning_Others',
 'By_Month']

In [12]:
raw_df.head()

Unnamed: 0,Admin_Pages_Visited,Admin_Pages_Visited_Duration,Site_Info_Pages_Visited,Site_Info_Pages_Visited_Duration,Product_Pages_Visited,Product_Pages_Visited_Duration,Average_Bounce_Rate,Average_Exit_Rates,Average_Page_Value,Proximity_To_Special_Day,Month,OS_Type,Browser_Type,Region,Traffic_Type,Visitor_Type,Weekend,Sales_Conversion,Special_Day,New_Returning_Others
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,0,0,0,1
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,0,0,0,1
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,0,0,0,1
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,0,0,0,1
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,1,0,0,1


### 3) Data cleansing

In [22]:
all_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_conversion_dictionary = dict((v,k+1) for k,v in enumerate(all_months))

raw_df['By_Month'] = raw_df['Month'].map(month_conversion_dictionary)

### 4) Machine Learning using Random Forest

In [111]:
# features and target(s)

features = [
    'Admin_Pages_Visited',
    'Admin_Pages_Visited_Duration',
    'Site_Info_Pages_Visited',
    'Site_Info_Pages_Visited_Duration',
    'Product_Pages_Visited',
    'Product_Pages_Visited_Duration',
    'Average_Bounce_Rate',
    'Average_Exit_Rates',
    'Average_Page_Value',
    'Special_Day',
    'Weekend',
    'By_Month',
    'Region',
    'Browser_Type',
    'OS_Type',
    'Traffic_Type',
    'New_Returning_Others'
]

target = 'Sales_Conversion'

xVar = raw_df[features]
yVar = raw_df[target]

In [112]:
# split into training and testing sets
session_random_state = 49
x_train, x_test, y_train, y_test = train_test_split(xVar, yVar, test_size=0.25, random_state=session_random_state)

print (x_train.shape, y_train.shape)
print (x_test.shape, y_test.shape)

(9247, 17) (9247,)
(3083, 17) (3083,)


In [113]:
# train model
rfc = RandomForestClassifier(n_jobs=2, random_state=session_random_state)

rfc.fit(x_train, y_train)

RandomForestClassifier(n_jobs=2, random_state=49)

In [114]:
# create confusion matrix to gut check model
predictions = rfc.predict(x_test)

pd.crosstab(y_test, predictions, rownames=['Actual Result'], colnames=['Predicted Result'])

Predicted Result,0,1
Actual Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2526,87
1,209,261


In [115]:
# Calculate the absolute errors
errors = abs(predictions - y_test)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 0.1 degrees.


In [116]:
# Get numerical feature importances
importances = list(rfc.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(features, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:40} Importance: {:0.4f}'.format(*pair)) for pair in feature_importances];

Variable: Average_Page_Value                       Importance: 0.3800
Variable: Average_Exit_Rates                       Importance: 0.1000
Variable: Product_Pages_Visited_Duration           Importance: 0.0900
Variable: Product_Pages_Visited                    Importance: 0.0700
Variable: Admin_Pages_Visited_Duration             Importance: 0.0600
Variable: Average_Bounce_Rate                      Importance: 0.0500
Variable: Admin_Pages_Visited                      Importance: 0.0400
Variable: By_Month                                 Importance: 0.0400
Variable: Site_Info_Pages_Visited_Duration         Importance: 0.0300
Variable: Region                                   Importance: 0.0300
Variable: Traffic_Type                             Importance: 0.0300
Variable: Site_Info_Pages_Visited                  Importance: 0.0200
Variable: Browser_Type                             Importance: 0.0200
Variable: OS_Type                                  Importance: 0.0200
Variable: Weekend   