In [170]:
%matplotlib inline 

import pandas as pd
import numpy as np

from pandas_profiling import ProfileReport
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, recall_score

from sklearn.decomposition import PCA
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

import seaborn as sns
import matplotlib.pyplot as plt
import pickle

# import the pickle files

In [171]:
# Load the pickled dataset from disk and preview its first rows.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
with open('df_data.pickle', mode='rb') as pickle_file:
    df_data = pickle.load(pickle_file)
df_data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated_Duration,ExitRates,PageValues,Month,Revenue,SpecialDay_0.2,...,TrafficType_3,TrafficType_4,TrafficType_5,TrafficType_6,TrafficType_7,TrafficType_8,TrafficType_9,VisitorType_Other,VisitorType_Returning_Visitor,Weekend_True
0,0,0.0,0,0.0,0.0,0.2,0.0,2,False,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0.0,0,0.0,64.0,0.1,0.0,2,False,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0.0,0,0.0,0.0,0.2,0.0,2,False,0,...,1,0,0,0,0,0,0,0,1,0
3,0,0.0,0,0.0,2.666667,0.14,0.0,2,False,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0.0,0,0.0,627.5,0.05,0.0,2,False,0,...,0,1,0,0,0,0,0,0,1,1


#### Create user-behavior clusters based on the full dataset's data on purchasing behaviour.

1. How many clusters exist? What are the main differences in cluster size and purchasing ratio between the clusters?
2. Investigate each cluster thoroughly, focusing on the features whose values vary between clusters, and pinpoint the behaviours that characterise each specific cluster.

In [172]:
# Take the data as it is before the supervised-learning transformations.
# An independent (deep) copy is used so later changes to df_clustering do not
# touch df_data.
df_clustering = df_data.copy(deep=True)
df_clustering.columns

Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated_Duration', 'ExitRates',
       'PageValues', 'Month', 'Revenue', 'SpecialDay_0.2', 'SpecialDay_0.4',
       'SpecialDay_0.6', 'SpecialDay_0.8', 'SpecialDay_1.0',
       'OperatingSystems_2', 'OperatingSystems_3', 'OperatingSystems_4',
       'OperatingSystems_5', 'OperatingSystems_6', 'OperatingSystems_7',
       'OperatingSystems_8', 'Browser_10', 'Browser_11', 'Browser_12',
       'Browser_13', 'Browser_2', 'Browser_3', 'Browser_4', 'Browser_5',
       'Browser_6', 'Browser_7', 'Browser_8', 'Browser_9', 'Region_2',
       'Region_3', 'Region_4', 'Region_5', 'Region_6', 'Region_7', 'Region_8',
       'Region_9', 'TrafficType_10', 'TrafficType_11', 'TrafficType_12',
       'TrafficType_13', 'TrafficType_14', 'TrafficType_15', 'TrafficType_16',
       'TrafficType_17', 'TrafficType_18', 'TrafficType_19', 'TrafficType_2',
       'TrafficType_20', 'TrafficType_3', 'Traffic

In [173]:
# Number of rows per value of the 'Revenue' column.
revenue_counts = df_clustering['Revenue'].value_counts()
revenue_counts

False    10137
True      1705
Name: Revenue, dtype: int64

In [174]:
#from helper functions

def return_feature_rank_from_RF(X_train, y_train, features, n_estimators=20, random_state=0):
    """Fit an extra-trees ensemble and rank features by impurity-based importance.

    Despite the ``RF`` in the name, the model fitted here is an
    ``ExtraTreesClassifier``. The ranking is printed, most important first,
    one line per feature in the form ``"<rank>. <name> (<importance>)"``.

    Parameters
    ----------
    X_train : array-like
        Training samples passed to ``forest.fit``; one column per feature.
    y_train : array-like
        Target values passed to ``forest.fit``.
    features : sequence of str
        Feature names, positionally aligned with the columns of ``X_train``.
    n_estimators : int, default 20
        Number of trees in the ensemble.
    random_state : int, default 0
        Seed forwarded to the classifier so the ranking is reproducible.

    Returns
    -------
    tuple
        ``(indices, importances)``. ``importances`` is the fitted model's
        ``feature_importances_`` in column order; ``indices`` holds the column
        positions sorted by decreasing importance.
    """
    forest = ExtraTreesClassifier(n_estimators=n_estimators, random_state=random_state)
    forest.fit(X_train, y_train)
    importances = forest.feature_importances_

    # argsort is ascending, so reverse it to get the most important first.
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for rank, column in enumerate(indices, start=1):
        print("%d. %s (%f)" % (rank, features[column], importances[column]))

    return (indices, importances)

In [175]:
# Split the target from the predictors, then calculate and print the feature
# importances in descending order of importance.
#
# `pop` removes the 'Revenue' column in place and returns it. The guard keeps
# this cell re-runnable: once the column is gone, a second run would otherwise
# fail with a KeyError (y then simply keeps its value from the first run).
if 'Revenue' in df_clustering.columns:
    y = df_clustering.pop('Revenue').values

features = list(df_clustering.columns)
X = df_clustering.values

indices, importances = return_feature_rank_from_RF(X, y, features)

Feature ranking:
1. PageValues (0.318518)
2. ProductRelated_Duration (0.092071)
3. ExitRates (0.088754)
4. Administrative_Duration (0.061387)
5. Administrative (0.061031)
6. Month (0.055146)
7. Informational_Duration (0.032119)
8. Informational (0.031637)
9. Weekend_True (0.020194)
10. Region_3 (0.018343)
11. Browser_2 (0.016280)
12. TrafficType_2 (0.015153)
13. VisitorType_Returning_Visitor (0.013480)
14. OperatingSystems_2 (0.013266)
15. Region_4 (0.012047)
16. Region_2 (0.011865)
17. Region_6 (0.010093)
18. OperatingSystems_3 (0.009425)
19. Region_7 (0.009333)
20. Browser_4 (0.007884)
21. TrafficType_4 (0.007584)
22. TrafficType_3 (0.007192)
23. Region_9 (0.006816)
24. Region_8 (0.006439)
25. OperatingSystems_4 (0.006007)
26. Browser_5 (0.005830)
27. TrafficType_8 (0.005562)
28. TrafficType_10 (0.005432)
29. TrafficType_6 (0.005235)
30. Region_5 (0.004904)
31. TrafficType_5 (0.004080)
32. TrafficType_11 (0.004072)
33. TrafficType_13 (0.003702)
34. Browser_10 (0.003334)
35. TrafficTy

##### It is reasonable to look at only features with importance > 0.030

In [176]:
# Importance cut-off: only features scoring above it are kept.
IMPORTANCE_THRESHOLD = 0.030

# Column positions (into `features` / `importances`) above the cut-off, in
# ascending order, so the selection keeps the original column order.
idx = list(np.where(importances > IMPORTANCE_THRESHOLD)[0])

# Index positionally instead of calling features.index(f) per feature: that
# lookup rescans the list every time and always returns the first match, so a
# duplicated name would be selected or dropped based on the wrong column.
selected_columns = [features[i] for i in idx]

print("Selected features:")
selected_columns

Selected features:


['Administrative',
 'Administrative_Duration',
 'Informational',
 'Informational_Duration',
 'ProductRelated_Duration',
 'ExitRates',
 'PageValues',
 'Month']