# ONLINE USER PURCHASE PREDICTION

This is my exploration of an online user data and trying to build a classification prediction model that can determine if the use is likely to make a puchase or not. 

Let's begin 

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report

Importing the data and exploring the data

In [2]:
df = pd.read_csv('shopping.csv')
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


### Data Cleaning and transformation

Le's see what the data entails

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

Month is an object. Le's try recasting the months into their respective numerical values

In [4]:
unique_Month = df['Month'].unique()

print(unique_Month)

['Feb' 'Mar' 'May' 'Oct' 'June' 'Jul' 'Aug' 'Nov' 'Sep' 'Dec']


In [5]:
month_mapping = {
    'Jan': 0, 'Feb': 1, 'Mar': 2, 'Apr': 3,
    'May': 4, 'June': 5, 'Jul': 6, 'Aug': 7,
    'Sep': 8, 'Oct': 9, 'Nov': 10, 'Dec': 11
}

df['Month'] = df['Month'].map(month_mapping)
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,1,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,1,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,1,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,1,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,1,3,3,1,4,Returning_Visitor,True,False


Let's recast True or False values into 1 and 0

In [6]:
true_or_false = {True : 1, False : 0}

df['Weekend'] = df['Weekend'].map(true_or_false)
df['Revenue'] = df['Revenue'].map(true_or_false)

df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,1,1,1,1,1,Returning_Visitor,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,1,2,2,1,2,Returning_Visitor,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,1,4,1,9,3,Returning_Visitor,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,1,3,2,2,4,Returning_Visitor,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,1,3,3,1,4,Returning_Visitor,1,0


Let's recast VistorType to int.

In [7]:
unique_categories = df['VisitorType'].unique()

print(unique_categories)

['Returning_Visitor' 'New_Visitor' 'Other']


In [8]:
visitor = {'Returning_Visitor' : 1, 'New_Visitor' : 0, 'Other' : 0}

df['VisitorType'] = df['VisitorType'].map(visitor)
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,1,1,1,1,1,1,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,1,2,2,1,2,1,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,1,4,1,9,3,1,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,1,3,2,2,4,1,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,1,3,3,1,4,1,1,0


In [9]:
labels = df['Revenue']
new_df = df.drop(columns=['Revenue'])
new_df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,1,1,1,1,1,1,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,1,2,2,1,2,1,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,1,4,1,9,3,1,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,1,3,2,2,4,1,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,1,3,3,1,4,1,1


In [10]:
new_df.shape

(12330, 17)

Now let's split up the data for training.

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    new_df, labels, test_size=0.4
)

### Building our K-Nearest Neighbors Classifier model 

In [30]:
model = KNeighborsClassifier()
model.fit(X_train, y_train)

Let's predict how the model fared on the training data

In [57]:
y_train_predict = model.predict(X_train)
accuracy = accuracy_score(y_train, y_train_predict)

f'Model accuracy on training data = {accuracy * 100:.2f}%'

'Model accuracy on training data = 89.06%'

In [31]:
predictions = model.predict(X_test)

In [58]:
accuracy = accuracy_score(y_test, predictions)
f'Model accuracy on test set = {accuracy * 100:.2f}%'

'Model accuracy on test set = 86.66%'

This shows that the model is 86.66% accurate in it's predictions for data it hasn't seen before. But, can it accurately predict whether a person visiting the site will make a purchase or not. let's find out. 

### Sensitivity And Specificity 

In [21]:
def evaluation_score(labels, predictions):  
    sensitivity = 0.0
    specificity = 0.0


    #Get the number of positive values and negative values
    numNeg, numPos = labels.value_counts()

    #Go through labels and predictions to calculate sentivity and specificity
    for label, prediction in zip(labels, predictions):
        if prediction == label and label == 1:
            sensitivity = sensitivity + 1
        elif prediction == label and label == 0:
            specificity = specificity + 1
        else:
            continue

    #Divide by length so both sensitivty and specificty are between 0 to 1
    sensitivity = sensitivity / numPos
    specificity = specificity / numNeg 

    return sensitivity, specificity

In [55]:
sensitive, specific = evaluation_score(y_test, predictions)
f'Sensitivity score = {sensitive * 100:.2f}% and Specificity score = {specific * 100:.2f}%'

'Sensitivity score = 29.13% and Specificity score = 96.78%'

This shows that our model is 29.13% sensitive, and 96.78% specific.

## Let's build a function to select the best  fine-tuning parameters

We can accomplish this by finding the number of neighbors the model should build. Logically, one can say two neighbors. Let's see this in action. 

In [86]:
def model_selector(num, X_train, y_train, X_test, y_test):
    predictions = []
    
    for i in range(num):
        training_num = i + 1
        modelname = 'model' + str(training_num)
        model = KNeighborsClassifier(n_neighbors=training_num)
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        
        
        model_accuracy = accuracy_score(y_test, y_pred)
        sens, spec = evaluation_score(y_test, y_pred)
        
        model_dict = {modelname: [model_accuracy, sens, spec]}
        predictions.append(model_dict)
        
    return predictions
        
        

In [87]:
pred_selector = model_selector(7, X_train, y_train, X_test, y_test)

In [88]:
pred_selector

[{'model1': [0.8242092457420924, 0.38346883468834686, 0.9017644253695756]},
 {'model2': [0.8637469586374696, 0.22357723577235772, 0.9763948497854077]},
 {'model3': [0.8570559610705596, 0.33875338753387535, 0.9482594182164997]},
 {'model4': [0.870235198702352, 0.24525745257452575, 0.980209823557463]},
 {'model5': [0.8665855636658556, 0.29132791327913277, 0.9678111587982833]},
 {'model6': [0.870235198702352, 0.22086720867208673, 0.9845016690510253]},
 {'model7': [0.8653690186536902, 0.2588075880758808, 0.9721030042918455]}]

Hmmm. As it turned out, using 1 neigbhor produces the best sensitivity predictor. So now let's build our model with n_neighbors=1

In [89]:
best_model = KNeighborsClassifier(n_neighbors=1)
best_model.fit(X_train, y_train)

In [90]:
y_best_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_best_pred)
f'Accuracy score for our best model = {accuracy * 100:.2f}%'

'Accuracy score for our best model = 82.42%'

In [91]:
sensitive, specific = evaluation_score(y_test, y_best_pred)
f'Sensitivity score = {sensitive * 100:.2f}% and Specificity score = {specific * 100:.2f}%'

'Sensitivity score = 38.35% and Specificity score = 90.18%'

# In the end, this is our best model's predictions:
### Sensitivity = 38.35%
### Specificity = 90.18%

## Let's try to build a Support Vector Machine SVM for training purposes. 

In [92]:
model = SVC(kernel='linear')  # You can use 'linear', 'poly', 'rbf', etc.
model.fit(X_train, y_train)


In [99]:
model.score(X_test, y_test)

0.8911192214111923

In [97]:
y_pred = model.predict(X_test)

In [100]:
sensitive, specific = evaluation_score(y_test, y_pred)
f'Sensitivity score = {sensitive * 100:.2f}% and Specificity score = {specific * 100:.2f}%'

'Sensitivity score = 38.48% and Specificity score = 98.02%'

### Without any fine-tuning, this SVM model performs quite impressively. 

In [95]:
report = classification_report(y_test, y_pred)
report

'              precision    recall  f1-score   support\n\n           0       0.90      0.98      0.94      4194\n           1       0.77      0.38      0.51       738\n\n    accuracy                           0.89      4932\n   macro avg       0.84      0.68      0.73      4932\nweighted avg       0.88      0.89      0.88      4932\n'