# Task 3. Semi-Supervised Learning (15 points)
Assume you have training data in which the `Revenue` attribute is available only for records from June—September. For all records from October—December, however, the `Revenue` attribute is missing. Build a semi-supervised self-labeling model to estimate `Revenue` for the missing records in October—December, and then fit your classifier. Report classification performance on the February—March data set with and without the self-labeled data.

1. Without considering the records from October—December, report the classification performance on the test data.
2. After using the self-labeled data and the training data together, does the classification performance on the test data improve? Discuss which metrics are most important for your conclusion.

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the online-shoppers sessions table and preview the first five rows.
DATA_PATH = 'dat/online_shoppers_intention.csv.gz'
df = pd.read_csv(DATA_PATH)
df.head(n=5)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [3]:
# Encode month names as calendar numbers (Jan -> 1 ... Dec -> 12).
# Note that June is spelled out in full in this list, unlike the other months.
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_str2num = dict(zip(month_names, range(1, len(month_names) + 1)))
df['Month'] = df['Month'].replace(month_str2num)
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,2,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,2,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,2,3,3,1,4,Returning_Visitor,True,False


In [4]:
# Encode visitor type as an integer code:
# Returning_Visitor -> 1, New_Visitor -> 2, Other -> 3.
vis_str2num = {
    s: i+1 for i, s in enumerate(['Returning_Visitor', 'New_Visitor', 'Other'])
}
# Applied exactly once: after this pass the column holds only integer codes,
# so repeating the replacement would find no string keys left to map.
df['VisitorType'] = df['VisitorType'].replace(vis_str2num)

In [5]:
# Turn the boolean purchase flag into a 0/1 target.
revenue_codes = {True: 1, False: 0}
df['Revenue'] = df['Revenue'].replace(revenue_codes)
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,1,1,1,1,1,False,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,2,2,2,1,2,1,False,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,4,1,9,3,1,False,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,2,3,2,2,4,1,False,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,2,3,3,1,4,1,True,0


In [6]:
# Turn the boolean weekend flag into a 0/1 feature.
weekend_codes = {True: 1, False: 0}
df['Weekend'] = df['Weekend'].replace(weekend_codes)
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,1,1,1,1,1,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,2,2,2,1,2,1,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,4,1,9,3,1,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,2,3,2,2,4,1,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,2,3,3,1,4,1,1,0


In [7]:
# Sanity check: (rows, columns) of the fully encoded frame before splitting by month.
df.shape

(12330, 18)

In [8]:
# Partition the sessions by calendar month (Month is already numeric here).
month_col = df['Month']

# June-September: rows whose Revenue label is kept
df_lab = df[month_col.isin([6, 7, 8, 9])]

# October-December: rows treated as unlabeled, so Revenue is blanked out
df_unlab = df[month_col.isin([10, 11, 12])].assign(Revenue=np.nan)

# February-March: held-out test rows
df_test = df[month_col.isin([2, 3])]

In [9]:
# Row/column counts of the labeled, unlabeled and test partitions, in that order.
for frame in (df_lab, df_unlab, df_test):
    print(frame.shape)

(1601, 18)
(5274, 18)
(2091, 18)


In [10]:
# Separate the feature columns from the Revenue target in every partition.
target = 'Revenue'

X_train_lab = df_lab.drop(columns=target)
y_train_lab = df_lab[target]

# For the unlabeled partition the target column is all-NaN by construction.
X_train_unlab = df_unlab.drop(columns=target)
y_train_unlab = df_unlab[target]

X_test = df_test.drop(columns=target)
y_test = df_test[target]

In [11]:
# Summarize the size of each feature matrix / target vector pair.
set_summaries = [
    ('Labeled Train Set:', X_train_lab, y_train_lab),
    ('Unlabeled Train Set:', X_train_unlab, y_train_unlab),
    ('Test Set:', X_test, y_test),
]
for set_name, features, labels in set_summaries:
    print(set_name, features.shape, labels.shape)

Labeled Train Set: (1601, 17) (1601,)
Unlabeled Train Set: (5274, 17) (5274,)
Test Set: (2091, 17) (2091,)


In [12]:
# Confirm the unlabeled target really carries no labels (expect 0 non-null values).
y_train_unlab.info()

<class 'pandas.core.series.Series'>
Int64Index: 5274 entries, 5455 to 12329
Series name: Revenue
Non-Null Count  Dtype  
--------------  -----  
0 non-null      float64
dtypes: float64(1)
memory usage: 82.4 KB


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

from sklearn.metrics import (
    confusion_matrix,
    accuracy_score as accuracy,
    recall_score as recall,
    roc_curve, roc_auc_score,
    precision_score as precision,
    f1_score
)

In [14]:
# Baseline preprocessing: learn the standardization statistics from the labeled
# training rows only, then apply the same transform to train and test features.
scaler = StandardScaler().fit(X_train_lab)
X_train = scaler.transform(X_train_lab)
X_test_0 = scaler.transform(X_test)

In [15]:
%%time
lr_model = LogisticRegression(class_weight='balanced')
lr_model.fit(X_train, y_train_lab)
lr_pred = lr_model.predict(X_test_0)

CPU times: user 49.1 ms, sys: 129 ms, total: 179 ms
Wall time: 14.5 ms


In [16]:
# Test-set metrics for the labeled-only baseline.
baseline_metrics = [
    ("accuracy:", accuracy),
    ("precision:", precision),
    ("recall:", recall),
    ("f1 score:", f1_score),
]
for metric_label, metric_fn in baseline_metrics:
    print(metric_label, metric_fn(y_test, lr_pred))

# Rows are true classes, columns are predicted classes.
print("confusion matrix:")
print(confusion_matrix(y_test, lr_pred))

accuracy: 0.9488283118125299
precision: 0.6981981981981982
recall: 0.7948717948717948
f1 score: 0.7434052757793764
confusion matrix:
[[1829   67]
 [  40  155]]


In [17]:
from sklearn.semi_supervised import LabelPropagation

# Stack the labeled and unlabeled feature rows into one training matrix.
X_train_mixed = pd.concat([X_train_lab, X_train_unlab])

# -1 is the "no label" placeholder for every unlabeled row.
nolabel = [-1] * len(y_train_unlab)

# Recombine the training labels. The placeholder Series reuses the unlabeled
# rows' own index so that y_train_mixed stays aligned row-for-row with
# X_train_mixed; a default 0..n-1 index would not match the feature index.
y_train_mixed = pd.concat(
    [y_train_lab, pd.Series(nolabel, index=y_train_unlab.index, dtype='int64')]
)

In [18]:
# Re-learn the standardization statistics on labeled + unlabeled features.
# This rebinds `scaler`, replacing the earlier labeled-only scaler.
scaler = StandardScaler().fit(X_train_mixed)
X_train_mixed_0 = scaler.transform(X_train_mixed)
# NOTE(review): X_test_0 still holds test features scaled by the earlier
# labeled-only scaler. A model trained on X_train_mixed_0 should be scored on
# scaler.transform(X_test) so train and test share one feature space.

In [19]:
%%time
lr_model = LogisticRegression(class_weight='balanced')
lr_model.fit(X_train_mixed_0, y_train_mixed)
lr_pred_0 = lr_model.predict(X_test_0)

CPU times: user 1.1 s, sys: 3.51 s, total: 4.61 s
Wall time: 323 ms


In [20]:
# Test-set metrics for the model trained on the mixed (labeled + unlabeled) set.
mixed_metrics = [
    ("accuracy:", accuracy),
    ("precision:", precision),
    ("recall:", recall),
    ("f1 score:", f1_score),
]
for metric_label, metric_fn in mixed_metrics:
    print(metric_label, metric_fn(y_test, lr_pred_0))

# Rows are true classes, columns are predicted classes.
print("confusion matrix:")
print(confusion_matrix(y_test, lr_pred_0))

accuracy: 0.9406982305117169
precision: 0.6392156862745098
recall: 0.8358974358974359
f1 score: 0.7244444444444443
confusion matrix:
[[1804   92]
 [  32  163]]


In [21]:
from sklearn.metrics import classification_report

In [23]:
# Per-class precision/recall/F1 for both models, printed one after the other
# with a blank line in between.
report_runs = [
    ("Performance report on labels only training", lr_pred),
    ("Performance report on labels and self-labels training", lr_pred_0),
]
for run_idx, (report_title, predictions) in enumerate(report_runs):
    if run_idx:
        print()
    print(report_title)
    print(classification_report(y_test, predictions))

Performance report on labels only training
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      1896
           1       0.70      0.79      0.74       195

    accuracy                           0.95      2091
   macro avg       0.84      0.88      0.86      2091
weighted avg       0.95      0.95      0.95      2091

Performance report on labels and self-labels training
              precision    recall  f1-score   support

           0       0.98      0.95      0.97      1896
           1       0.64      0.84      0.72       195

    accuracy                           0.94      2091
   macro avg       0.81      0.89      0.85      2091
weighted avg       0.95      0.94      0.94      2091

