In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import plot_roc_curve, roc_curve, auc, accuracy_score, 
from sklearn.preprocessing import MinMaxScaler, StandardScaler, FunctionTransformer, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC

%matplotlib inline

In [2]:
# read the data
df = pd.read_csv("./data/advertising.csv")
df.head(3)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0


In [5]:
# Let us add a column than represents a length of Ad text
# which can be an impactful factor
df["TopicLength"] = df["Ad Topic Line"].apply(len)

In [6]:
# there are no missing data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   int64  
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Ad Topic Line             1000 non-null   object 
 5   City                      1000 non-null   object 
 6   Male                      1000 non-null   int64  
 7   Country                   1000 non-null   object 
 8   Timestamp                 1000 non-null   object 
 9   Clicked on Ad             1000 non-null   int64  
 10  TopicLength               1000 non-null   int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 86.1+ KB


In [7]:
# explore numerical data exploration shows
# that numerical data should be scaled
df.describe()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad,TopicLength
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,65.0002,36.009,55000.00008,180.0001,0.481,0.5,33.394
std,15.853615,8.785562,13414.634022,43.902339,0.499889,0.50025,5.741928
min,32.6,19.0,13996.5,104.78,0.0,0.0,17.0
25%,51.36,29.0,47031.8025,138.83,0.0,0.0,29.0
50%,68.215,35.0,57012.3,183.13,0.0,0.5,33.0
75%,78.5475,42.0,65470.635,218.7925,1.0,1.0,37.0
max,91.43,61.0,79484.8,269.96,1.0,1.0,55.0


In [8]:
superFlow = ColumnTransformer([
    ("ScaleAgeFare", StandardScaler(), ["Daily Time Spent on Site","Age","Area Income",
                                        "Daily Internet Usage","TopicLength"]),
    ('letItBe', 'passthrough', ["Male"])
])

In [9]:
logReg = Pipeline(steps=
                  [('DataEngineering', superFlow),
                   ('Algo', LogisticRegression(max_iter=1000))
                  ])

forestGump = Pipeline(steps=
                  [('DataEngineering', superFlow),
                   ('Algo', RandomForestClassifier(max_depth=4))
                  ])

vectorSupportPoly = Pipeline(steps=
                  [('DataEngineering', superFlow),
                   ('Algo', SVC(kernel='poly', degree=7))
                  ])

vectorSupportRBF = Pipeline(steps=
                  [('DataEngineering', superFlow),
                   ('Algo', SVC(kernel='rbf', degree=7))
                  ])

vectorSupportSigmoid = Pipeline(steps=
                  [('DataEngineering', superFlow),
                   ('Algo', SVC(kernel='sigmoid', degree=7))
                  ])

In [10]:
X, y = df.drop("Clicked on Ad, Timestamp, Ad Topic Line, City, Country".split(", "), axis=1), df["Clicked on Ad"]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, stratify=y)

In [11]:
logReg.fit(X_train, y_train)
forestGump.fit(X_train, y_train)
vectorSupportPoly.fit(X_train, y_train)
vectorSupportRBF.fit(X_train, y_train)
vectorSupportSigmoid.fit(X_train, y_train)
print("Finished")

Finished


In [12]:
print("===LogisticRegration====")
print(logReg.score(X_test, y_test))
print("===RandomForest====")
print(forestGump.score(X_test, y_test))
print("===SVM (Poly)====")
print(vectorSupportPoly.score(X_test, y_test))
print("===SVM (RBF)====")
print(vectorSupportRBF.score(X_test, y_test))
print("===SVM (Sigmoid)====")
print(vectorSupportRBF.score(X_test, y_test))

===LogisticRegration====
0.955
===RandomForest====
0.955
===SVM (Poly)====
0.875
===SVM (RBF)====
0.945
===SVM (Sigmoid)====
0.945


In [13]:
predictions = logReg.predict(X_test)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.93      0.98      0.96       100
           1       0.98      0.93      0.95       100

    accuracy                           0.95       200
   macro avg       0.96      0.96      0.95       200
weighted avg       0.96      0.95      0.95       200

