In [10]:
import numpy as np                   
import pandas as pd                   
import matplotlib.pyplot as plt  
from sklearn.linear_model import LogisticRegression     
import seaborn as sns  
import io
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [11]:
#read_csv is the function that enables you to read csv files. it is a prebuild function in pandas
dataframe = pd.read_csv("advertising.csv")

In [12]:
#We are deciding which columns are numerical and which are categorical 
numeric_columns = ['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage' ]
categorical_columns = [ 'Ad Topic Line', 'City', 'Male', 'Country', 'Clicked on Ad' ]

In [13]:
#Transforming categorical data into codes 
"""
https://pandas.pydata.org/docs/user_guide/categorical.html
"""
dataframe['City Codes']= dataframe['City'].astype('category').cat.codes
dataframe['Country Codes'] = dataframe['Country'].astype('category').cat.codes
dataframe[['City Codes','Country Codes']].head(5)

Unnamed: 0,City Codes,Country Codes
0,960,215
1,902,147
2,111,184
3,938,103
4,804,96


In [14]:

def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep]
dataframe=clean_dataset(dataframe)

In [15]:
X = dataframe.drop(labels=['Ad Topic Line','City','Country','Timestamp','Clicked on Ad'], axis=1)

Y = dataframe['Clicked on Ad']

In [16]:
#Splitting the dataset into : training set and testing set 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state = 42)

In [17]:

#training of the logistic regression model 
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, Y_train)
log_reg_pred = log_reg_model.predict(X_test)

#training of the Naive Bayes model 
nav_bayes_model = GaussianNB()
nav_bayes_model.fit(X_train, Y_train)
nav_bayes_pred = nav_bayes_model.predict(X_test)

#Training of the decision tree model 
dec_tree_model = DecisionTreeClassifier()
dec_tree_model.fit(X_train, Y_train)
dec_tree_pred = dec_tree_model.predict(X_test)



In [18]:

#Accuracy using Logistic Regression 
log_reg_accuracy = metrics.accuracy_score(log_reg_pred, Y_test)
print("Accuarcy of this LG model is: \n\n", log_reg_accuracy*100)
#the classification report for precision, recall, F1 Score and support metrics: case of Logistic Regression
print('\n Classification Report for LG: \n', metrics.classification_report(log_reg_pred, Y_test))

#Accuracy using Naive Bayes-Bernoulli 
nav_bayes_accuracy = metrics.accuracy_score(nav_bayes_pred, Y_test)
print("Accuarcy of this NB-Bernoulli model is: \n\n", nav_bayes_accuracy*100)
#the classification report for precision, recall, F1 Score and support metrics: case of Naive Bayes
print('\n Classification Report for NB: \n', metrics.classification_report(nav_bayes_pred, Y_test))

#Accuracy using Decison Tree
dec_tree_accuracy = metrics.accuracy_score(dec_tree_pred, Y_test)
print("Accuarcy of this DT model is: \n\n", dec_tree_accuracy*100)
#the classification report for precision, recall, F1 Score and support metrics: case of Decision Tree
print('\n Classification Report for DT: \n', metrics.classification_report(dec_tree_pred, Y_test))

Accuarcy of this LG model is: 

 90.9090909090909

 Classification Report for LG: 
               precision    recall  f1-score   support

           0       0.94      0.89      0.91       156
           1       0.88      0.93      0.90       130

    accuracy                           0.91       286
   macro avg       0.91      0.91      0.91       286
weighted avg       0.91      0.91      0.91       286

Accuarcy of this NB-Bernoulli model is: 

 96.5034965034965

 Classification Report for NB: 
               precision    recall  f1-score   support

           0       0.95      0.99      0.97       142
           1       0.99      0.94      0.96       144

    accuracy                           0.97       286
   macro avg       0.97      0.97      0.97       286
weighted avg       0.97      0.97      0.97       286

Accuarcy of this DT model is: 

 94.4055944055944

 Classification Report for DT: 
               precision    recall  f1-score   support

           0       0.93      

In [19]:
from sklearn.model_selection import GridSearchCV

# define the hyperparameters you want to test
hyperparameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10]
}

# perform a grid search to find the best hyperparameters
log_reg_tuned = GridSearchCV(log_reg_model, hyperparameters, cv=5)
log_reg_tuned.fit(X_train, Y_train)

# use the best hyperparameters to predict on the test set
log_reg_pred_tuned = log_reg_tuned.predict(X_test)
#Accuracy using Logistic Regression 
log_reg_accuracy = metrics.accuracy_score(log_reg_pred_tuned, Y_test)
print("Accuarcy of this LG model is: \n\n", log_reg_accuracy*100)
#the classification report for precision, recall, F1 Score and support metrics: case of Logistic Regression
print('\n Classification Report for LG: \n', metrics.classification_report(log_reg_pred_tuned, Y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuarcy of this LG model is: 

 90.9090909090909

 Classification Report for LG: 
               precision    recall  f1-score   support

           0       0.94      0.89      0.91       156
           1       0.88      0.93      0.90       130

    accuracy                           0.91       286
   macro avg       0.91      0.91      0.91       286
weighted avg       0.91      0.91      0.91       286



15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



In [20]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# define the hyperparameters you want to test
hyperparameters = {
    'var_smoothing': uniform(loc=0, scale=0.1)
}

# perform a randomized search to find the best hyperparameters
nav_bayes_tuned = RandomizedSearchCV(nav_bayes_model, hyperparameters, n_iter=100, cv=5)
nav_bayes_tuned.fit(X_train, Y_train)

# use the best hyperparameters to predict on the test set
nav_bayes_pred_tuned = nav_bayes_tuned.predict(X_test)
#Accuracy using Naive Bayes-Bernoulli 
nav_bayes_accuracy = metrics.accuracy_score(nav_bayes_pred_tuned, Y_test)
print("Accuarcy of this NB-Bernoulli model is: \n\n", nav_bayes_accuracy*100)
#the classification report for precision, recall, F1 Score and support metrics: case of Naive Bayes
print('\n Classification Report for NB: \n', metrics.classification_report(nav_bayes_pred_tuned, Y_test))


Accuarcy of this NB-Bernoulli model is: 

 70.97902097902097

 Classification Report for NB: 
               precision    recall  f1-score   support

           0       0.84      0.68      0.75       183
           1       0.57      0.77      0.66       103

    accuracy                           0.71       286
   macro avg       0.71      0.72      0.70       286
weighted avg       0.74      0.71      0.72       286



In [21]:
from sklearn.model_selection import GridSearchCV

# define the hyperparameters you want to test
hyperparameters = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# perform a grid search to find the best hyperparameters
dec_tree_tuned = GridSearchCV(dec_tree_model, hyperparameters, cv=5)
dec_tree_tuned.fit(X_train, Y_train)

# use the best hyperparameters to predict on the test set
dec_tree_pred_tuned = dec_tree_tuned.predict(X_test)
#Accuracy using Decison Tree
dec_tree_accuracy = metrics.accuracy_score(dec_tree_pred_tuned, Y_test)
print("Accuarcy of this DT model is: \n\n", dec_tree_accuracy*100)
#the classification report for precision, recall, F1 Score and support metrics: case of Decision Tree
print('\n Classification Report for DT: \n', metrics.classification_report(dec_tree_pred_tuned, Y_test))


Accuarcy of this DT model is: 

 95.1048951048951

 Classification Report for DT: 
               precision    recall  f1-score   support

           0       0.94      0.97      0.95       144
           1       0.96      0.94      0.95       142

    accuracy                           0.95       286
   macro avg       0.95      0.95      0.95       286
weighted avg       0.95      0.95      0.95       286



[2]: https://pandas.pydata.org/docs/getting_started/install.html 

[3]: https://numpy.org/install/ 


[4]: https://matplotlib.org/stable/users/installing/index.html 


[5]: https://seaborn.pydata.org/installing.html 


[6]: https://scikit-learn.org/stable/install.html 


[7]: https://github.com/PacktPublishing/Hands-On-Data-Science-and-Python-Machine-Learning/blob/master/DecisionTree.ipynb 


[8]: https://github.com/PacktPublishing/Hands-On-Data-Science-and-Python-Machine-Learning/blob/master/MeanMedianExercise.ipynb 

[9]: https://github.com/PacktPublishing/Hands-On-Data-Science-and-Python-Machine-Learning/blob/master/Python101.ipynb 


[10]: https://github.com/PacktPublishing/Hands-On-Data-Science-and-Python-Machine-Learning/blob/master/TrainTest.ipynb 



[11]: https://github.com/PacktPublishing/Hands-On-Data-Science-and-Python-Machine-Learning/blob/master/MatPlotLib.ipynb 



[12]: https://github.com/PacktPublishing/Hands-On-Data-Science-and-Python-Machine-Learning/blob/master/Outliers.ipynb 


[13]: https://github.com/asavinov/machine-learning-and-data-processing#analysis-of-different-types-of-data 


[14]: https://github.com/asavinov/machine-learning-and-data-processing#libraries-utilities-tools 


[15]: https://github.com/stefmolin/Hands-On-Data-Analysis-with-Pandas-2nd-edition 