In [20]:
# import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [21]:
# Load the raw advertising dataset from the working directory.
DATA_PATH = "advertising.csv"
df = pd.read_csv(DATA_PATH)

In [22]:
# Remove the columns that must not end up in the feature matrix built from
# `df` in the following cells.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it once the columns are already gone is a
# no-op instead of a KeyError.
df = df.drop(
    columns=["Ad Topic Line", "City", "Country", "Timestamp"],
    errors="ignore",
)

# Slower Version (Manual Preprocessing)

In [23]:
# Split the frame into the label column (y) and everything else (X).
target_var = "Clicked on Ad"
X, y = df.drop(columns=[target_var]), df[target_var]

In [24]:
# Hold out a test set. The fixed seed makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)


In [25]:
# Standardise the features: learn the scaling statistics from the training
# split only, then apply that same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [26]:
# Fit a default-configured logistic regression on the scaled training data.
# `fit` returns the estimator itself, so construction and fitting are chained;
# the bare `model` line keeps the fitted estimator as the cell's displayed value.
model = LogisticRegression().fit(X_train_scaled, y_train)
model


In [27]:
# Score the baseline model on the held-out (scaled) test split.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

# Per-class precision / recall / F1 first, then the overall accuracy.
report = classification_report(y_test, y_pred)
print(report)
print(f"Test set accuracy: {accuracy}")


              precision    recall  f1-score   support

           0       0.95      0.97      0.96       120
           1       0.98      0.95      0.96       130

    accuracy                           0.96       250
   macro avg       0.96      0.96      0.96       250
weighted avg       0.96      0.96      0.96       250

Test set accuracy: 0.964


In [28]:
# Tune the regularisation strength C with 5-fold cross-validation, scored by F1.
# NOTE: X_train_scaled was produced by a scaler fit on the whole training split,
# so every validation fold has already contributed to the scaling statistics.
# This is the manual approach that the pipeline section is contrasted with.
param_grid = {"C": [0.001, 0.01, 0.1, 1, 10]}

grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring="f1",
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

print(f"Best cross-validation score: {grid_search.best_score_}")
print(f"Best parameters: {grid_search.best_params_}")


Best cross-validation score: 0.9739792165832041
Best parameters: {'C': 1}


In [29]:
# Evaluate the estimator selected by the grid search on the scaled test split.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_scaled)
accuracy_best = accuracy_score(y_test, y_pred_best)

# Per-class report first, then the overall accuracy of the tuned model.
report_best = classification_report(y_test, y_pred_best)
print(report_best)
print(f"Test set accuracy with best model: {accuracy_best}")


              precision    recall  f1-score   support

           0       0.95      0.97      0.96       120
           1       0.98      0.95      0.96       130

    accuracy                           0.96       250
   macro avg       0.96      0.96      0.96       250
weighted avg       0.96      0.96      0.96       250

Test set accuracy with best model: 0.964


# Using Pipelines for Efficiency


In [15]:
#Juxtapose this with Pipelines
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


In [16]:
# Scaling and the classifier live in one pipeline, so the grid search receives
# the *unscaled* training data and the scaler is handled inside the search.
pipe = make_pipeline(StandardScaler(), LogisticRegression())

# Step parameters are addressed as "<lowercased step class name>__<param>".
param_grid = [{"logisticregression__C": [0.001, 0.01, 0.1, 1, 10]}]

# `fit` returns the search object itself, so it can be chained.
clf = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring="f1").fit(X_train, y_train)
print(f"{clf.best_score_}: {clf.best_params_}")

0.9739792165832041: {'logisticregression__C': 1}


In [17]:
#Trying Another Model

In [18]:
# Gradient boosting with scikit-learn's GradientBoostingClassifier
# (this is not the separate XGBoost library).
#
# random_state is pinned because the grid includes subsample=0.8, which makes
# each boosting stage fit on a random subset of rows; without a seed the
# cross-validation scores and the selected parameters can change between runs.
#
# NOTE: this cell rebinds `pipe`, `param_grid` and `clf`, replacing the
# logistic-regression objects created in the previous pipeline cell.
pipe = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=42))


param_grid = {
    'gradientboostingclassifier__n_estimators': [50, 100],       # Number of boosting stages
    'gradientboostingclassifier__learning_rate': [0.1, 0.2],     # Learning rate shrinks the contribution of each tree
    'gradientboostingclassifier__max_depth': [3, 4],             # Maximum depth of the individual regression estimators
    'gradientboostingclassifier__subsample': [0.8, 1.0]          # Fraction of samples used for fitting the individual base learners
}


# Exhaustive search over the 16 parameter combinations, scored by F1.
clf = GridSearchCV(pipe, param_grid, scoring="f1")

clf.fit(X_train, y_train)
print(f"{clf.best_score_}: {clf.best_params_}")

0.9701539044251379: {'gradientboostingclassifier__learning_rate': 0.1, 'gradientboostingclassifier__max_depth': 3, 'gradientboostingclassifier__n_estimators': 100, 'gradientboostingclassifier__subsample': 0.8}


In [19]:
# Take the best pipeline found by the most recent grid search (`clf`) and
# report its score on the untouched test split. For scikit-learn classifiers
# `score` is mean accuracy, not the F1 used during the search.
best_model = clf.best_estimator_
best_model.score(X=X_test, y=y_test)

0.932

# Advantages of using a pipeline

Consistency: The same preprocessing steps (e.g., scaling) are automatically applied to both the training and test data.

Without a pipeline, you might accidentally forget to apply scaling to your test data after scaling your training data, leading to poor model performance.

With a pipeline, you define the scaling step once, and it’s guaranteed to be applied consistently to all data.

Efficiency: Reduces repetitive code and potential for errors by automating preprocessing.

Using GridSearchCV with pipelines is straightforward, as the entire pipeline is optimized, ensuring that hyperparameter tuning takes all steps into account.




# Data Leakage 

Preventing Data Leakage
Detailed Explanation:

Data leakage can significantly inflate the model's performance metrics, giving an unrealistic estimate of its real-world performance. It happens when the information from the test set is inadvertently used to train the model.
Pipelines prevent data leakage by ensuring that, inside each cross-validation fold, every preprocessing step is fit only on that fold's training portion and then applied to its validation portion, rather than being fit once on all the data before the split.
Example: Data Leakage without Pipelines
Without Pipelines:

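Scaling data before splitting it into training and test sets can lead to data leakage. The scaler learns from the entire dataset, including the test set, which should not influence the training process. The same problem occurs, more subtly, when a scaler is fit on the full training set and the already-scaled data is then passed to cross-validation: each validation fold has already influenced the scaling statistics.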