In [49]:
#Prepare the Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
dataset = pd.read_csv("../Social_Network_Ads.csv")

# Drop 'User ID' column
dataset.drop(columns=['User ID'], inplace=True)

# Perform one-hot encoding on categorical variables
dataset = pd.get_dummies(dataset, dtype=int, drop_first=True)

# Separate features and target
X = dataset.drop(columns=['Purchased'])
y = dataset['Purchased']

# Split into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)


In [50]:
dataset

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1
...,...,...,...,...
395,46,41000,1,0
396,51,23000,1,1
397,50,20000,1,0
398,36,33000,0,1


In [51]:
#Define Classification Algorithms and Hyperparameters 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Create pipelines for models requiring scaling
pipelines = {
    'RandomForest': Pipeline([('scaler', StandardScaler()), ('classifier', RandomForestClassifier())]),
    'SVC': Pipeline([('scaler', StandardScaler()), ('classifier', SVC())]),
    'KNeighbors': Pipeline([('scaler', StandardScaler()), ('classifier', KNeighborsClassifier())]),
    'GaussianNB': Pipeline([('scaler', StandardScaler()), ('classifier', GaussianNB())]),
    'MultinomialNB': Pipeline([('scaler', MinMaxScaler()), ('classifier', MultinomialNB())]),
    'BernoulliNB': Pipeline([('scaler', StandardScaler()), ('classifier', BernoulliNB())]),
    'LogisticRegression': Pipeline([('scaler', StandardScaler()), ('classifier', LogisticRegression())]),
    'PassiveAggressive': Pipeline([('scaler', StandardScaler()), ('classifier', PassiveAggressiveClassifier())])
}

# Define the grid of hyperparameters for each model
params = {
    'RandomForest': {
        'classifier__n_estimators': [10, 50, 100],
        'classifier__max_depth': [None, 10, 20, 30]
    },
    'SVC': {
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf', 'poly']
    },
    'KNeighbors': {
        'classifier__n_neighbors': [3, 5, 7],
        'classifier__metric': ['euclidean', 'manhattan']
    },
    'GaussianNB': {},
    'MultinomialNB': {
        'classifier__alpha': [0.1, 1, 10]
    },
    'BernoulliNB': {
        'classifier__alpha': [0.1, 1, 10]
    },
    'LogisticRegression': {
        'classifier__C': [0.1, 1, 10],
        'classifier__penalty': ['l2'],
        'classifier__solver': ['lbfgs']
    },
    'PassiveAggressive': {
        'classifier__C': [0.1, 1, 10],
        'classifier__max_iter': [1000, 2000]
    }
}


In [52]:
#Perform Grid Search with Cross-Validation
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score,confusion_matrix

results = []
best_estimators = {}

for model_name in pipelines:
    grid_search = GridSearchCV(estimator=pipelines[model_name], param_grid=params[model_name], cv=5, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_estimators[model_name] = grid_search.best_estimator_
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model.named_steps['classifier'], "predict_proba") else None
    
    clf_report = classification_report(y_test, y_pred, output_dict=True)
    roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else 'N/A'
    cm = confusion_matrix(y_test, y_pred)
    
    results.append({
        'Algorithm': model_name,
        'Best Params': grid_search.best_params_,
        'Accuracy': clf_report['accuracy'],
        'Precision': clf_report['weighted avg']['precision'],
        'Recall': clf_report['weighted avg']['recall'],
        'F1-Score': clf_report['weighted avg']['f1-score'],
        'ROC AUC': roc_auc,
        'Confusion Matrix': cm
    })



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [53]:
from tabulate import tabulate
import pandas as pd

# Assuming 'results' is a list of dictionaries containing your model evaluation metrics

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Display the DataFrame with reduced width using tabulate
print(tabulate(results_df, headers='keys', tablefmt='grid', showindex=False, numalign="center", stralign='center'))


+--------------------+----------------------------------------------------------------------------------+------------+-------------+----------+------------+--------------------+--------------------+
|     Algorithm      |                                   Best Params                                    |  Accuracy  |  Precision  |  Recall  |  F1-Score  |      ROC AUC       |  Confusion Matrix  |
|    RandomForest    |         {'classifier__max_depth': None, 'classifier__n_estimators': 50}          |  0.925373  |   0.92641   | 0.925373 |  0.925675  | 0.9545018007202881 |      [[79  6]      |
|                    |                                                                                  |            |             |          |            |                    |      [ 4 45]]      |
+--------------------+----------------------------------------------------------------------------------+------------+-------------+----------+------------+--------------------+--------------------+
|    

In [54]:
#Save the Best Model using Pickle
import pickle

# Find the model with the best accuracy
best_model_info = max(results, key=lambda x: x['Accuracy'])
best_model_name = best_model_info['Algorithm']
best_model = best_estimators[best_model_name]

# Save the best model
with open('best_classification_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

print(f"Best model saved: {best_model_name}")



Best model saved: RandomForest


In [55]:
import pickle
import pandas as pd

# Load the saved model
with open('best_classification_model.pkl', 'rb') as file:
    best_model = pickle.load(file)


In [56]:
# Example custom input (ensure it matches the original features used for training)
custom_input = pd.DataFrame([[30, 87000, 1]], columns=['Age', 'EstimatedSalary', 'Gender_Male'])

# Transform the custom input using the scaler from the pipeline
scaler = best_model.named_steps['scaler']
custom_input_scaled = scaler.transform(custom_input)

# Predict with the custom input
custom_prediction = best_model.named_steps['classifier'].predict(custom_input_scaled)

print(f"Prediction for custom input: {custom_prediction[0]}")

#Prediction Outcome: The model predicts that, based on the input features (like Age, EstimatedSalary, Gender_Male, etc.), 
# the customer represented by your custom input (Age=30, EstimatedSalary=87000, Gender_Male=1) is predicted as not purchased (assuming 0 represents this class).

Prediction for custom input: 0
