## GridSearch with BestModel fine tuning_Fully automated

In [156]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


In [158]:
#Read the CSV file
data = pd.read_csv("Social_Network_Ads.csv")
data.head(5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [160]:
#Print the shape of data
print("No of rows :",data.shape[0])
print("No of columns",data.shape[1])


No of rows : 400
No of columns 5


In [162]:
#Delete the unwanted feature:
data = data.drop(['User ID'], axis=1)
data.head(5)

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


In [164]:
#Since we have one object value ('Gender') - convert to numerical value
df = pd.get_dummies(data,drop_first = True, dtype=int)
df.head(5)

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1


In [166]:
# split the data into X and Y
independent = df.drop(columns=['Purchased'])
dependent = df['Purchased']

In [168]:
#Get the count of each value in parchesed feature
dependent.value_counts()

Purchased
0    257
1    143
Name: count, dtype: int64

In [170]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(independent, dependent, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [172]:
# Models dictionary
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier()
}

best_model_name = None
best_model = None
best_accuracy = 0

print("Model Evaluation:\n")
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    clf_acc = classification_report(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")
    print("classification_report :",clf_acc)
    
    # Save best model
    if acc > best_accuracy:
        best_accuracy = acc
        best_model_name = name
        best_model = model

print(f"\n Best Model: {best_model_name} with Accuracy: {best_accuracy:.4f}")


Model Evaluation:

Logistic Regression Accuracy: 0.8875
classification_report :               precision    recall  f1-score   support

           0       0.88      0.96      0.92        52
           1       0.91      0.75      0.82        28

    accuracy                           0.89        80
   macro avg       0.90      0.86      0.87        80
weighted avg       0.89      0.89      0.88        80

Decision Tree Accuracy: 0.8375
classification_report :               precision    recall  f1-score   support

           0       0.87      0.88      0.88        52
           1       0.78      0.75      0.76        28

    accuracy                           0.84        80
   macro avg       0.82      0.82      0.82        80
weighted avg       0.84      0.84      0.84        80

SVM Accuracy: 0.9250
classification_report :               precision    recall  f1-score   support

           0       0.96      0.92      0.94        52
           1       0.87      0.93      0.90        28

  

In [174]:
from sklearn.model_selection import GridSearchCV

param_grid = {}

if best_model_name == "Logistic Regression":
    param_grid = {
        'C': [0.1, 1, 10],
        'solver': ['liblinear', 'lbfgs']
    }

elif best_model_name == "Decision Tree":
    param_grid = {
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10]
    }

elif best_model_name == "SVM":
    param_grid = {
        'C': [0.1, 1, 10,15,20],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    }

elif best_model_name == "Random Forest":
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    }


In [176]:
grid_search = GridSearchCV(estimator=best_model, param_grid=param_grid,refit=True,
                           cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

grid_search.fit(X_train_scaled, y_train)


Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [178]:
print("\n Best Parameters Found:")
print(grid_search.best_params_)

print("\n Best Cross-Validation Accuracy:")
print(grid_search.best_score_)

# Evaluate on test set
best_tuned_model = grid_search.best_estimator_
y_pred_tuned = best_tuned_model.predict(X_test_scaled)
print("\n Test Set Accuracy After Tuning:")
print(accuracy_score(y_test, y_pred_tuned))



 Best Parameters Found:
{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}

 Best Cross-Validation Accuracy:
0.9125

 Test Set Accuracy After Tuning:
0.925


In [180]:
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred_tuned))



Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94        52
           1       0.87      0.93      0.90        28

    accuracy                           0.93        80
   macro avg       0.91      0.93      0.92        80
weighted avg       0.93      0.93      0.93        80



In [182]:
result1 = grid_search.predict([[27, 57000, 1]])


In [184]:
result

array([1], dtype=int64)

In [186]:
## Dump into pickle file:
import pickle

In [188]:
filename = "scaler.sav" 
pickle.dump(scaler,open(filename, 'wb'))

In [190]:
filename = "finilized_classified_gridModel.sav"

In [192]:
pickle.dump(best_tuned_model,open(filename,'wb'))