# Grid Search

- Titanic Dataset

In [3]:
import pandas as pd 
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

### Import necessary libraries

In [4]:
titanic_data=pd.read_csv("c:\\Users\\Administrator\\Downloads\\titanic-2.csv")
titanic_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Preprocess the dataset

In [5]:
# Drop "Name","Ticket" and "Cabin" columns

titanic_data=titanic_data.drop(["Name","Ticket","Cabin"],axis=1)

In [6]:
# Handle missing values (e.g., fill missing age with mean age )
titanic_data["Age"].fillna(titanic_data["Age"].mean(),inplace=True)
titanic_data["Embarked"].fillna(titanic_data["Embarked"].mode()[0],inplace=True)

In [7]:
Label_Encoder=LabelEncoder()
titanic_data['Sex']=Label_Encoder.fit_transform(titanic_data['Sex'])
titanic_data['Embarked']=Label_Encoder.fit_transform(titanic_data['Embarked'])

In [8]:
# Define Feature (X) and Traget (y)

X=titanic_data.drop('Survived',axis=1)
y=titanic_data['Survived']

In [9]:
# Split the data into training and testing dataset

X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.2, random_state= 98)

In [10]:
# Create a decision tree classifier 
clf= DecisionTreeClassifier(random_state=45)


### Hyperparameter grid to search

In [11]:
param_grid= {
    'criterion':['gini','entropy'],
    'max_depth':[None,10,20,30,40,50],
    'min_samples_split':[2,5,10],
    'min_samples_leaf':[1,2,4]
    
}

In [13]:
# Perform Grid Search with cross-validation (e.g., K=5)

grid_search=GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train,y_train)

In [14]:
# Get the best hyperparameters 

best_params = grid_search.best_params_

In [15]:
best_params

{'criterion': 'entropy',
 'max_depth': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 10}

In [16]:
# train decision tree classifier with best hyperparameter

best_clf=DecisionTreeClassifier(random_state=98,**best_params)
best_clf.fit(X_train,y_train)


In [17]:
# evaluate the model on the test data
y_pred=best_clf.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)

In [18]:
# Print the best hyperparameters and model accuracy

print(f"Best Hyperparameters : {best_params}")
print(f"Model accuracy on Test Data : {accuracy:.2f}")

Best Hyperparameters : {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Model accuracy on Test Data : 0.73


In [19]:
from sklearn.model_selection import RandomizedSearchCV

In [20]:
random_param= {
    'criterion':['gini','entropy'],
    'max_depth':[None,10,20,30,40,50],
    'min_samples_split':[2,5,10],
    'min_samples_leaf':[1,2,4]
    
}

In [21]:
random_search=RandomizedSearchCV(estimator=clf, param_distributions=random_param,cv=5,scoring='accuracy',n_jobs=-1)
random_search.fit(X_train,y_train)

In [22]:
random_best_param=random_search.best_params_

In [23]:
random_best_param

{'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_depth': 30,
 'criterion': 'entropy'}

In [24]:
# train decision tree classifier with the best hyperparameters 
best_random_clf=DecisionTreeClassifier(random_state=98, **random_best_param)
best_random_clf.fit(X_train,y_train)

In [25]:
y_random_pred=best_random_clf.predict(X_test)
accuracy_ran=accuracy_score(y_test,y_random_pred)

In [27]:
# Print the best hyperparameters and model accuracy
print(f"Best Hyperparameters: {random_best_param}")
print(f"Model Accuracy on Test Data: {accuracy_ran:.2f}")

Best Hyperparameters: {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 30, 'criterion': 'entropy'}
Model Accuracy on Test Data: 0.73


In [29]:
# Evaluate the model on the test set

y_pred_ran= random_search.predict(X_test)
accuracy_ran= accuracy_score(y_test,y_pred_ran)

In [30]:
# Print the best hyperparameters and model accuracy
print(f"Best Hyperparameters: {random_best_param}")
print(f"Model Accuracy on Test Data: {accuracy_ran:.2f}")

Best Hyperparameters: {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 30, 'criterion': 'entropy'}
Model Accuracy on Test Data: 0.72
