# Grid Search

- Titanic Dataset

### Import necessary libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [2]:
import pandas as pd
import numpy as np

### Load the Titanic dataset

In [3]:
titanic_data = pd.read_csv('titanic.csv')  # Replace 'path_to_titanic.csv' with the actual file path

In [4]:
titanic_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Preprocess the dataset

In [5]:
# Drop 'Name,' 'Ticket,' and 'Cabin' columns
titanic_data = titanic_data.drop(['Name', 'Ticket', 'Cabin'], axis=1)


In [6]:
#titanic_data['Age'].fillna
#imputer = SimpleImputer(strategy='mean')
# titanic_data['Age'] = imputer.fit_transform(titanic_data[['Age']])

In [7]:
# Handle missing values (e.g., fill missing ages with the mean age)
titanic_data['Age'].fillna(titanic_data['Age'].mean())
titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0], inplace=True)


In [8]:
titanic_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         0
dtype: int64

In [9]:
# Encode categorical variables ('Sex' and 'Embarked')
label_encoder = LabelEncoder()
titanic_data['Sex'] = label_encoder.fit_transform(titanic_data['Sex'])
titanic_data['Embarked'] = label_encoder.fit_transform(titanic_data['Embarked'])

In [10]:
# Define features (X) and target (y)
X = titanic_data.drop('Survived', axis=1)  # Assuming 'Survived' is the target variable
y = titanic_data['Survived']

In [11]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
# Create a Decision Tree classifier
clf = DecisionTreeClassifier(random_state=42)

### Hyperparameter grid to search

In [13]:
param_gridsss = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [14]:
# Perform Grid Search with cross-validation (e.g., K=5)
grid_search = GridSearchCV(estimator=clf, param_grid=param_gridsss, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)


In [15]:
# Get the best hyperparameters
best_params = grid_search.best_params_

In [16]:
best_params

{'criterion': 'entropy',
 'max_depth': 10,
 'min_samples_leaf': 4,
 'min_samples_split': 10}

In [17]:
# Train a Decision Tree classifier with the best hyperparameters
best_clf = DecisionTreeClassifier(random_state=42, **best_params)
best_clf.fit(X_train, y_train)


In [18]:
# Evaluate the model on the test set
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [19]:
# Print the best hyperparameters and model accuracy
print(f"Best Hyperparameters: {best_params}")
print(f"Model Accuracy on Test Data: {accuracy:.2f}")

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Model Accuracy on Test Data: 0.78
