#Data Preparation

Read the Titanic.csv file into a DataFrame using pandas.
Check for any missing values and handle them appropriately.
Drop columns that won't be used in the model (e.g., Ticket, Name, Cabin).

In [None]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/Phase2/Week5/Day2/Copy of Titanic.csv")
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [None]:
df.drop(['Age','Cabin','Ticket'], axis=1, inplace=True)

In [None]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,1,0,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,0,0,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,0,0,8.05,S


In [None]:
df.duplicated().sum()

np.int64(0)

#Encoding Categorical Variables


Encode the Sex column (e.g., Male = 1, Female = 0).
Use LabelEncoder or OneHotEncoder for categorical variables

In [None]:
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Embarked'] = le.fit_transform(df['Embarked'])


#Create New Features

Extract titles from the Name column (e.g., Mr, Mrs, Miss) and create a new feature called Title.


In [None]:
df['Title'] = df['Name'].str.extract(r',\s*([^\.]+)\.')
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",1,1,0,7.25,2,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,0,71.2833,0,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",0,0,0,7.925,2,Miss


Create a FamilySize feature that combines the SibSp (siblings/spouses aboard) and Parch (parents/children aboard) columns.


In [None]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

Create a IsAlone feature that indicates whether a passenger is traveling alone (FamilySize = 1).

In [None]:
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",1,1,0,7.25,2,Mr,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,0,71.2833,0,Mrs,2,0
2,3,1,3,"Heikkinen, Miss. Laina",0,0,0,7.925,2,Miss,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,0,53.1,2,Mrs,2,0
4,5,0,3,"Allen, Mr. William Henry",1,0,0,8.05,2,Mr,1,1


In [None]:
df = df.drop(columns=['Name'])
df['Title'] = le.fit_transform(df['Title'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   SibSp        891 non-null    int64  
 5   Parch        891 non-null    int64  
 6   Fare         891 non-null    float64
 7   Embarked     891 non-null    int64  
 8   Title        891 non-null    int64  
 9   FamilySize   891 non-null    int64  
 10  IsAlone      891 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 76.7 KB


#Preprocess the Data

Split the dataset into features (X) and target (y).
Split the data into training and testing sets using train_test_split.

In [None]:
y = df.Survived
X = df.drop("Survived",axis=1)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1, stratify=y
)

#Define the Objective Function

Create an objective function for Optuna that:
Suggests hyperparameters for a model (e.g., GradientBoostingClassifier).
Trains the model on the training set.
Evaluates the model on the testing set.
Returns the accuracy.

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Create and train the Decision Tree model
model = DecisionTreeClassifier(max_depth=2, max_leaf_nodes=8, random_state=42)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the testing set
y_pred = model.predict(X_test)

accuracy=accuracy_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Recall Score:", recall)
print("Precision Score:", precision)
print("F1 Score:", f1)

Accuracy: 0.7555555555555555
Recall Score: 0.6857142857142857
Precision Score: 0.6857142857142857
F1 Score: 0.6857142857142857


In [None]:
import optuna
# Define the objective function for Optuna
def objective(trial):
    # Define the search space for hyperparameters
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 7),  # int
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 5, 12),  # int
        'splitter': trial.suggest_categorical('splitter', ['best', 'random']),  # categorical
        'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),  # float
        'random_state': trial.suggest_int('random_state', 1, 100),  # int
    }

    # Create and train the model with suggested hyperparameters
    model = DecisionTreeClassifier(**params)
    model.fit(X_train, y_train)

    # Evaluate the model's performance on the validation set
    predictions = model.predict(X_test)
    f1 = f1_score(y_test, predictions)

    return f1

#Set Up Optuna Study

Create an Optuna study to optimize the objective function.
Run the optimization for a specified number of trials (e.g., 100)

In [None]:
# Create a study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2025-06-14 13:01:48,319] A new study created in memory with name: no-name-4f2494cf-bac1-49da-8fb6-422179a70f05
[I 2025-06-14 13:01:48,334] Trial 0 finished with value: 0.676923076923077 and parameters: {'max_depth': 6, 'max_leaf_nodes': 8, 'splitter': 'best', 'min_weight_fraction_leaf': 0.21235294473770533, 'random_state': 36}. Best is trial 0 with value: 0.676923076923077.
[I 2025-06-14 13:01:48,352] Trial 1 finished with value: 0.676923076923077 and parameters: {'max_depth': 3, 'max_leaf_nodes': 9, 'splitter': 'random', 'min_weight_fraction_leaf': 0.1481304214260537, 'random_state': 34}. Best is trial 0 with value: 0.676923076923077.
[I 2025-06-14 13:01:48,366] Trial 2 finished with value: 0.676923076923077 and parameters: {'max_depth': 5, 'max_leaf_nodes': 8, 'splitter': 'random', 'min_weight_fraction_leaf': 0.12134641153284981, 'random_state': 28}. Best is trial 0 with value: 0.676923076923077.
[I 2025-06-14 13:01:48,380] Trial 3 finished with value: 0.6764705882352942 and param

#Retrieve and Display Results

Access the best hyperparameters found during the optimization.

In [None]:
# Access the best parameters
best_params = study.best_params
print("Best Parameters:", best_params)

# Access the best score
best_score = study.best_value
print("Best Score:", best_score)

Best Parameters: {'max_depth': 7, 'max_leaf_nodes': 5, 'splitter': 'best', 'min_weight_fraction_leaf': 0.030975538207727193, 'random_state': 56}
Best Score: 0.7058823529411765


Train a final model using these best parameters on the entire training data.
Evaluate this model on the test dataset and display the accuracy.

In [None]:
# Create a new model with the best parameters
final_model = DecisionTreeClassifier(**best_params)

# Fit the model on the entire training data
final_model.fit(X_train, y_train)

# Predict on the test data
y_pred = final_model.predict(X_test)

# Evaluate the model's performance
final_accuracy = accuracy_score(y_test, y_pred)
print("Final Model Accuracy on Test Data:", final_accuracy)

Final Model Accuracy on Test Data: 0.7777777777777778


In [None]:
accuracy=accuracy_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Recall Score:", recall)
print("Precision Score:", precision)
print("F1 Score:", f1)

Accuracy: 0.7777777777777778
Recall Score: 0.6857142857142857
Precision Score: 0.7272727272727273
F1 Score: 0.7058823529411765
