# Feature Enineering and model tuning

**Objective:**

Enhance logistic regression model created in day 3 by engineering new features and tuning hyperparameters 
to improve its performance on the Titanic survival prediction task. Evaluate the impact using the classification report.

In [41]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [5]:
def wrangle(filepath):
    df = pd.read_csv(filepath)

    return df

In [29]:
df = wrangle(r"C:\Users\User\Desktop\100DayOfCode\Titanic_clean.csv")

In [30]:
df.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [31]:
df = df.drop(columns = "Unnamed: 0")

In [32]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [58]:
# To ensure original dataset is unchanged
df_clean = df.copy()
df_clean.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [59]:
# Feature Engineering
df_clean["Family_size"] = df_clean["SibSp"] + df_clean["Parch"] + 1
df_clean["Title"] = df_clean["Name"].str.extract(" ([A-Za-z]+)\.", expand = False)
df_clean.drop(["SibSp", "Parch", "Name"], axis = 1, inplace = True)

In [62]:
df_clean.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,Ticket,Fare,Family_size,Sex_male,Embarked_Q,Embarked_S,...,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir
0,1,0,3,22.0,A/5 21171,7.25,2,1,0,1,...,0,0,0,0,0,1,0,0,0,0
1,2,1,1,38.0,PC 17599,71.2833,2,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,3,1,3,26.0,STON/O2. 3101282,7.925,1,0,0,1,...,0,0,1,0,0,0,0,0,0,0
3,4,1,1,35.0,113803,53.1,2,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,5,0,3,35.0,373450,8.05,1,1,0,1,...,0,0,0,0,0,1,0,0,0,0


In [61]:
# Data preprocessing
# Encode categorical variaables
categorical_cols = ["Sex", "Embarked", "Title"]
df_clean = pd.get_dummies(df_clean, columns=categorical_cols, drop_first=True)
# convert boolean to int
for col in df_clean.columns:
    if df_clean[col].dtype == "bool":
        df_clean[col] = df_clean[col].astype(int)


In [67]:
features = [col for col in df_clean.columns if col not in["PassengerId","Survived","Ticket"]]
X = df_clean[features]
y = df_clean["Survived"]

# Scale Numerical features
num_cols = ["Pclass", "Age", "Fare", "Family_size"]
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

# Train test split
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[num_cols] = scaler.fit_transform(X[num_cols])


**param_grid = {'C': [0.01, 0.1, 1, 10, 100]}**

Defines the hyperparameter grid to search. Here, it specifies values for the C parameter of LogisticRegression.

**Parameter C:**
                                                                    
C is the inverse of the regularization strength in logistic regression. It controls the trade-off between fitting the training data well and keeping the model simple (preventing overfitting).

Smaller C (e.g., 0.01): Stronger regularization, which penalizes large coefficients more, leading to a simpler model that may underfit.

Larger C (e.g., 100): Weaker regularization, allowing the model to fit the training data more closely, which may lead to overfitting.

Values: The list [0.01, 0.1, 1, 10, 100] provides a range of regularization strengths to test, from weak to strong.





In [70]:
# Hyper parameter tuning
param_grid = {"C": [0.01,.1, 1, 10, 100]}
model = LogisticRegression(max_iter = 500)
grid_search = GridSearchCV(model, param_grid,cv= 5, scoring ="accuracy")
grid_search.fit(X_train, y_train)

In [71]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Best Model: {grid_search.best_estimator_} ")
print(f"Accuracy: {accuracy}")
print("Classification Report: \n", classification_report(y_test, y_pred))

Best Model: LogisticRegression(C=10, max_iter=500) 
Accuracy: 0.8156424581005587
Classification Report: 
               precision    recall  f1-score   support

           0       0.85      0.84      0.84       105
           1       0.77      0.78      0.78        74

    accuracy                           0.82       179
   macro avg       0.81      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

