In [1]:
# Importing Libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


In [2]:
# 1. Loading dataset
data = pd.read_csv("Titanic-Dataset.csv")
print(data.head())
print(data.info()) # Checking for null values and data types

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c

In [3]:
#2. Data Cleaning
# Handling Missing Values
# Imputing missing "Age" with median, "Embarked" with most frequent value, and dropping rows with missing "Cabin"
imputer_age = SimpleImputer(strategy="median")
data["Age"] = imputer_age.fit_transform(data[["Age"]])

imputer_embarked = SimpleImputer(strategy="most_frequent")
data["Embarked"] = imputer_embarked.fit_transform(data[["Embarked"]]).ravel()

# Dropping the "Cabin" column due to missing values
data.drop(columns=["Cabin"], inplace=True)

# Dropping any remaining rows with null values
data.dropna(inplace=True)

In [4]:
# 3. Feature Engineering and Encoding
# Converting categorical features into numeric
data = pd.get_dummies(data, columns=["Sex","Embarked"],drop_first=True)

#Dropping unnecessary columns like "Pclass", "Parch", and "SibSp"
data.drop(columns=["Pclass", "Parch", "SibSp"])

Unnamed: 0,PassengerId,Survived,Name,Age,Ticket,Fare,Sex_male,Embarked_Q,Embarked_S
0,1,0,"Braund, Mr. Owen Harris",22.0,A/5 21171,7.2500,True,False,True
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,PC 17599,71.2833,False,False,False
2,3,1,"Heikkinen, Miss. Laina",26.0,STON/O2. 3101282,7.9250,False,False,True
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,113803,53.1000,False,False,True
4,5,0,"Allen, Mr. William Henry",35.0,373450,8.0500,True,False,True
...,...,...,...,...,...,...,...,...,...
886,887,0,"Montvila, Rev. Juozas",27.0,211536,13.0000,True,False,True
887,888,1,"Graham, Miss. Margaret Edith",19.0,112053,30.0000,False,False,True
888,889,0,"Johnston, Miss. Catherine Helen ""Carrie""",28.0,W./C. 6607,23.4500,False,False,True
889,890,1,"Behr, Mr. Karl Howell",26.0,111369,30.0000,True,False,False


In [5]:
print(data.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S'],
      dtype='object')


In [13]:
# Dropping non numerical columns for Normalization

columns_to_drop = ["Sex_male", "Embarked_Q", "Embarked_S"]
data = data.drop(columns=columns_to_drop)
print(data)

KeyError: "['Sex_male', 'Embarked_Q', 'Embarked_S'] not found in axis"

In [15]:
print(data.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch',
       'Fare'],
      dtype='object')


In [16]:
columns_to_drop = ["Name"]
data = data.drop(columns=columns_to_drop)
print(data)

     PassengerId  Survived  Pclass   Age  SibSp  Parch     Fare
0              1         0       3  22.0      1      0   7.2500
1              2         1       1  38.0      1      0  71.2833
2              3         1       3  26.0      0      0   7.9250
3              4         1       1  35.0      1      0  53.1000
4              5         0       3  35.0      0      0   8.0500
..           ...       ...     ...   ...    ...    ...      ...
886          887         0       2  27.0      0      0  13.0000
887          888         1       1  19.0      0      0  30.0000
888          889         0       3  28.0      1      2  23.4500
889          890         1       1  26.0      0      0  30.0000
890          891         0       3  32.0      0      0   7.7500

[891 rows x 7 columns]


In [19]:
# 4. Normalization 
# Separating features and targets
x = data.drop(columns="Survived")
y = data["Survived"]

# Ensuring the features are Standardized
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [25]:
# 5. Feature Extraction

selector = SelectKBest(f_classif, k=5)
x_new = selector.fit_transform(x, y)

In [23]:
# 6. Train-Test Split
x_train, x_test, y_train, y_test = train_test_split(x_new, y, test_size=0.3, random_state=42)

In [24]:
# 7. Model Training and Testing
# Logistic Regression model for classification
model = LogisticRegression()
model.fit(x_train, y_train)

# Predicting and Evaluating the chosen model
y_pred = model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7238805970149254
              precision    recall  f1-score   support

           0       0.71      0.90      0.79       157
           1       0.78      0.47      0.58       111

    accuracy                           0.72       268
   macro avg       0.74      0.69      0.69       268
weighted avg       0.74      0.72      0.71       268

