In this notebook we train and evaluate a logistic regression survival model on the cleaned data, then generate predictions for the test set.

#### Load cleaned data

In [164]:
from pathlib import Path

import pandas as pd

# Load the cleaned train and test CSVs (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/train_cleaned.csv',
        '../data/processed/test_cleaned.csv',
    )
)

# Display-only preview: the result is not assigned, so `train` itself keeps
# its PassengerId column for the cells below.
train.drop(columns='PassengerId')



Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,LastName_encoded,Title_encoded,Cabin_letter_encoded
0,0.0,0.841916,male,-0.584644,0.481288,-0.445000,-0.503004,S,0.073352,-0.231990,-0.802416,-0.513694
1,1.0,-1.546098,female,0.598982,0.481288,-0.445000,0.734886,C,0.073352,1.259457,1.425913,1.329952
2,1.0,0.841916,female,-0.288738,-0.479087,-0.445000,-0.489955,S,-0.558346,1.259457,1.095526,-0.513694
3,1.0,-1.546098,female,0.377052,0.481288,-0.445000,0.383367,S,0.073352,0.513733,1.425913,1.329952
4,0.0,0.841916,male,0.377052,-0.479087,-0.445000,-0.487538,S,-0.558346,0.513733,-0.802416,-0.513694
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,-0.352091,male,-0.214761,-0.479087,-0.445000,-0.391845,S,-0.558346,-0.231990,-1.351927,-0.513694
887,1.0,-1.546098,female,-0.806574,-0.479087,-0.445000,-0.063202,S,-0.558346,0.762308,1.095526,2.281800
888,0.0,0.841916,female,-1.761399,0.481288,1.866526,-0.189826,S,1.336749,-0.231990,1.095526,-0.513694
889,1.0,-1.546098,male,-0.288738,-0.479087,-0.445000,-0.063202,C,-0.558346,1.259457,-0.802416,1.329952


In [165]:
# Columns fed to the model unchanged (passed through by the preprocessor).
numerical_features = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'FamilySize',
    'Title_encoded',
    'LastName_encoded',
    'Cabin_letter_encoded',
]

# String-valued columns that are one-hot encoded before modelling.
categorical_features = [
    'Sex',
    'Embarked',
]

In [166]:
# Sanity check: count NaNs per column of the training frame.
print("Missing values in train data:")
missing_values_train = train.isna().sum()
print(missing_values_train)

Missing values in train data:
PassengerId             0
Survived                0
Pclass                  0
Sex                     0
Age                     0
SibSp                   0
Parch                   0
Fare                    0
Embarked                0
FamilySize              0
LastName_encoded        0
Title_encoded           0
Cabin_letter_encoded    0
dtype: int64


In [167]:
# Peek at the first 20 rows of the training frame.
train.head(n=20)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,LastName_encoded,Title_encoded,Cabin_letter_encoded
0,1,0.0,0.841916,male,-0.584644,0.481288,-0.445,-0.503004,S,0.073352,-0.23199,-0.802416,-0.513694
1,2,1.0,-1.546098,female,0.598982,0.481288,-0.445,0.734886,C,0.073352,1.259457,1.425913,1.329952
2,3,1.0,0.841916,female,-0.288738,-0.479087,-0.445,-0.489955,S,-0.558346,1.259457,1.095526,-0.513694
3,4,1.0,-1.546098,female,0.377052,0.481288,-0.445,0.383367,S,0.073352,0.513733,1.425913,1.329952
4,5,0.0,0.841916,male,0.377052,-0.479087,-0.445,-0.487538,S,-0.558346,0.513733,-0.802416,-0.513694
5,6,0.0,0.841916,male,-0.041783,-0.479087,-0.445,-0.479645,Q,-0.558346,0.265159,-0.802416,-0.513694
6,7,0.0,-1.546098,male,1.782608,-0.479087,-0.445,0.359444,S,-0.558346,-0.23199,-0.802416,2.315228
7,8,0.0,0.841916,male,-2.064177,2.402037,0.710763,-0.235739,S,1.968447,-0.23199,0.664813,-0.513694
8,9,1.0,0.841916,female,-0.214761,-0.479087,1.866526,-0.427932,S,0.705051,0.513733,1.425913,-0.513694
9,10,1.0,-0.352091,female,-1.176457,0.481288,-0.445,-0.061833,C,0.073352,0.513733,1.425913,-0.513694


In [168]:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report

# Preprocessing: numeric columns pass through unchanged; categorical columns are
# one-hot encoded with the first level dropped. Columns not listed in either
# group (e.g. PassengerId) are discarded by ColumnTransformer's default
# remainder='drop'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
    ])

# Train/validation split (80/20, fixed seed for reproducibility).
# The training part is bound to a new name instead of rebinding `train`, so
# re-running this cell always splits the full frame rather than repeatedly
# shrinking an already-split one.
train_split, valid = train_test_split(train, test_size=0.2, random_state=42)

# Separate the target from the features for both parts of the split.
X_train = train_split.drop(['Survived'], axis=1)
y_train = train_split['Survived']
X_valid = valid.drop(['Survived'], axis=1)
y_valid = valid['Survived']

# Create the pipeline: preprocessing followed by a logistic regression classifier.
# The final step keeps its existing name 'regressor' so that any code addressing
# it by name (e.g. `regressor__C`) keeps working, even though it is a classifier.
log_reg_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LogisticRegression(random_state=42))
])

# Fit on the training part and evaluate logistic regression on the held-out part.
# NOTE(review): if the validation scores look implausibly high, check whether
# LastName_encoded / Title_encoded were derived from the target upstream
# (possible leakage) -- TODO confirm against the cleaning notebook.
log_reg_pipeline.fit(X_train, y_train)
preds = log_reg_pipeline.predict(X_valid)

print("\nLogistic Regression Results (Predicting Survival):")
print("Classification Report:")
print(classification_report(y_valid, preds))



Logistic Regression Results (Predicting Survival):
Classification Report:
              precision    recall  f1-score   support

         0.0       0.98      0.97      0.98       105
         1.0       0.96      0.97      0.97        74

    accuracy                           0.97       179
   macro avg       0.97      0.97      0.97       179
weighted avg       0.97      0.97      0.97       179



In [169]:
# Sanity check: count NaNs per column of the *test* frame.
# A fully-null Survived column is expected here if the test set is unlabeled.
missing_values = test.isnull().sum()
print("Missing values in test data:")
print(missing_values)

Missing values in train data:
PassengerId               0
Survived                418
Pclass                    0
Sex                       0
Age                       0
SibSp                     0
Parch                     0
Fare                      0
Embarked                  0
FamilySize                0
LastName_encoded          0
Title_encoded             0
Cabin_letter_encoded      0
dtype: int64


## Output Results

In [170]:
# Take PassengerId from the cleaned test frame; it is needed to label each
# prediction in the submission file.
passenger_ids = test['PassengerId']

# Make predictions. The (empty) target column is dropped first so the frame
# handed to the pipeline mirrors how X_train was built; errors='ignore' keeps
# this working if the test frame carries no Survived column at all.
test_features = test.drop(columns=['Survived'], errors='ignore')
preds = log_reg_pipeline.predict(test_features).astype(int)

# Create submission DataFrame
submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': preds
})

# Save to CSV, creating the output directory first so a fresh checkout
# without ../outputs does not fail on write.
output_path = Path('../outputs/logistic_regression_results.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(output_path, index=False)