In [102]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')



Data Pre-processing

In [103]:
# missing values
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
train_data['Fare'].fillna(train_data['Fare'].median(), inplace=True)

# categorical features
label_sex = LabelEncoder()
train_data['Sex'] = label_sex.fit_transform(train_data['Sex'])  # 0 for male, 1 for female
train_data = pd.get_dummies(train_data, columns=['Embarked'], drop_first=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the

Train Model

In [104]:

# features

# JUSTIFICATION:
# The name column is dropped
# Sex feature is kept due to the historical context of the Titanic disaster where women were prioitized for boarding the lifeboats, and dropping it caused a decrease in the accuracy of the model (0.81-->0.72)
# Age and Family size features slighly increased the accuracy of the model
# Emarked feature was kept, but it didn't have a significant impact on the accuracy of the model

features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked_Q', 'Embarked_S',
            'SibSp', 'Parch']
X = train_data[features]
y = train_data['Survived']


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# logistic regression model
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_val)

# accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation accuracy: {accuracy:.2f}')


# NOTE: also tried to tune the hyperparameters of the model by using sklearn package GridSearchCv, but it didn't have a significant impact on the accuracy of the model

Validation accuracy: 0.84
Best parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Validation accuracy after hyperparameter tuning: 0.86


240 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/yuewenyyy/Applied Machine Learning/Homework1/myenv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/yuewenyyy/Applied Machine Learning/Homework1/myenv/lib/python3.9/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/yuewenyyy/Applied Machine Learning/Homework1/myenv/lib/python3.9/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/yuewenyyy/

Test

In [105]:
# pre-process the test data 
test_data['Age'].fillna(test_data['Age'].median(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)
test_data['Sex'] = label_sex.transform(test_data['Sex'])  # Use the label encoder from train data
test_data = pd.get_dummies(test_data, columns=['Embarked'], drop_first=True)


X_test = test_data[features]
test_predictions = logreg_optimized.predict(X_test)

# submission file
submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': test_predictions})
submission.to_csv('submission.csv', index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Age'].fillna(test_data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)
