In [1]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## 1. Load data and explore

In [2]:
tt = pd.read_csv('train.csv')

In [3]:
len(tt)
len(tt.columns)

891

12

In [4]:
tt.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
tt['Cabin']

0       NaN
1       C85
2       NaN
3      C123
4       NaN
       ... 
886     NaN
887     B42
888     NaN
889    C148
890     NaN
Name: Cabin, Length: 891, dtype: object

In [6]:
tt.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [7]:
tt.groupby('Sex')['Survived'].sum()
tt.groupby('Sex')['Survived'].sum() / tt.groupby('Sex')['Survived'].count() * 100

Sex
female    233
male      109
Name: Survived, dtype: int64

Sex
female    74.203822
male      18.890815
Name: Survived, dtype: float64

In [8]:
tt.groupby('Pclass')['Survived'].sum()
tt.groupby('Pclass')['Survived'].sum() / tt.groupby('Pclass')['Survived'].count() * 100

Pclass
1    136
2     87
3    119
Name: Survived, dtype: int64

Pclass
1    62.962963
2    47.282609
3    24.236253
Name: Survived, dtype: float64

In [9]:
tt.groupby('Survived')['Fare'].mean()
tt.groupby('Survived')['Fare'].median()

Survived
0    22.117887
1    48.395408
Name: Fare, dtype: float64

Survived
0    10.5
1    26.0
Name: Fare, dtype: float64

In [10]:
tt.groupby('Embarked')['Survived'].sum()
tt.groupby('Embarked')['Survived'].sum() / tt.groupby('Embarked')['Survived'].count() * 100

Embarked
C     93
Q     30
S    217
Name: Survived, dtype: int64

Embarked
C    55.357143
Q    38.961039
S    33.695652
Name: Survived, dtype: float64

## 2. Pre-process

In [11]:
from fancyimpute import IterativeImputer

def process(df, drop_feature):
    df_feature = df.drop(drop_feature, axis=1)
    df_dummy = pd.get_dummies(df_feature, columns=['Sex', 'Embarked'], drop_first=True)
    imputer = IterativeImputer()
    df_processed = df_dummy.copy()
    df_processed['Age'] = imputer.fit_transform(df_processed[['Age']])
    # Round the imputed values to whole numbers if necessary
    df_processed['Age'] = np.round(df_processed['Age'])

    # Convert 'Age' column back to integer type
    df_processed['Age'] = df_processed['Age'].astype(int)
    
    return df_processed

In [12]:
tt_process = tt.drop(['Ticket','Cabin','Name'], axis=1)

# fill 'Embarked' with the model because it only has 2 missing values

df_filled = tt_process.copy()
df_filled['Embarked'].fillna(df_filled['Embarked'].mode().iloc[0], inplace=True)


# Dummy Variable

df_dummy = pd.get_dummies(df_filled, columns=['Sex','Embarked'], drop_first=True)


# Stochastic Regression Imputation for 'Age'

from fancyimpute import IterativeImputer

# Create an IterativeImputer instance
imputer = IterativeImputer()
df = df_dummy.copy()
df['Age'] = imputer.fit_transform(df[['Age']])

# Round the imputed values to whole numbers if necessary
df['Age'] = np.round(df['Age'])

# Convert 'Age' column back to integer type
df['Age'] = df['Age'].astype(int)

In [13]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,22,1,0,7.2500,1,0,1
1,2,1,1,38,1,0,71.2833,0,0,0
2,3,1,3,26,0,0,7.9250,0,0,1
3,4,1,1,35,1,0,53.1000,0,0,1
4,5,0,3,35,0,0,8.0500,1,0,1
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27,0,0,13.0000,1,0,1
887,888,1,1,19,0,0,30.0000,0,0,1
888,889,0,3,30,1,2,23.4500,0,0,1
889,890,1,1,26,0,0,30.0000,1,0,0


## 3. Modeling

In [14]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,22,1,0,7.25,1,0,1
1,2,1,1,38,1,0,71.2833,0,0,0
2,3,1,3,26,0,0,7.925,0,0,1
3,4,1,1,35,1,0,53.1,0,0,1
4,5,0,3,35,0,0,8.05,1,0,1


In [15]:
# split data

features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S']

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
X_train = train_df[features]
y_train = train_df['Survived']

X_test = test_df[features]
y_test = test_df['Survived']

### 3.1 Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

# Initializing and fitting the logistic regression model
model = LogisticRegression(class_weight='balanced')
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(class_weight='balanced')

In [17]:
y_pred_log = model.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score

# Calculating accuracy
accuracy = accuracy_score(y_test, y_pred_log)
print("Accuracy:", accuracy)

Accuracy: 0.8212290502793296


### 3.2 Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
# Initialize the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

# Fit the classifier to the training data
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

RandomForestClassifier(class_weight='balanced', random_state=42)

Accuracy: 0.8156424581005587


### 3.3 Support vector machine

In [21]:
from sklearn.svm import SVC

# Initialize the Support Vector Classifier
clf = SVC(kernel='linear', C=1.0,  random_state=42)

# Fit the classifier to the training data
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

SVC(kernel='linear', random_state=42)

Accuracy: 0.7821229050279329


### 3.4 XGBoost

In [22]:
from xgboost import XGBClassifier

# Initialize the XGBoost Classifier
clf = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Fit the classifier to the training data
clf.fit(X_train, y_train)

# Predict on the testing data
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:}")


XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, ...)

Accuracy: 0.8268156424581006


### 3.5 Naive Bayes

In [23]:
from sklearn.naive_bayes import GaussianNB

# Initialize the Gaussian Naive Bayes Classifier
clf = GaussianNB()

# Fit the classifier to the training data
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


GaussianNB()

Accuracy: 0.770949720670391


### 3.6 K-nearest neighbers 

In [24]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize the KNN Classifier with k=5 (you can change k as needed)
clf = KNeighborsClassifier(n_neighbors=7)

# Fit the classifier to the training data
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


KNeighborsClassifier(n_neighbors=7)

Accuracy: 0.7262569832402235


### 3.7 Deep learning

In [25]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from keras.optimizers import Adam

# Build the deep learning model
model = Sequential()
model.add(Dense(64, input_dim=len(X_train.columns), activation='relu'))
# model.add(Dropout(0.1))  # Add a dropout layer with a dropout rate of 0.5
# model.add(BatchNormalization())  # Add a BatchNormalization layer
model.add(Dense(32, activation='relu'))
# model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='BinaryFocalCrossentropy', optimizer=Adam(learning_rate=1e-3), metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=128, validation_data=(X_test, y_test),verbose=0)

# Evaluate the model on the test data

predict_x=model.predict(X_test) 
# classes_x=np.argmax(predict_x,axis=1)
classes_x=np.round(predict_x)
#y_pred = model.predict_classes(X_test)
accuracy = accuracy_score(y_test, classes_x)
print(f"Accuracy: {accuracy}")


<keras.callbacks.History at 0x7f859de3a860>

Accuracy: 0.8044692737430168


In [112]:
np.unique(classes_x)

array([0., 1.], dtype=float32)

In [147]:
predict_x.shape

(179, 1)

<keras.callbacks.History at 0x7f886fa35550>

Accuracy: 0.5865921787709497


In [153]:
predict_x.shape

(179, 1)

## Testing

In [180]:
test = pd.read_csv('test.csv')

In [166]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [181]:
test_df = process(test, ['Name','Ticket','Cabin'])

In [182]:
test_df

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,892,3,34,0,0,7.8292,1,1,0
1,893,3,47,1,0,7.0000,0,0,1
2,894,2,62,0,0,9.6875,1,1,0
3,895,3,27,0,0,8.6625,1,0,1
4,896,3,22,1,1,12.2875,0,0,1
...,...,...,...,...,...,...,...,...,...
413,1305,3,30,0,0,8.0500,1,0,1
414,1306,1,39,0,0,108.9000,0,0,0
415,1307,3,38,0,0,7.2500,1,0,1
416,1308,3,30,0,0,8.0500,1,0,1


In [183]:
passenger_ID = test_df['PassengerId']

In [184]:
test_1 = test_df.drop('PassengerId',axis=1)

In [196]:
X_test = test_1.copy()

### training on all training data

In [190]:
train_df = df.copy()

In [195]:

X_train = train_df[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S']]
y_train = train_df['Survived']

In [197]:
from xgboost import XGBClassifier

# Initialize the XGBoost Classifier
clf = XGBClassifier(n_estimators=250, learning_rate=0.1, random_state=42)

# Fit the classifier to the training data
clf.fit(X_train, y_train)

# Predict on the testing data
y_pred = clf.predict(X_test)


XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=250,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, ...)

In [204]:
submit = pd.DataFrame({'PassengerId': passenger_ID, 'Survived': y_pred})

In [206]:
submit.to_csv('Titanic_Test.csv', index=False)