### Import Packages

In [9]:
import math
import pandas as pd
import numpy as np
import requests

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import xgboost
from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error

import lightgbm
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score

### Import Training Data

In [10]:
train_data = pd.read_csv("C:\\Users\\13996\\PycharmProjects\\AMS518\\Titanic\\train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Import Testing Data

In [11]:
test_data = pd.read_csv("C:\\Users\\13996\\PycharmProjects\\AMS518\\Titanic\\test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## ****Data Cleaning****

### Dealing with Missing Data

##### Count how many missing data in the training data set

In [12]:
train_miss = train_data.isnull().sum()
train_miss

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

##### Count how many missing data in the testing data set

In [13]:
test_miss = test_data.isnull().sum()
test_miss

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

###### We don't drop the missing data, instead we replace all NA's the value that comes directly after it in the same column. Then replace all the remaining na's with their specific value. 

###### For training Data set: Age -> Median, Fare -> drop it (only 1), Cabin -> Mode, Embarked -> drop it (only 2)

###### For testing Data set: Age -> Median, Fare -> drop it (only 1), Cabin -> Mode


### Filling in Missing Data in Age and Cabin

In [14]:
train_data['Age'].fillna(train_data['Age'].median(),inplace=True)
train_data['Cabin'].fillna(train_data['Cabin'].mode()[0],inplace=True)
train_data['Embarked'].fillna(method='bfill', axis=0, inplace=True)

test_data['Age'].fillna(test_data['Age'].median(),inplace=True)
test_data['Cabin'].fillna(test_data['Cabin'].mode()[0],inplace=True)
test_data['Fare'].fillna(method='bfill', axis=0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Age'].fillna(train_data['Age'].median(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Cabin'].fillna(train_data['Cabin'].mode()[0],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interme

### Checking Missing Data after Filling 

In [15]:
train_missfilled = train_data.isnull().sum()
train_missfilled

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [16]:
test_missfilled = test_data.isnull().sum()
test_missfilled

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

### Checking Missing Data after droping Nah

In [17]:
train_missna = train_data.isnull().sum()
train_missna

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [18]:
test_missna = test_data.isnull().sum()
test_missna

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

### Checking the Data Statistcis and the Shape

In [19]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [20]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        418 non-null    object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [21]:
#Convert Sex to numerical 
le = LabelEncoder()
train_data['Sex'] = le.fit_transform(train_data['Sex'])
train_data['Embarked'] = le.fit_transform(train_data['Embarked'])
train_data['Cabin'] = le.fit_transform(train_data['Cabin'])
train_data['Age'] = train_data['Age'].astype(np.int64)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int32  
 5   Age          891 non-null    int64  
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    int32  
 11  Embarked     891 non-null    int32  
dtypes: float64(1), int32(3), int64(6), object(2)
memory usage: 73.2+ KB


In [22]:
feature_names = ["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Cabin", "Embarked"]
train_data = pd.get_dummies(train_data[feature_names])
train_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
0,0,3,1,22,1,0,47,2
1,1,1,0,38,1,0,81,0
2,1,3,0,26,0,0,47,2
3,1,1,0,35,1,0,55,2
4,0,3,1,35,0,0,47,2
...,...,...,...,...,...,...,...,...
886,0,2,1,27,0,0,47,2
887,1,1,0,19,0,0,30,2
888,0,3,0,28,1,2,47,2
889,1,1,1,26,0,0,60,0


### We use training set to do Cross-validation 4-folds

In [23]:
kf = KFold(n_splits=4, shuffle=True, random_state=42) #break up tp 4 folds
subsets = []
for train_index, test_index in kf.split(train_data):
    subsets.append(train_data.iloc[test_index].reset_index(drop=True)) #assign each folds
subset_1 = subsets[0]
subset_2 = subsets[1]
subset_3 = subsets[2]
subset_4 = subsets[3]

In [24]:
# Set 1
X_in_1 = pd.concat([subset_1, subset_2, subset_3], axis=0) 
X_out_1 = subset_4


# Set 2
X_in_2 = pd.concat([subset_2, subset_3, subset_4], axis=0)
X_out_2 = subset_1


# Set 3 
X_in_3 = pd.concat([subset_1, subset_3, subset_4], axis=0)
X_out_3 = subset_2


# Set 4
X_in_4 = pd.concat([subset_1, subset_2, subset_4], axis=0)
X_out_4 = subset_3


## Preparing x_train, y_train

In [25]:
#set 1
x_train1 = X_in_1.drop(['Survived'], axis=1)
y_train1 = X_in_1['Survived']

x_test1 = X_out_1.drop(['Survived'], axis=1)
y_test1 = X_out_1['Survived']


#set 2
x_train2 = X_in_2.drop(['Survived'], axis=1)
y_train2 = X_in_2['Survived']

x_test2 = X_out_2.drop(['Survived'], axis=1)
y_test2 = X_out_2['Survived']


#set 3
x_train3 = X_in_3.drop(['Survived'], axis=1)
y_train3 = X_in_3['Survived']

x_test3 = X_out_3.drop(['Survived'], axis=1)
y_test3 = X_out_3['Survived']


#set 4
x_train4 = X_in_4.drop(['Survived'], axis=1)
y_train4 = X_in_4['Survived']

x_test4 = X_out_4.drop(['Survived'], axis=1)
y_test4 = X_out_4['Survived']


### Method 1: Logistic Regression

In [38]:
lr_in_acc = []
lr_out_acc = []

for i in range(1,5):
    x_train = eval(f'x_train{i}')
    y_train = eval(f'y_train{i}')
    x_test = eval(f'x_test{i}')
    y_test = eval(f'y_test{i}')

    lr = LogisticRegression(max_iter=10000, random_state=0, n_jobs=4)
    lr.fit(x_train, y_train)

    y_fit = lg.predict(x_train)
    y_pred = lg.predict(x_test)

    acc_train = accuracy_score(y_train, y_fit)
    acc_test = accuracy_score(y_test, y_pred)

    lr_in_acc.append(acc_train)
    lr_out_acc.append(acc_test)
    
    print(f"=== Dataset {i} ===")
    print("Logistic Regression Train Accuracy:", acc_train)
    print("Logistic Regression Accuracy:", acc_test)
    print()
lr_in_avg = (lg_in_acc[0] + lr_in_acc[1] + lr_in_acc[2] + lr_in_acc[3]) / 4
lr_out_avg = (lg_out_acc[0] + lr_out_acc[1] + lr_out_acc[2] + lr_out_acc[3] ) / 4
print("Average Logistic Regression Train Accuracy: " + str(lr_in_avg))
print("Average Logistic Regression Test Accuracy: " + str(lr_out_avg))

=== Dataset 1 ===
Logistic Regression Train Accuracy: 0.8834080717488789
Logistic Regression Accuracy: 0.9099099099099099

=== Dataset 2 ===
Logistic Regression Train Accuracy: 0.8877245508982036
Logistic Regression Accuracy: 0.8968609865470852

=== Dataset 3 ===
Logistic Regression Train Accuracy: 0.8802395209580839
Logistic Regression Accuracy: 0.9192825112107623

=== Dataset 4 ===
Logistic Regression Train Accuracy: 0.9086826347305389
Logistic Regression Accuracy: 0.8340807174887892

Average Logistic Regression Train Accuracy: 0.8963664599053015
Average Logistic Regression Test Accuracy: 0.8663848826404881


### Method 2: Random Forest

In [27]:
rf_in_acc = []
rf_out_acc = []
for i in range(1, 5): 
    x_train = eval(f'x_train{i}')
    y_train = eval(f'y_train{i}')
    x_test = eval(f'x_test{i}')
    y_test = eval(f'y_test{i}')

    rf = RandomForestClassifier(n_estimators=1000, random_state=42,max_depth=5) #build 1000 decision trees
    rf.fit(x_train, y_train)

    y_fit = rf.predict(x_train)
    y_pred = rf.predict(x_test)

    acc_train = accuracy_score(y_train, y_fit)
    acc_test = accuracy_score(y_test, y_pred)
    
    rf_in_acc.append(acc_train)
    rf_out_acc.append(acc_test)

    print(f"=== Dataset {i} ===")
    print("Random Forest Train Accuracy:", acc_train)
    print("Random Forest Test Accuracy:", acc_test)
    print()

rf_in_avg = (rf_in_acc[0] + rf_in_acc[1] + rf_in_acc[2] + rf_in_acc[3]) / 4
rf_out_avg = (rf_out_acc[0] + rf_out_acc[1] + rf_out_acc[2] + rf_out_acc[3] ) / 4
print("Average Random Forest Train Accuracy: " + str(rf_in_avg))
print("Average Random Forest Test Accuracy: " + str(rf_out_avg))

=== Dataset 1 ===
Random Forest Train Accuracy: 0.8684603886397608
Random Forest Test Accuracy: 0.8153153153153153

=== Dataset 2 ===
Random Forest Train Accuracy: 0.8607784431137725
Random Forest Test Accuracy: 0.8026905829596412

=== Dataset 3 ===
Random Forest Train Accuracy: 0.8682634730538922
Random Forest Test Accuracy: 0.8116591928251121

=== Dataset 4 ===
Random Forest Train Accuracy: 0.8577844311377245
Random Forest Test Accuracy: 0.8161434977578476

Average Random Forest Train Accuracy: 0.8638216839862876
Average Random Forest Test Accuracy: 0.8114521472144791


### Method 3: XGboost

In [28]:
xg_in_acc = []
xg_out_acc = []
for i in range(1, 5):  
    x_train = eval(f'x_train{i}')
    y_train = eval(f'y_train{i}')
    x_test = eval(f'x_test{i}')
    y_test = eval(f'y_test{i}')
    
    xg = XGBClassifier(n_estimators=1000, learning_rate=0.01)
    xg.fit(x_train, y_train)

    y_fit = xg.predict(x_train)
    y_pred = xg.predict(x_test)

    acc_train = accuracy_score(y_train, y_fit)
    acc_test = accuracy_score(y_test, y_pred)

    xg_in_acc.append(acc_train)
    xg_out_acc.append(acc_test)
    
    print(f"=== Dataset {i} ===")
    print("XGboost Train Accuracy:", acc_train)
    print("XGboost Test Accuracy:", acc_test)
    print()
xg_in_avg = (xg_in_acc[0] + xg_in_acc[1] + xg_in_acc[2] + xg_in_acc[3]) / 4
xg_out_avg = (xg_out_acc[0] + xg_out_acc[1] + xg_out_acc[2] + xg_out_acc[3] ) / 4
print("Average XGboost Train Accuracy: " + str(xg_in_avg))
print("Average XGboost Test Accuracy: " + str(xg_out_avg))

=== Dataset 1 ===
XGboost Train Accuracy: 0.898355754857997
XGboost Test Accuracy: 0.8288288288288288

=== Dataset 2 ===
XGboost Train Accuracy: 0.9146706586826348
XGboost Test Accuracy: 0.8071748878923767

=== Dataset 3 ===
XGboost Train Accuracy: 0.9116766467065869
XGboost Test Accuracy: 0.820627802690583

=== Dataset 4 ===
XGboost Train Accuracy: 0.907185628742515
XGboost Test Accuracy: 0.8295964125560538

Average XGboost Train Accuracy: 0.9079721722474334
Average XGboost Test Accuracy: 0.8215569829919606


### Method 4: LightGBM

In [39]:
lg_in_acc = []
lg_out_acc = []
for i in range(1, 5):  
    x_train = eval(f'x_train{i}')
    y_train = eval(f'y_train{i}')
    x_test = eval(f'x_test{i}')
    y_test = eval(f'y_test{i}')
    
    lg = LGBMClassifier()
    lg.fit(x_train, y_train)

    y_fit = lg.predict(x_train)
    y_pred = lg.predict(x_test)

    acc_train = accuracy_score(y_train, y_fit)
    acc_test = accuracy_score(y_test, y_pred)

    lg_in_acc.append(acc_train)
    lg_out_acc.append(acc_test)
    
    print(f"=== Dataset {i} ===")
    print("LightGBM Train Accuracy:", acc_train)
    print("LightGBM Test Accuracy:", acc_test)
    print()
lg_in_avg = (lg_in_acc[0] + lg_in_acc[1] + lg_in_acc[2] + lg_in_acc[3]) / 4
lg_out_avg = (lg_out_acc[0] + lg_out_acc[1] + lg_out_acc[2] + lg_out_acc[3] ) / 4
print("Average LightGBM Train Accuracy: " + str(lg_in_avg))
print("Average LightGBM Test Accuracy: " + str(lg_out_avg))

[LightGBM] [Info] Number of positive: 260, number of negative: 409
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000101 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 125
[LightGBM] [Info] Number of data points in the train set: 669, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.388640 -> initscore=-0.453034
[LightGBM] [Info] Start training from score -0.453034
=== Dataset 1 ===
LightGBM Train Accuracy: 0.9088191330343797
LightGBM Test Accuracy: 0.8153153153153153

[LightGBM] [Info] Number of positive: 253, number of negative: 415
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000098 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 126
[LightGBM] [Info] 

In [40]:
print("Average Logistic Regression Train Accuracy: " + str(lg_in_avg))
print("Average Logistic Regression Test Accuracy: " + str(lg_out_avg))

print("Average Random Forest Train Accuracy: " + str(rf_in_avg))
print("Average Random Forest Test Accuracy: " + str(rf_out_avg))

print("Average XGboost Train Accuracy: " + str(xg_in_avg))
print("Average XGboost Test Accuracy: " + str(xg_out_avg))

print("Average LightGBM Train Accuracy: " + str(lg_in_avg))
print("Average LightGBM Test Accuracy: " + str(lg_out_avg))

Average Logistic Regression Train Accuracy: 0.9113365197855411
Average Logistic Regression Test Accuracy: 0.8136942996808467
Average Random Forest Train Accuracy: 0.8638216839862876
Average Random Forest Test Accuracy: 0.8114521472144791
Average XGboost Train Accuracy: 0.9079721722474334
Average XGboost Test Accuracy: 0.8215569829919606
Average LightGBM Train Accuracy: 0.9113365197855411
Average LightGBM Test Accuracy: 0.8136942996808467


### Testing the Result by using the Testing Data

In [34]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,B57 B59 B63 B66,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,B57 B59 B63 B66,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,B57 B59 B63 B66,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,B57 B59 B63 B66,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,B57 B59 B63 B66,S


In [41]:
#Convert to Numerical
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Cabin", "Embarked"]
test_data['Sex'] = le.fit_transform(test_data['Sex'])
test_data['Embarked'] = le.fit_transform(test_data['Embarked'])
test_data['Cabin'] = le.fit_transform(test_data['Cabin'])
test_data['Age'] = test_data['Age'].astype(np.int64)
testing_data =  pd.get_dummies(test_data[features])
testing_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
0,3,1,34,0,0,15,1
1,3,0,47,1,0,15,2
2,2,1,62,0,0,15,1
3,3,1,27,0,0,15,2
4,3,0,22,1,1,15,2
...,...,...,...,...,...,...,...
413,3,1,27,0,0,15,2
414,1,0,39,0,0,22,0
415,3,1,38,0,0,15,2
416,3,1,27,0,0,15,2


### Result by using Method 1: Logistics Regression

In [43]:
lrpred = lr.predict(testing_data)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': lrpred})
output.to_csv('lrsubmission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


### Result by using Method 2: Random Forest

In [205]:
rfpred = rf.predict(testing_data)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': rfpred})
output.to_csv('rfsubmission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


### Result by using Method 3: XGboost

In [206]:
xgpred = rf.predict(testing_data)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': xgpred})
output.to_csv('xgsubmission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


### Result by using Method 4: LightGBM

In [44]:
lgpred = rf.predict(testing_data)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': lgpred})
output.to_csv('lgsubmission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


## Accuracy result: Logestic Regression: 0.76794; All other three methods: 0.77751