In [24]:
import pandas as pd
import seaborn as sns
from scipy import stats
import numpy as np
import imblearn
import statsmodels.api as sm
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model 

In [3]:
# Load the Titanic train / test splits from the working directory.
train = pd.read_csv('train.csv')
test_x = pd.read_csv('test.csv')

# Rows without a port of embarkation are dropped rather than imputed.
train = train.dropna(subset=['Embarked'])

# Separate the features from the target column.
train_x = train.drop(columns='Survived')
train_y = train['Survived']

# Fill every missing test fare with the mean fare instead of patching the
# single hard-coded row label 152 (Series.mean() skips NaN by default), so the
# cell keeps working if the file is re-ordered or has other gaps.
test_x['Fare'] = test_x['Fare'].fillna(test_x['Fare'].mean())

# Data exploration

| Variable | Definition | Key |
| --- | --- | --- |
| Survival | Did the person survive? | 0 = No, 1 = Yes|
| Pclass | Ticket class | 1 = 1st Class, 2 = Middle Class, 3 = Lower Class |
| Sex | Gender of the person | Male or Female |
| SibSp | Number of siblings / spouses on Titanic | Numeric |
| Parch | Number of parents / children on Titanic | Numeric |
| Ticket | Ticket number | Alpha-numeric |
| Fare | Ticket fare | Numeric |
| Cabin | Cabin number | Alpha-numeric |
| Embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |

In [171]:
train_x.head()
# "PassengerId" and "Ticket" look like per-passenger identifiers rather than features, so it makes sense to drop them.

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [172]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
train_x.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [173]:
train_x.info()

# Cabin is filled in for only a small share of the rows, so it carries very little information; it is dropped right away.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


# Attempt 1 with Logistic Regression: Accuracy 0.76794

Attempt details

In [418]:
# Work on a copy so the Attempt 1 feature engineering does not touch train_x.
a1_train = train_x.copy()

In [419]:
# PassengerId and Ticket are unique identifiers and add no predictive power,
# Cabin is mostly missing, and Name is simply left out of this first attempt.
a1_train = a1_train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis='columns')

In [238]:
# One 0/1 indicator column per embarkation port. A missing Embarked value
# matches none of the ports, so it automatically becomes 0 in all three.
for port in ('S', 'C', 'Q'):
    a1_train['Embarked' + port] = np.where(a1_train['Embarked'] == port, 1, 0)

In [239]:
# 0/1 indicator columns for each value of Sex.
for sex_label in ('male', 'female'):
    a1_train[sex_label] = np.where(a1_train['Sex'] == sex_label, 1, 0)

In [240]:
# 0/1 indicator columns for ticket class: 1 = Upper, 2 = Middle, 3 = Lower.
for class_label, class_code in (('Upper', 1), ('Middle', 2), ('Lower', 3)):
    a1_train['Pclass_' + class_label] = np.where(a1_train['Pclass'] == class_code, 1, 0)

In [241]:
# Sex, Embarked and Pclass are now covered by their dummy columns.
# Fare looks closely related to Pclass, so it is left out for now as well.
a1_train = a1_train.drop(columns=['Sex', 'Embarked', 'Fare', 'Pclass'])

In [242]:
# Check the encoded frame: only Age, SibSp, Parch and the dummy columns remain.
a1_train.head()

Unnamed: 0,Age,SibSp,Parch,EmbarkedS,EmbarkedC,EmbarkedQ,male,female,Pclass_Upper,Pclass_Middle,Pclass_Lower
0,22.0,1,0,1,0,0,1,0,0,0,1
1,38.0,1,0,0,1,0,0,1,1,0,0
2,26.0,0,0,1,0,0,0,1,0,0,1
3,35.0,1,0,1,0,0,0,1,1,0,0
4,35.0,0,0,1,0,0,1,0,0,0,1


In [243]:
a1_train.info()

# Age is the only column with NaN values. The first idea was to just drop those rows; the following cells impute them with a regression instead.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Age            714 non-null    float64
 1   SibSp          891 non-null    int64  
 2   Parch          891 non-null    int64  
 3   EmbarkedS      891 non-null    int32  
 4   EmbarkedC      891 non-null    int32  
 5   EmbarkedQ      891 non-null    int32  
 6   male           891 non-null    int32  
 7   female         891 non-null    int32  
 8   Pclass_Upper   891 non-null    int32  
 9   Pclass_Middle  891 non-null    int32  
 10  Pclass_Lower   891 non-null    int32  
dtypes: float64(1), int32(8), int64(2)
memory usage: 48.9 KB


In [244]:
# The rows with a missing Age cannot simply be dropped, so Age is predicted
# with a linear regression that uses every other remaining column.
from sklearn import linear_model

a1_linreg = a1_train.copy()
age_missing = a1_linreg['Age'].isna()

# Rows with a known Age are the regression training set.
linreg_train = a1_linreg.loc[~age_missing, :]
linreg_train_x = linreg_train.drop(columns='Age')
linreg_train_y = linreg_train['Age']

# Rows without an Age are the inputs whose Age will be predicted.
linreg_test_x = a1_linreg.loc[age_missing, :].drop(columns='Age')

In [245]:
# Fit Age on the rows where it is known, then predict it for the others.
regr = linear_model.LinearRegression()
regr.fit(linreg_train_x, linreg_train_y)

# Negative predictions are floored at 0 and every age is rounded to a whole year.
age_pred = [round(max(0, raw_age), 0) for raw_age in regr.predict(linreg_test_x)]

In [246]:
# New frame: the rows that lacked an Age, plus the predicted Age as last column.
linreg_test = linreg_test_x.assign(Age=age_pred)

In [247]:
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the known-age
# rows on top of the imputed rows, keeps the original index labels and orders
# the columns as in linreg_train.
testframe = pd.concat([linreg_train, linreg_test])

In [38]:
## Function to clean data

def a1_clean(d):
    """Turn a raw passenger frame into the Attempt 1 model features.

    Steps, in order:

    1. Drop ``Name``, ``Ticket`` and ``Cabin``.
    2. Encode ``Embarked`` as ``EmbarkedS`` / ``EmbarkedC`` (Q and missing are
       the all-zero baseline), ``Sex`` as 1 for ``'male'`` and 0 otherwise,
       and ``Pclass`` as ``Pclass_Upper`` / ``Pclass_Middle`` (class 3 is the
       baseline).
    3. Fill missing ``Age`` values with a linear regression on all other
       remaining columns, floored at 0 and rounded to whole years.
    4. Standardise ``Fare`` with ``StandardScaler``.
    5. Sort the rows by ``PassengerId``.

    Parameters
    ----------
    d : pandas.DataFrame
        Must contain PassengerId, Pclass, Name, Sex, Age, SibSp, Parch,
        Ticket, Fare, Cabin and Embarked. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns PassengerId, Sex, Age, SibSp, Parch, Fare, EmbarkedS,
        EmbarkedC, Pclass_Upper, Pclass_Middle; original index labels,
        ordered by PassengerId.

    Notes
    -----
    The age regression and the fare scaler are fitted on whatever frame is
    passed in, so calling this separately on train and test data fits them
    separately for each.
    """
    data = d.drop(columns=['Name', 'Ticket', 'Cabin'])

    # Dummy encoding with one level of each categorical left out as baseline.
    data['EmbarkedS'] = np.where(data['Embarked'] == 'S', 1, 0)
    data['EmbarkedC'] = np.where(data['Embarked'] == 'C', 1, 0)
    data['Sex'] = np.where(data['Sex'] == 'male', 1, 0)
    data['Pclass_Upper'] = np.where(data['Pclass'] == 1, 1, 0)
    data['Pclass_Middle'] = np.where(data['Pclass'] == 2, 1, 0)
    data = data.drop(columns=['Embarked', 'Pclass'])

    # Fill NaN ages with a regression. The guard skips the model entirely when
    # nothing is missing, so predict() is never called on zero rows.
    age_missing = data['Age'].isna()
    if age_missing.any():
        # NOTE(review): PassengerId is still among the regressors here, as in
        # the original implementation; it is kept so results do not change.
        regressors = data.drop(columns='Age')
        regr = linear_model.LinearRegression()
        regr.fit(regressors.loc[~age_missing], data.loc[~age_missing, 'Age'])
        age_pred = regr.predict(regressors.loc[age_missing])
        # Writing the predictions back in place replaces the former
        # DataFrame.append call, which was removed in pandas 2.0.
        data.loc[age_missing, 'Age'] = np.round(np.clip(age_pred, 0, None), 0)

    # StandardScaler works on 2-D input: reshape to a column, flatten the result.
    fare = data['Fare'].to_numpy(dtype=float).reshape(-1, 1)
    data['Fare'] = StandardScaler().fit_transform(fare).ravel()

    return data.sort_values('PassengerId')

In [39]:
# Run the Attempt 1 cleaning over each split independently.
train_dataset_a1, test_dataset_a1 = (a1_clean(split) for split in (train_x, test_x))

In [40]:
# Inspect the cleaned test split.
test_dataset_a1

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Fare,EmbarkedS,EmbarkedC,Pclass_Upper,Pclass_Middle
0,892,1,34.5,0,0,-0.498407,0,0,0,0
1,893,0,47.0,1,0,-0.513274,1,0,0,0
2,894,1,62.0,0,0,-0.465088,0,0,0,1
3,895,1,27.0,0,0,-0.483466,1,0,0,0
4,896,0,22.0,1,1,-0.418471,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
413,1305,1,24.0,0,0,-0.494448,1,0,0,0
414,1306,0,39.0,0,0,1.313753,0,1,1,0
415,1307,1,38.5,0,0,-0.508792,1,0,0,0
416,1308,1,24.0,0,0,-0.494448,1,0,0,0


In [41]:
# PassengerId is dropped from both splits before modelling; the labels are
# train_y unchanged.
a1_train_x = train_dataset_a1.drop('PassengerId', axis='columns')
a1_test_x = test_dataset_a1.drop('PassengerId', axis='columns')
a1_train_y = train_y

In [42]:
# Training features with the Survived label attached (aligned on the index).
testframe = a1_train_x.assign(Survived=a1_train_y)

In [45]:
# Fit on the cleaned training features and predict the cleaned test rows.
logreg = LogisticRegression(max_iter=500)
logreg.fit(a1_train_x, a1_train_y)
y_pred = logreg.predict(a1_test_x)

# Take the ids from test_dataset_a1, the frame a1_test_x was derived from.
# a1_clean sorts its output by PassengerId, so pairing predictions with
# test_x['PassengerId'] (file order) would mislabel rows whenever test.csv is
# not already sorted by id.
logreg_result = pd.DataFrame({
    'PassengerId': test_dataset_a1['PassengerId'].to_numpy(),
    'Survived': y_pred,
})

In [44]:
# Write the PassengerId / Survived pairs to a CSV file, without the index column.
logreg_result.to_csv('attempt1_logreg.csv', index = False)

In [361]:
# Accuracy on Kaggle : 0.76794
# To try: Include Pclass_Lower, EmbarkedQ.

########### End of Attempt 1 ####################