In [1]:
import numpy as np
import pandas as pd

In [2]:
dataset = 'test'

In [3]:
data = pd.read_csv('data/' + dataset + '.csv')
data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Remove unnecessary features

### Name

In [4]:
# Names are probably not useful for predicting the survival of a person so we'll remove that column
del data['Name']
data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,male,34.5,0,0,330911,7.8292,,Q
1,893,3,female,47.0,1,0,363272,7.0,,S
2,894,2,male,62.0,0,0,240276,9.6875,,Q
3,895,3,male,27.0,0,0,315154,8.6625,,S
4,896,3,female,22.0,1,1,3101298,12.2875,,S


### Ticket

In [5]:
# Ticket feature seems somewhat inconsistent and is probably not too important so we can go ahead and remove it as well
del data['Ticket']
data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892,3,male,34.5,0,0,7.8292,,Q
1,893,3,female,47.0,1,0,7.0,,S
2,894,2,male,62.0,0,0,9.6875,,Q
3,895,3,male,27.0,0,0,8.6625,,S
4,896,3,female,22.0,1,1,12.2875,,S


## Checking for missing values

In [6]:
# Age, Cabin, Embarked, and Fare have missing values
data.isnull().sum()

PassengerId      0
Pclass           0
Sex              0
Age             86
SibSp            0
Parch            0
Fare             1
Cabin          327
Embarked         0
dtype: int64

### Fare

In [7]:
# Find missing values for fare
fare_null = data['Fare'].isnull()
data[fare_null]

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
152,1044,3,male,60.5,0,0,,,S


In [8]:
# Replace missing values for fare with the mean of all fares
data['Fare'] = data['Fare'].replace(np.NaN, data['Fare'].mean())

### Embarked

In [9]:
# Find missing values for embarked
embarked_null = data['Embarked'].isnull()
data[embarked_null]

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked


In [10]:
# Since it's categorical, we can use the most frequent embarkation place (S) to fill the values
# (alternatively, we could have decided to just remove these two rows)
data['Embarked'].value_counts()

S    270
C    102
Q     46
Name: Embarked, dtype: int64

In [11]:
# S (Southampton) is the most frequent
data['Embarked'] = data['Embarked'].fillna('S')
data[embarked_null]

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked


### Cabin

In [12]:
# There are too many missing values (687 out of 891), it's better to just ignore this feature and remove the column
del data['Cabin']
data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


### Age

In [13]:
# Since age is a continuous feature, we'll use linear regression to try to predict the age of missing ages using the non-missing ones
# Taken from: https://towardsdatascience.com/7-ways-to-handle-missing-values-in-machine-learning-1a6326adf79e
from sklearn.linear_model import LinearRegression

if dataset == "train":
    train_data = data[["Survived", "Pclass", "Sex", "SibSp", "Parch", "Fare", "Age"]]
else:
    train_data = data[["Pclass", "Sex", "SibSp", "Parch", "Fare", "Age"]]

train_data["Sex"] = [1 if x == "male" else 0 for x in data["Sex"]]

test_data = train_data[train_data["Age"].isnull()]
train_data.dropna(inplace=True)

y_train = train_data["Age"]
X_train = train_data.drop("Age", axis=1)
X_test = test_data.drop("Age", axis=1)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data["Sex"] = [1 if x == "male" else 0 for x in data["Sex"]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.dropna(inplace=True)


In [14]:
# Not the best score, but we'll use linear regression at least for now
model.score(X_train, y_train)

0.2628667579150338

In [15]:
# Some of the predicted ages are negative values
neg_pred = y_pred <= 0
y_pred[neg_pred]

array([], dtype=float64)

In [16]:
# We could replace the negative values zero values
y_pred[neg_pred] = 0
y_pred[neg_pred]

array([], dtype=float64)

In [17]:
# Before where some age values are null
null_age = data['Age'].isnull()
data[null_age].head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
10,902,3,male,,0,0,7.8958,S
22,914,1,female,,0,0,31.6833,S
29,921,3,male,,2,0,21.6792,C
33,925,3,female,,1,2,23.45,S
36,928,3,female,,0,0,8.05,S


In [18]:
# After filling age null values with linear regression predictions
data.loc[null_age, 'Age'] = y_pred
data[null_age].head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
10,902,3,male,24.913077,0,0,7.8958,S
22,914,1,female,38.196912,0,0,31.6833,S
29,921,3,male,22.473566,2,0,21.6792,C
33,925,3,female,20.351254,1,2,23.45,S
36,928,3,female,23.182586,0,0,8.05,S


In [19]:
# Check if there are any null values left
data.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

## Convert categorical values to numerical

In [20]:
# Numerical: Pclass, Age, SibSp, Parch, and Fare
# Categorical: Sex, and Embarked
data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


### Sex

In [21]:
# Since it's binary, we can just replace female with 0 and male with 1
data["Sex"] = [1 if x == "male" else 0 for x in data["Sex"]]
data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34.5,0,0,7.8292,Q
1,893,3,0,47.0,1,0,7.0,S
2,894,2,1,62.0,0,0,9.6875,Q
3,895,3,1,27.0,0,0,8.6625,S
4,896,3,0,22.0,1,1,12.2875,S


### Embarked

In [22]:
# Since this feature is multicategorical, we need to use one-hot encoding
data = pd.get_dummies(data, columns=['Embarked'])
data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,892,3,1,34.5,0,0,7.8292,0,1,0
1,893,3,0,47.0,1,0,7.0,0,0,1
2,894,2,1,62.0,0,0,9.6875,0,1,0
3,895,3,1,27.0,0,0,8.6625,0,0,1
4,896,3,0,22.0,1,1,12.2875,0,0,1


## Save new preprocessed dataset

In [23]:
data

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,892,3,1,34.500000,0,0,7.8292,0,1,0
1,893,3,0,47.000000,1,0,7.0000,0,0,1
2,894,2,1,62.000000,0,0,9.6875,0,1,0
3,895,3,1,27.000000,0,0,8.6625,0,0,1
4,896,3,0,22.000000,1,1,12.2875,0,0,1
...,...,...,...,...,...,...,...,...,...,...
413,1305,3,1,24.917455,0,0,8.0500,0,0,1
414,1306,1,0,39.000000,0,0,108.9000,1,0,0
415,1307,3,1,38.500000,0,0,7.2500,0,0,1
416,1308,3,1,24.917455,0,0,8.0500,0,0,1


In [24]:
# Save new preprocessed training dataset
data.to_csv('data/preprocessed_' + dataset + '.csv', index=False)