In [35]:
%matplotlib inline

import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt

# vectorization
from sklearn.preprocessing import LabelEncoder

# misc
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

In [2]:
# load train and test data

train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

## 1. Structure of the Data

Size of **training examples**: (891,12) <br/>
Size of **test data**: (418,11)

In [None]:
print(f'Train dataset size: {train.shape}')
train.head()

In [None]:
print(f'Test dataset size: {test.shape}')
test.head()

## 2. Missing Values

In [None]:
def missing_values(df, columns):
    """ Finds number of rows where specified columns are missing values.
    
    Args:
        df:
            The dataframe to be analyzed.
        columns:
            The list of columns of the dataframe.
    
    Returns:
         A dictionary with the columns and the number of values they are missing.
    """
    missing = {}
    
    for column in columns:
        total = df[column].value_counts().sum()
        missing[column] = df.shape[0] - total
    
    return missing

In [None]:
missing = missing_values(train, train.columns)
print(f'Total values that each column should have: {train.shape[0]}')
print(f'Total values that each column is missing:')
missing

## 3. Preprocess

### 3.1 PassengerId

The **PassengerId** column has unique values for each row in the trainning examples. Thus, it won't help in classification.

In [3]:
train = train.drop(['PassengerId'], axis=1)

### 3.2 Name

The **Name** column has unique values for each row in the trainning examples. Thus, it won't help in classification. <br/>
However, we can create an new feature from it, called **Title** which will be the title of each person.

In [4]:
train['Title'] = train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
train = train.drop(['Name'], axis=1)

In [24]:
test['Title'] = test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
test = test.drop(['Name'], axis=1)

### 3.3 Age

The **Age** column is missing a lot of values. My intuition is that the age of a person played an important role in his survival and so we'll fill them using the mean of the ages with respect to the persons title. Also, we'll create ranges of the age so it is discretised.

In [5]:
train['Age'] = train['Age'].fillna(train.groupby('Title')['Age'].transform('median'))
train['Age'] = pd.cut(train['Age'], bins=10)

In [25]:
test['Age'] = test['Age'].fillna(test.groupby('Title')['Age'].transform('median'))
test['Age'] = pd.cut(test['Age'], bins=10)

### 3.4 Ticket

In [6]:
print(f'Ticket column has {len(train.Ticket.unique())} different values.')

Ticket column has 681 different values.


The **Ticket** column has a lot of different values. Specifically, 681/891 different values. So, we'll drop this column.

In [7]:
train = train.drop(['Ticket'], axis=1)

In [26]:
test = test.drop(['Ticket'], axis=1)

### 3.5 Fare

In [8]:
print(f'Fare column has {len(train.Fare.unique())} different values.')

Fare column has 248 different values.


The **Fare** column has a some unique values (248/891). We'll not drop this column, but we'll create ranges, so it becomes categorical.

In [9]:
train['Fare'] = pd.cut(train['Fare'], bins=5)

In [27]:
test['Fare'] = pd.cut(test['Fare'], bins=5)

### 3.6 Embarked

The **Embarked** column has 2 missing values. We'll not remove these rows but we'll fill them with the most frequent value.

In [10]:
# get the most frequent value in 'Embarked' column
val = train['Embarked'].value_counts().idxmax()

# replace null values
train['Embarked'] = train['Embarked'].fillna(value=val)

In [28]:
val = test['Embarked'].value_counts().idxmax()
test['Embarked'] = test['Embarked'].fillna(value=val)

### 3.7 Cabin

**Observations:**
* The **Cabin** column has a lot of missing values. We'll only keep the first letter of the cabin and create a new cabin class, namely 'n' for these rows that have missing values in this column. <br/>

**Note:**
* Creating a new cabin class for those rows that didn't have a value in this column didn't work. Maybe, I can take the mean with respect to the ticket class. I'll assume that the higher the class is, the better the cabin class is.

In [None]:
# # keep the first letter from the cabin
# train['Cabin'] = train['Cabin'].str[:1]

# # fill missing data in "Cabin" column using 'n' (we'll assume that n is a cabin class)
# train['Cabin'] = train['Cabin'].fillna(value='n')

In [11]:
train = train.drop(['Cabin'], axis=1)

In [29]:
test = test.drop(['Cabin'], axis=1)

### 3.8 Alone

The **SibSp** column shows the number of siblings/spouses of a person aboard the Titanic. Similarly, **Parch** shows the number of parents/children of a person aboard the Titanic.

My intuition is that if a person wasn't alone on the Titanic he could have help from his family, or alternatively he could have sacrificed himself/herself to help them (lower chances of survival). So, we'll can combine them in a single feature called **Alone**, which takes 0/1 values, and drop **SibSp** and **Parch**.

In [13]:
train['Alone'] = 1
train.loc[(train['SibSp'] >= 1) | (train['Parch'] >= 1), 'Alone'] = 0

# drop SibSp and Parch
train = train.drop(['SibSp', 'Parch'], axis=1)

In [30]:
test['Alone'] = 1
test.loc[(test['SibSp'] >= 1) | (test['Parch'] >= 1), 'Alone'] = 0
test = test.drop(['SibSp', 'Parch'], axis=1)

In [14]:
print('Training examples after preprocessing:')
train.head()

Training examples after preprocessing:


Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Alone
0,0,3,male,"(16.336, 24.294]","(-0.512, 102.466]",S,Mr,0
1,1,1,female,"(32.252, 40.21]","(-0.512, 102.466]",C,Mrs,0
2,1,3,female,"(24.294, 32.252]","(-0.512, 102.466]",S,Miss,1
3,1,1,female,"(32.252, 40.21]","(-0.512, 102.466]",S,Mrs,0
4,0,3,male,"(32.252, 40.21]","(-0.512, 102.466]",S,Mr,1


In [31]:
print('Test data after preprocessing:')
test.head()

Test data after preprocessing:


Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title,Alone
0,892,3,male,"(30.502, 38.085]","(-0.512, 102.466]",Q,Mr,1
1,893,3,female,"(45.668, 53.251]","(-0.512, 102.466]",S,Mrs,0
2,894,2,male,"(60.834, 68.417]","(-0.512, 102.466]",Q,Mr,1
3,895,3,male,"(22.919, 30.502]","(-0.512, 102.466]",S,Mr,1
4,896,3,female,"(15.336, 22.919]","(-0.512, 102.466]",S,Mrs,0


## 4. Vectorize

We'll try categorical encoding vectors.

In [15]:
train_enc_cat = train.apply(LabelEncoder().fit_transform)

In [16]:
print(f'Categorical encoding vector size: {train_enc_cat.shape}')
train_enc_cat.head()

Categorical encoding vector size: (891, 8)


Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Alone
0,0,2,1,2,0,2,12,0
1,1,0,0,4,0,0,13,0
2,1,2,0,3,0,2,9,1
3,1,0,0,4,0,2,13,0
4,0,2,1,4,0,2,12,1


## 5. Fitting Models and Score

In [17]:
y_train = train_enc_cat.Survived
X_train = train_enc_cat.drop(['Survived'], axis=1)

In [18]:
print(f'Train data size: {X_train.shape}')
X_train.head()

Train data size: (891, 7)


Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Title,Alone
0,2,1,2,0,2,12,0
1,0,0,4,0,0,13,0
2,2,0,3,0,2,9,1
3,0,0,4,0,2,13,0
4,2,1,4,0,2,12,1


In [19]:
print(f'Train labels size: {y_train.shape}')
y_train.head()

Train labels size: (891,)


0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

### 5.1 Logistic Regression

In [40]:
lr = LogisticRegression()

# cross validation
scores = cross_val_score(lr, X_train, y_train, cv=10)

# accuracy
print(f'Accuracy of Logistic Regression: {scores.mean()}')

Accuracy of Logistic Regression: 0.7845318352059925


### 5.2 Decision Tree

In [42]:
dt = DecisionTreeClassifier()

# cross validation
scores = cross_val_score(dt, X_train, y_train, cv=10)

# accuracy
print(f'Accuracy for Decision Tree: {scores.mean()}')

Accuracy for Decision Tree: 0.802521847690387


### 5.3 Random Forest

In [43]:
# 100 decision tree classifiers
# feture selection with gini
# decision trees have max depth
rf = RandomForestClassifier()

# cross validation
scores = cross_val_score(rf, X_train, y_train, cv=10)

# accuracy
print(f'Accuracy for Random Forest: {scores.mean()}')

Accuracy for Random Forest: 0.7969413233458178


### 5.4 Support Vector Machines

In [44]:
# values will be scaled using equation (val-mean)/std
svc = make_pipeline(StandardScaler(), LinearSVC())

# cross validation
scores = cross_val_score(svc, X_train, y_train, cv=10)

# accuracy
print(f'Accuracy for Linear SVC: {scores.mean()}')



Accuracy for Linear SVC: 0.7890137328339576




### 6. Predict on Test

### 6.1 Vectorize

In [32]:
# test data should have the same format (columns) as training examples
columns = X_train.columns

In [33]:
test_enc_cat = test[columns].apply(LabelEncoder().fit_transform)

X_test = test_enc_cat

### 6.2 Predict with best model

Best model up to now is logistic regression classifier, with training accuracy 77%.

In [63]:
best_model = lr

# train model
best_model.fit(X_train, y_train)

# predict on test data
y_pred = best_model.predict(X_test)

## 8 Submission

In [64]:
submission = pd.DataFrame()
submission['PassengerId'] = test['PassengerId']
submission['Survived'] = y_pred

In [65]:
print(f'Submission size: {submission.shape}')
submission.head()

Submission size: (418, 2)


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [66]:
submission.to_csv('/kaggle/working/decision-tree.csv', index=False)