# Notebook dedicated to building model that predicts whether a passeneger survives the titanic given features

## From kaggle:
- https://www.kaggle.com/c/titanic/


In [3]:
### data manipulation ###
import pandas as pd
import numpy as np

### sklearn ###
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

### visualizations ###
import matplotlib.pyplot as plt
import seaborn as sns

### 1. Examine and load data

Data Dictionary: 

#### Features : 
- Pclass : ticket class of passenger [1 = 1st, 2 = 2nd, 3 = 3rd]
- Name : name of passenger
- Sex : sex of passenger
- Age : age of passenger
- Sibsp : # of siblings / spouses aboard the Titanic
- Parch : # of parents / children aboard the Titanic
- Ticket : Ticket number
- Fare : Passenger fare
- Cabin : cabin number of passenger
- Embarked : Port of Embarktion [C = Cherbourg, Q = Queenstown, S = Southampton]

#### Target:
- Survival : if passenger survived [0 = no, 1 = yes]

In [6]:
test_df_og = pd.read_csv('test.csv')
train_df_og = pd.read_csv('train.csv')

print(f'Shape of test df : {test_df_og.shape}\nShape of train df : {train_df_og.shape}')

print(f'Dtypes of test df : {test_df_og.dtypes}\nDtypes of train df : {train_df_og.dtypes}')

Shape of test df : (418, 11)
Shape of train df : (891, 12)
Dtypes of test df : PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
Dtypes of train df : PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [7]:
# sample of training df
train_df_og.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 2. Cleaning data

In [8]:
# getting counts of na's per column
print(train_df_og.isna().sum())
print(test_df_og.isna().sum())

# for both train and test, cabin and age have significant na count
# get rid of cabin alltogether 

train_df = train_df_og.drop('Cabin', axis=1)
test_df = test_df_og.drop('Cabin', axis=1)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [9]:
# dropping remaining na's
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

In [10]:
# sample df with na's and cabin removed
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [11]:
### there are 541 unique tickets 
### for now will just drop ticket and name cols

#train_df.drop(['Name', 'Ticket'], axis=1, inplace=True)
#test_df.drop(['Name', 'Ticket'], axis=1, inplace=True)


### 3. Transforming data for useability

- must change sex to 1 or 0
- must change embarked to [0, 1, or 2]
    - `S` = 0
    - `C` = 1
    - `Q` = 2

In [12]:
## next two cells transform data into useable form 

In [13]:
train_df['Sex'] = train_df['Sex'].apply({'male' : 1, 'female' : 0}.get)

In [14]:
train_df['Embarked'] = train_df['Embarked'].apply({'S' : 0,
                                                   'C' : 1,
                                                   'Q' : 2}.get)

In [15]:
test_df['Sex'] = test_df['Sex'].apply({'male' : 1,
                                       'female' : 0}.get)

test_df['Embarked'] = test_df['Embarked'].apply({'S' : 0,
                                                 'C' : 1,
                                                 'Q' : 2}.get)

### 4. Split data and build model

- try 3 separate models:
 * Linear SVC
 * Kneighbors Classifier
 * RandomForest Classifier

In [16]:
# still only using traning data 

In [17]:
# create training split
X_train, y_train = train_df.drop('Survived', axis=1), train_df['Survived']

In [18]:
class Score():
    def __init__(self, X_train, y_train):
        
        self.X_train = X_train
        self.y_train = y_train
        
        # instantiating models
        self.SVC = LinearSVC()
        self.KNN = KNeighborsClassifier()
        self.RFC = RandomForestClassifier(n_estimators=10)
    
    def fit_models(self):
        
        fitted_svc = self.SVC.fit(self.X_train, self.y_train)
        fitted_knn = self.KNN.fit(self.X_train, self.y_train)
        fitted_rfc = self.RFC.fit(self.X_train, self.y_train)
        
        return fitted_svc, fitted_knn, fitted_rfc

    def model_evals(self):
        '''
        Function that trains 3 classifers and returns accuracy score

        Inputs
        ------
        X_train, y_train
        '''

        score_dict = {
            'Linear SVC' : self.SVC.score(self.X_train, self.y_train),
            'Kneighbors' : self.KNN.score(self.X_train, self.y_train),
            'RandomForest' : self.RFC.score(self.X_train, self.y_train)
        }

        return score_dict


In [19]:
# creating instance of our model class
instantiated = Score(X_train, y_train)

# fitting the models
fitted_models = instantiated.fit_models()

# getting score dict
scores = instantiated.model_evals()

# print out scores and decide what model to use
print(scores)

{'Linear SVC': 0.7752808988764045, 'Kneighbors': 0.7584269662921348, 'RandomForest': 0.9845505617977528}




### 5. Decision and prediction
 - Use RandomForestRegressor, score is by far highest
 - make preds

In [20]:
X_test = test_df

In [21]:
y_preds = fitted_models[2].predict(X_test)

## TO DO:
- Randomized CV
- Visualizations
- Identify most important features
- Submit