In [1]:
'''
TODO:
    transformers:
        OneHotEncode columns
            sex(DONE), cabin(DONE), embarked(DONE)
        Figure out how to add these to a pipeline
        extract numerics from ticket to get families traveling together(DONE)
    END: create data processing pipeline
DONE:

'''
print('')




In [2]:
# https://www.kaggle.com/c/titanic/overview

import pandas as pd
import numpy as np

titanic_raw = pd.read_csv('./titanic_train.csv')
print(titanic_raw.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [3]:
titanic_raw.head()

'''
columns to remove
    PassengerID
    Name
    Ticket
    ? Fare
categorical
    sex
    ? Cabin
    Embarked (C = Cherbourg, Q = Queenstown, S  = Southampton)
'''

'\ncolumns to remove\n    PassengerID\n    Name\n    Ticket\n    ? Fare\ncategorical\n    sex\n    ? Cabin\n    Embarked (C = Cherbourg, Q = Queenstown, S  = Southampton)\n'

In [4]:
# Let's define a custom Transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


# Make OneHot encoding a column suck less
def OneHotSetup(X, onehotcol, dropfirst=True, droporiginal=False):
    # Create a new OneHotEncoder
    if dropfirst:
        dropfirst = 'first'
        hdr_offset = 1
    else:
        dropfirst = None
        hdr_offset = 0
        
    onehot = OneHotEncoder(sparse=False, drop=dropfirst) # remove 1st category
    
    # Fit our encoder
    X_onehot = onehot.fit(X[onehotcol].values.reshape(-1,1)) # Fit onehot to cabin_letter
    
    # Get a list of column names, minus the first one
    X_onehot_hdrs = X_onehot.categories_[0][hdr_offset:] # Take the list of categories minus the first one
    
    # Transform header to OG column name underscore category
    temp = []
    for val in X_onehot_hdrs:
        val = onehotcol + '_' + val
        temp.append(val)
    X_onehot_hdrs = temp
    
    # Create a DataFrame of new OneHot columns
    X_onehot_df = pd.DataFrame(onehot.transform(X[onehotcol].values.reshape(-1,1)))

    # Redefine the headers of the onehot cols
    X_onehot_df.columns = X_onehot_hdrs
    
    # Add new onehot columns to the original DF 
    X = pd.concat([X, X_onehot_df], axis=1)
    
    # Delete OneHot-ted column, if requested
    if droporiginal:
        X = X.drop([onehotcol], axis=1)
    
    return X

# Custom Transform class
class InitAttributeCleaner(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
            return self # Nothing to do here, apparently
    def transform(self, X, y=None):
        # Remove unnecessary columns
        X = X.drop(['PassengerId', 'Name'],axis=1)
        
        # Handle the "Cabin" feature
        #     1. split into features "cabin_letter" and "cabin_number"
        #     2. replace nulls in both new columns with 'unreported'
        #     3. Use OneHotSetup to add onehotcolumns
        #     4. Replace NaN "cabin_number"s with 0, transform column to numeric
          #cabin_letter
        newcols = ['cabin_letter', 'cabin_number']                                 # New column names
        X[newcols] = X['Cabin'].str.extract(r'(\D*)(\d*)') # Split column
        X['cabin_letter'] = X[newcols].fillna('unreported')       # Replace nulls with 'unreported'
        X['cabin_letter'] = X[newcols].replace('', 'unreported')  # Replace nulls with 'unreported'
        X = OneHotSetup(X, 'cabin_letter', dropfirst=True, droporiginal=True)  # Call OneHotSetup
        X = X.drop(['cabin_letter_unreported'], axis=1)    # Remove cabin_letter_unreported (useless)
          #cabin_number
        X['cabin_number'] = X['cabin_number'].fillna(value=0) # Replace NA with 0
        X['cabin_number'] = X['cabin_number'].replace('', 0).astype('int64') # Replace '' with 0 and cast to int64
        X = X.drop(['Cabin'], axis=1)

        # Handle "sex" feature
        X = OneHotSetup(X, 'Sex', dropfirst=True, droporiginal=True)

        # Handle "Embarked" feature
        X['Embarked'] = X['Embarked'].fillna('U')  # Fill missing values with U for Unknown
        X = OneHotSetup(X, 'Embarked', dropfirst=False, droporiginal=False) # OneHotEncode it
        X = X.drop(['Embarked', 'Embarked_U'], axis=1)   # Remove Unknown category, original category

        # Handle "Age" feature (we're going to impute a mean for missing values)
        X['Age'] = X['Age'].fillna(X['Age'].mean())

        # Handle "Ticket" column (remove characters, leave only numerics)
        X['ticket_number'] = X['Ticket'].str.extract(r'(?:.*? ){0,2}(\d*)') # Extract just the numeric at the end
        X['ticket_number'] = X['ticket_number'].replace('', 0).astype('int64') # These few "LINE" values that are missed by RE
        X = X.drop(['Ticket'], axis=1)
        
        self.final_columns = list(X.columns)

        return X

In [5]:
# Data Cleaning (Pipeline conversion)
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


standardize_columns = ['Age', 'Fare', 'cabin_number', 'ticket_number']

columntransformer = ColumnTransformer([
                        ('standardizer', StandardScaler(), standardize_columns)
                ], remainder='passthrough',)

pipe = Pipeline([
                    ('init_clean', InitAttributeCleaner()),
                    ('col_trans', columntransformer)
                ])
'''
what a shitty to use API

The order of the columns in the transformed feature matrix follows the order of how the columns are specified in the
transformers list. Columns of the original feature matrix that are not specified are dropped from the resulting transformed
feature matrix, unless specified in the passthrough keyword. Those columns specified with passthrough are added at the right
to the output of the transformers.
'''
titanic_clean = pd.DataFrame(pipe.fit_transform(titanic_raw))

# transformed columns have the passthrough columns added after, so need to come up with new column order and rename
passthrough_columns = [x for x in pipe.named_steps.init_clean.final_columns if x not in standardize_columns]

titanic_clean.columns = standardize_columns + passthrough_columns

print(titanic_clean.head())

        Age      Fare  cabin_number  ticket_number  Survived  Pclass  SibSp  \
0 -0.592481 -0.502445     -0.421322      -0.420410       0.0     3.0    1.0   
1  0.638789  0.786845      2.738590      -0.425854       1.0     1.0    1.0   
2 -0.284663 -0.488854     -0.421322       4.274382       1.0     3.0    0.0   
3  0.407926  0.420730      4.151256      -0.279217       1.0     1.0    1.0   
4  0.407926 -0.486337     -0.421322       0.116544       0.0     3.0    0.0   

   Parch  cabin_letter_B  cabin_letter_C  ...  cabin_letter_E  cabin_letter_F  \
0    0.0             0.0             0.0  ...             0.0             0.0   
1    0.0             0.0             1.0  ...             0.0             0.0   
2    0.0             0.0             0.0  ...             0.0             0.0   
3    0.0             0.0             1.0  ...             0.0             0.0   
4    0.0             0.0             0.0  ...             0.0             0.0   

   cabin_letter_F E  cabin_letter_F G 

In [6]:
# Time to split our test/train sets before going any further!

from sklearn.model_selection import StratifiedShuffleSplit

# Okay so first I need to split my X and y
X = titanic_clean.drop(['Survived'], axis=1)
y = titanic_clean['Survived']

# And now split using stratified shuffle split
splitter = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)

for train_index, test_index in splitter.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [7]:
'''
Okay now I need to shortlist some models
I have no idea how to do this
I suppose this is a binary classifier soooo
we'll start off with the SGD Classifier
'''
from sklearn.linear_model import SGDClassifier
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

sgd_clf = SGDClassifier(random_state=42)
skfolds = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

for train_index, test_index in skfolds.split(X_train, y_train):
#     print('train:', train_index, 'test', test_index)
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train.iloc[train_index]
    y_train_folds = y_train.iloc[train_index]
    X_test_fold = X_train.iloc[test_index]
    y_test_fold = y_train.iloc[test_index]
    
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))

0.7689075630252101
0.7721518987341772
0.7468354430379747


In [14]:
from sklearn.model_selection import cross_val_score

sgd_scores = cross_val_score(sgd_clf, X_train, y_train, cv=5, scoring='accuracy')

In [15]:
class EveryoneDies(BaseEstimator):
    def fit(self, X, y=None):
        pass
    def predict(self, X):
        return np.zeros((len(X), 1), dtype=bool)

everyonedies_clf = EveryoneDies()
everysonedies_scores = cross_val_score(everyonedies_clf, X_train, y_train, cv=5, scoring='accuracy')

# Okay cool so I'm doing better than just guessing noone made it

In [20]:
# Alright let's try a RandomForestClassifier as well

from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10, scoring='accuracy')

In [17]:
# Let's score out a Guassian Naive Bayes
from sklearn.naive_bayes import GaussianNB

gnb_clf = GaussianNB()
gnb_scores = cross_val_score(gnb_clf, X_train, y_train, cv=5, scoring='accuracy')

In [21]:
# Let's try a logistic classifier
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(multi_class='ovr')
lr_scores = cross_val_score(lr_clf, X_train, y_train, cv=10, scoring='accuracy')

In [22]:
print(f'SGD Classifier avg: {sgd_scores.mean()})')
print(f'EveryoneDies avg: {everysonedies_scores.mean()})')
print(f'RandomForestClassifier avg: {forest_scores.mean()})')
print(f'GaussianNB avg: {gnb_scores.mean()})')
print(f'SGD Classifier avg: {sgd_scores.mean()})')
print(f'LogisticRegression avg: {lr_scores.mean()})')

# So it looks like RandomForestClassifier is the best looking model so far!

SGD Classifier avg: 0.7416921107061952)
EveryoneDies avg: 0.6165468334482419)
RandomForestClassifier avg: 0.8273474178403756)
GaussianNB avg: 0.7037427361371023)
SGD Classifier avg: 0.7416921107061952)
LogisticRegression avg: 0.7977895148669797)


In [79]:
# quick test on forest classifier on test set?
forest_clf.fit(X_train, y_train)

aa = pd.DataFrame(forest_clf.predict(X_test)).value_counts()

correct = aa[0]
incorrect= aa[1]
print(f'percentage correct = {(correct/(incorrect+correct)*100)}')

# weak

percentage correct = 62.56983240223464


In [99]:
# Okay time to try tuning the forest

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, truncnorm, randint

forest_clf = RandomForestClassifier(random_state=42)

model_params = {"n_estimators" : randint(1, 200),
                "max_features" : truncnorm(a=0., b=1., loc=0.5, scale=0.01),
                "min_samples_split" : uniform(0.01, 0.199)
               }
               

rscv = RandomizedSearchCV(forest_clf, model_params, n_iter = 50, cv=5,
                          n_jobs = 2, scoring="accuracy")

rscv.fit(X_train, y_train)

print(rscv.best_score_)

0.8343740766275978


In [102]:
new_forest_clf = rscv.best_estimator_

# quick test on new forest classifier on test set
aa = pd.DataFrame(new_forest_clf.predict(X_test)).value_counts()

correct = aa[0]
incorrect= aa[1]
print(f'percentage correct = {(correct/(incorrect+correct)*100)}')

# weak

percentage correct = 66.4804469273743


In [103]:
# What if i trried the LR model

lr_clf = LogisticRegression(multi_class='ovr')

model_params = {"n_estimators" : randint(1, 200),
                "max_features" : truncnorm(a=0., b=1., loc=0.5, scale=0.01),
                "min_samples_split" : uniform(0.01, 0.199)
               }
               

rscv_lr = RandomizedSearchCV(forest_clf, model_params, n_iter = 50, cv=5,
                          n_jobs = 2, scoring="accuracy")

rscv_lr.fit(X_train, y_train)

print(rscv_lr.best_score_)

0.838550182212154


In [104]:
new_lr_clf = rscv_lr.best_estimator_

# quick test on new forest classifier on test set
aa = pd.DataFrame(new_lr_clf.predict(X_test)).value_counts()

correct = aa[0]
incorrect= aa[1]
print(f'percentage correct = {(correct/(incorrect+correct)*100)}')

# weak

percentage correct = 66.4804469273743
