In [24]:
import  pandas as pd
import numpy as np 
import streamlit as st


## Upload and clean the data

In [25]:
train = pd.read_csv('data/train.csv')

In [26]:
# check missing valuies
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [27]:
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [28]:
# drop some unnecessary columns
train = train.drop(columns= ['Name','PassengerId','Cabin','Ticket'])

In [40]:
# create classs of dataframe selector
# create also attribute adder
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y = None):
        return self
    def transform(self, X):
        return X[self.attribute_names]
        
class RelAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y = None):
        return self 
    def transform(self, X, y = None):
        rel_count = X[:, 1] + X[:, 2]
        
        return np.c_[X,rel_count]

In [51]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
        ('select_numeric', DataFrameSelector(['Age','SibSp','Parch','Fare'])),
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', RelAdder()),
        ('std_scaler', StandardScaler()),
    ])

X_num_trans = num_pipeline.fit_transform(X)

In [58]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [60]:
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
    ('select_cat', DataFrameSelector(['Pclass','Sex','Embarked'])),
    ('imputer', MostFrequentImputer()),
    ('cat_encoder', OneHotEncoder(sparse= False))
])

array([[0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 1., 0.]])

In [62]:
from sklearn.pipeline import FeatureUnion

preprocess_pipeline = FeatureUnion(transformer_list= [
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline)
])

In [71]:
# create preprocess
X_train = preprocess_pipeline.fit_transform(train)
y_train = train['Survived']

In [104]:
from sklearn.svm import SVC

svm_clf = SVC(gamma="auto")
svm_clf.fit(X_train, y_train)

SVC(gamma='auto')

In [105]:
from sklearn.model_selection import cross_val_score

svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()

0.7318352059925094

In [106]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.8036828963795255

In [110]:
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

gridsearch_rf = GridSearchCV(RandomForestClassifier(n_estimators= 100, random_state= 42), param_grid= param_grid, cv = 5)

In [111]:
gridsearch_rf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]})

In [112]:
gridsearch_rf.best_params_

{'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'auto',
 'n_estimators': 200}

In [116]:
# test data 

test_data = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/gender_submission.csv')

In [117]:
X_test = preprocess_pipeline.fit_transform(test_data)
test_preds = gridsearch_rf.best_estimator_.predict(X_test)

In [120]:
submission['Survived'] = test_preds
submission.to_csv('submission1.csv', index= False)

## Create test for one passenger

In [205]:
# test pred 0 did not survive
test_preds[0]

0

In [206]:
sample1 = test_data[test_data.index == 0]

In [208]:
# details of sample 1
sample1

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q


In [211]:
# using 1 index dataframe, the preprocess pipeline wont work
sample1_preproc = preprocess_pipeline.fit_transform(sample1)
sample1_preproc.shape

(1, 8)

In [213]:
# create from input data
input_data = {
    'Pclass': 3,
    'Name' : 'Mark Anthony Sanchez',
    'Sex' : 'male',
    'Age': 34.5,
    'SibSp': 0,
    'Parch': 0,
    'Fare': 7.8292,
    'Embarked' : 'Q'
}

input_features = pd.DataFrame(input_data, index= [0])
input_features

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,Mark Anthony Sanchez,male,34.5,0,0,7.8292,Q


In [214]:
# combine the input feature to test
combined = pd.concat([input_features, test])

In [215]:
test_feature_preproc = preprocess_pipeline.fit_transform(combined)[0]
test_feature_preproc

array([34.5   ,  0.    ,  0.    ,  7.8292,  0.    ,  0.    ,  0.    ,
        1.    ,  0.    ,  1.    ,  0.    ,  1.    ,  0.    ])

In [216]:
# predict
gridsearch_rf.best_estimator_.predict([test_feature_preproc])

array([0], dtype=int64)

## save the models and try to load the models

In [217]:
best_model = gridsearch_rf.best_estimator_
best_model

RandomForestClassifier(max_depth=8, n_estimators=200, random_state=42)

In [218]:
from pickle import dump
dump(best_model, open('rf_gridsearch.pkl', 'wb'))
dump(preprocess_pipeline, open('preprocess_pipeline.pkl', 'wb'))

In [219]:
from pickle import load
load_model = load(open('rf_gridsearch.pkl', 'rb'))
load_preprocess_pipeline = load(open('preprocess_pipeline.pkl', 'rb'))

In [1]:
load_model.predict([test_feature_preproc])[0]

NameError: name 'load_model' is not defined

In [223]:
load_preprocess_pipeline.fit_transform(combined)[0]

array([34.5   ,  0.    ,  0.    ,  7.8292,  0.    ,  0.    ,  0.    ,
        1.    ,  0.    ,  1.    ,  0.    ,  1.    ,  0.    ])