 Quick random forest model (training accuracy: 0.819), saved as a pickle for later use.

## Import modules 

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import pickle

## Functions 

In [2]:
def get_data(path_train_data, path_test_data):
    '''Read the training and test CSV files into DataFrames.

    Test and training data come from
    https://www.kaggle.com/c/titanic/submissions?group=all&page=1&pageSize=100

    Parameters
    ----------
    path_train_data : path or file-like accepted by ``pd.read_csv``
        Location of the training CSV.
    path_test_data : path or file-like accepted by ``pd.read_csv``
        Location of the test CSV.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``(train_data, test_data)`` in that order.

    Raises
    ------
    Whatever ``pd.read_csv`` raises (e.g. ``FileNotFoundError`` for a missing
    file); errors are not swallowed.
    '''
    # A comma is already the default separator of read_csv, so no explicit
    # `sep` and no retry are needed: a second identical read cannot succeed
    # where the first one failed.
    train_data = pd.read_csv(path_train_data)
    test_data = pd.read_csv(path_test_data)
    return train_data, test_data

In [3]:
def age_categories(x):
    '''Map one age value to an ordinal age bucket (intended for ``Series.apply``).

    Buckets, checked in this order:
      0 -> age is exactly 0
      1 -> age is at most 18 (any negative value also lands here)
      2 -> age is at most 60
      3 -> everything else; this includes NaN, because every comparison
           against NaN evaluates to False
    '''
    if x == 0:
        return 0
    if x <= 18:
        return 1
    if x <= 60:
        return 2
    return 3

In [4]:
class DataPreparation:
    '''Holds a passenger DataFrame and prepares it for the classifier.

    Intended call order: ``filling_data`` -> ``feature_engineering`` ->
    ``vectorize_train_set`` / ``vectorize_test_set``. The current frame is
    always available as ``self.df``.
    '''

    # Columns handed to pd.get_dummies by both vectorize methods. Kept in one
    # place so the train and test encodings cannot drift apart.
    FEATURE_COLUMNS = ['Pclass', 'Sex', 'Embarked', 'age_category']

    def __init__(self, df):
        '''Store ``df`` as the frame to prepare (no copy is made).'''
        self.df = df

    def filling_data(self, imp):
        '''Impute missing values and replace ``self.df`` with the filled frame.

        ``imp`` must expose ``fit_transform(frame)`` returning a 2-D array with
        one column per input column. Note that the imputer is (re)fitted on
        this instance's frame every time this method is called.
        '''
        array = imp.fit_transform(self.df)
        # Keep the original row labels; rebuilding without `index=` would
        # silently renumber rows of a frame that was filtered or re-indexed.
        self.df = pd.DataFrame(array, columns=self.df.columns, index=self.df.index)

    def feature_engineering(self):
        '''Add the ``age_category`` column derived from ``Age``.

        Returns the updated frame, or ``None`` (after printing a message) when
        the frame has no ``Age`` column. Any other failure, e.g. a non-numeric
        age, propagates instead of being reported as a missing column.
        '''
        if 'Age' not in self.df.columns:
            print("Age feature is not available into the dataframe")
            return None
        self.df['age_category'] = self.df['Age'].apply(age_categories)
        return self.df

    def _encode_features(self):
        '''Return the dummy-encoded FEATURE_COLUMNS as a 2-D array.'''
        # NOTE(review): get_dummies derives columns from the values present in
        # this frame only, so train and test arrays line up only when both
        # contain the same categories - confirm for new data.
        return pd.get_dummies(self.df[self.FEATURE_COLUMNS]).values

    def vectorize_train_set(self):
        '''Return ``(X, y)``: encoded features and the integer ``Survived`` labels.'''
        X = self._encode_features()
        y = self.df['Survived'].values.astype('int')
        return X, y

    def vectorize_test_set(self):
        '''Return the encoded feature array ``X`` (no labels).'''
        return self._encode_features()

## Import data and prepare it

In [6]:
# Read both CSVs from the local data/ folder, then preview the first five training rows.
train_path, test_path = 'data/train.csv', 'data/test.csv'
train_data, test_data = get_data(train_path, test_path)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Training

In [7]:
# Fill every missing value with the most frequent value of its column.
imp = SimpleImputer(strategy="most_frequent")

# Training data: impute, derive age_category, then encode features and labels.
final_train_data = DataPreparation(train_data)
final_train_data.filling_data(imp)
final_train_data.feature_engineering()
X, y = final_train_data.vectorize_train_set()

# Random forest with a fixed seed; fit() returns the estimator itself.
clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42).fit(X, y)

# Accuracy on the very rows used for fitting - not a held-out estimate.
train_score = round(clf.score(X, y), 3)
print(f"train score is : {train_score}")

train score is : 0.819


## Predictions and submission

In [9]:
# Run the test frame through the same preparation steps as the training frame.
# Note: filling_data calls fit_transform, so `imp` is refitted on the test rows here.
final_test_data = DataPreparation(test_data)
final_test_data.filling_data(imp)
final_test_data.feature_engineering()
X_test = final_test_data.vectorize_test_set()
print(X_test.shape)
predictions = clf.predict(X_test)

# Submission file: PassengerId from the untouched test frame plus the predicted label.
results = test_data[['PassengerId']].assign(Survived=predictions)
results.to_csv('data/my_submission.csv', index=False)

(418, 9)


## Save the model 

In [10]:
# Serialize the fitted classifier to disk. The `with` block guarantees the
# file handle is flushed and closed even if pickling fails; the models/
# directory must already exist.
name = "models/rf_model.sav"
with open(name, 'wb') as model_file:
    pickle.dump(clf, model_file)