In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
%matplotlib inline
data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

In [2]:
def simplify_ages(df):
    df.Age = df.Age.fillna(-0.5)
    bins = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
    group_names = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
    categories = pd.cut(df.Age, bins, labels=group_names)
    df.Age = categories
    return df

def simplify_cabins(df):
    df.Cabin = df.Cabin.fillna('N')
    df.Cabin = df.Cabin.apply(lambda x: x[0])
    return df

def simplify_fares(df):
    df.Fare = df.Fare.fillna(-0.5)
    bins = (-1, 0, 8, 15, 31, 1000)
    group_names = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']
    categories = pd.cut(df.Fare, bins, labels=group_names)
    df.Fare = categories
    return df

def format_name(df):
    df['Lname'] = df.Name.apply(lambda x: x.split(' ')[0])
    df['NamePrefix'] = df.Name.apply(lambda x: x.split(' ')[1])
    return df    
    
def drop_features(df):
    return df.drop(['Ticket', 'Name', 'Embarked'], axis=1)

def transform_features(df):
    df = simplify_ages(df)
    df = simplify_cabins(df)
    df = simplify_fares(df)
    df = format_name(df)
    df = drop_features(df)
    return df

data_train = transform_features(data_train)
data_test = transform_features(data_test)

In [3]:
from sklearn import preprocessing
def encode_features(df_train, df_test):
    features = ['Fare', 'Cabin', 'Age', 'Sex', 'Lname', 'NamePrefix']
    df_combined = pd.concat([df_train[features], df_test[features]])
    
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
    return df_train, df_test
    
data_train, data_test = encode_features(data_train, data_test)

In [4]:
from sklearn.model_selection import train_test_split

X_all = data_train.drop(['Survived', 'PassengerId'], axis=1)
y_all = data_train['Survived']

num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)

X_train.head()
y_train.head()

151    1
753    0
746    0
684    0
887    1
Name: Survived, dtype: int64

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Choose the type of classifier. 
clf = RandomForestClassifier()

# Choose some parameter combinations to try
parameters = {'n_estimators': [4, 6, 9], 
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
clf = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
clf.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=5, max_features='log2',
                       min_samples_leaf=5, n_estimators=9)

In [6]:
print(type(X_train))
type(data_test)
data_test.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Lname,NamePrefix
0,892,3,1,7,0,0,0,7,401,19
1,893,3,0,0,1,0,0,7,843,20
2,894,2,1,3,0,0,1,7,552,19
3,895,3,1,7,0,0,1,7,851,19
4,896,3,0,4,1,1,1,7,342,20


In [7]:
ids = data_test['PassengerId']
predictions = clf.predict(data_test.drop('PassengerId', axis=1))
output = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions })
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [8]:
def predict_servival(Pclass, Sex, Age, SibSp, Parch, Fare, Cabin, Lname, NamePrefix):    
    
    x = np.zeros(len(data_test.columns) -1)
    x[0] = Pclass
    x[1] = Sex
    x[2] = Age
    x[3] = SibSp
    x[4] = Parch
    x[5] = Fare
    x[6] = Cabin
    x[7] = Lname
    x[8] = NamePrefix
    

    return clf.predict([x])[0]

In [9]:
len(data_test.columns) -1
x = np.zeros(len(data_test.columns) -1)
len(x)

zz = predict_servival(1, 0, 0, 1, 0, 3, 2, 182, 20)
zz

1

In [10]:
# Creating a pickle file for the model
filename = 'Titanic.pkl'
pickle.dump(clf, open(filename, 'wb'))

In [11]:
data_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Lname,NamePrefix
0,892,3,1,7,0,0,0,7,401,19
1,893,3,0,0,1,0,0,7,843,20
2,894,2,1,3,0,0,1,7,552,19
3,895,3,1,7,0,0,1,7,851,19
4,896,3,0,4,1,1,1,7,342,20


In [12]:
a = np.array([[3, 1, 7, 0, 0, 0, 7, 401, 19]])

In [13]:
x = np.zeros(3)

In [14]:
type(x)
type(a)
a.ndim

2

In [15]:
b = pd.DataFrame(a)

In [16]:
b

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,3,1,7,0,0,0,7,401,19


In [17]:
yy = clf.predict(b)

In [18]:
yy

array([0], dtype=int64)

In [19]:
data_train[1:2]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Lname,NamePrefix
1,2,1,1,0,0,1,0,3,2,182,20


In [20]:
a = np.array([[1, 0, 0, 1, 0, 3, 2, 182, 20]])

In [21]:
zz = clf.predict(a)

In [22]:
zz

array([1], dtype=int64)

In [23]:
import os

In [24]:
os.getcwd()

'C:\\Users\\hares\\PycharmProjects\\Titanic1'

In [29]:
xx= data_test.columns[1:]
xx

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Lname',
       'NamePrefix'],
      dtype='object')

In [30]:
#save the data columns
import json

columns = {'data_columns' : [col.lower() for col in data_test.columns[1:]]}

with open("columns.json","w") as f:
    f.write(json.dumps(columns))    