In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# pd.isnull(df).sum()

## Loading Dataset 

In [2]:
# Read the raw passenger table from disk and preview its first three rows.
df = pd.read_csv(filepath_or_buffer='dataset/test.csv')
df.head(n=3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


## Preprocessing

In [3]:
# Preprocessing: derive features from the raw passenger table, encode the
# categorical columns as integers, fill missing values, bin Age, and finally
# keep only the model features in a fixed, explicit column order.
#
# Inputs : `df` as loaded from the CSV (needs Name, Sex, Age, SibSp, Parch,
#          Fare, Pclass and PassengerId columns).
# Outputs: `df` (feature frame) and `passengerIds` (ids in original row order).
# NOTE: this cell overwrites `df`, so it cannot be re-run without reloading
# the data first (the 'Name' and 'PassengerId' columns are gone afterwards).


def extract_title(name):
    """Return the honorific found between the first comma and the first period.

    Example: "Kelly, Mr. James" -> "Mr".
    """
    return name.split('.')[0].split(',')[1].strip()


# keep passenger IDs for future reference (they are not a model feature)
passengerIds = df['PassengerId'].tolist()

# create family variable: the passenger plus parents/children and siblings/spouses
df['Family_size'] = df['Parch'] + df['SibSp'] + 1

# create title feature
df['Title'] = df['Name'].apply(extract_title)

# Collapse spelling variants onto 'Miss'.
# NOTE(review): 'Mme' is folded into 'Miss' rather than 'Mrs'; kept unchanged
# because the pickled model was presumably trained with the same rule - confirm.
df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss'})

# Group infrequent honorifics into a single category.
rare_titles = ['Dr', 'Rev', 'Col', 'Major', 'Don', 'Dona', 'Lady', 'the Countess', 'Capt', 'Sir', 'Jonkheer']
df.loc[df['Title'].isin(rare_titles), 'Title'] = 'Rare Title'

# Flag passengers travelling without any family member.
df['IsAlone'] = (df['Family_size'] == 1).astype(int)

## Converting character variables to numeric
category_codes = {
    'Sex': {'male': 0, 'female': 1},
    'Title': {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Rare Title': 4},
}
for column, codes in category_codes.items():
    encoded = df[column].map(codes)
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail with the offending values instead of an opaque NaN-to-int cast error.
        unknown_values = sorted(df.loc[unmapped, column].astype(str).unique())
        raise ValueError('Unmapped {} values: {}'.format(column, unknown_values))
    df[column] = encoded.astype(int)

## Filling NA values
# Missing fares are replaced by the mean fare.
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

# Age imputation: linear interpolation between neighbouring rows (this is NOT a
# regression model). limit_direction='both' also fills missing ages at the very
# start of the frame, which the default forward-only direction leaves as NaN.
df['Age'] = df['Age'].interpolate(limit_direction='both')

## Binning Age variable
bins = [0, 2, 12, 18, 25, 54, 65, np.inf]
# labels = ['baby','child','teenager',"young adult",'adult','senior','older person']
labels = [0, 1, 2, 3, 4, 5, 6]
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels)

# Keep only the model features, selected by name so the order does not depend
# on the column layout of the CSV. Name, Cabin, Ticket, PassengerId and
# Embarked are dropped by not being listed here.
feature_columns = ['Title', 'Age', 'Age_Group', 'Sex', 'Pclass',
                   'IsAlone', 'Family_size', 'Parch', 'SibSp', 'Fare']
df = df[feature_columns].copy()

In [4]:
# Show three randomly chosen rows of the processed feature frame.
df.sample(n=3)

Unnamed: 0,Title,Age,Age_Group,Sex,Pclass,IsAlone,Family_size,Parch,SibSp,Fare
216,1,49.5,4,1,3,1,1,0,0,7.8792
373,0,44.0,4,0,2,1,1,0,0,13.0
310,0,18.0,2,0,3,1,1,0,0,8.6625


# Exporting Output

## Import model from pickle file, predict probabilities and export CSV

In [20]:
import pickle

# Load the trained classifier. The context manager closes the file handle,
# which the previous bare open() call left open.
# NOTE: pickle.load can execute arbitrary code - only load a model file that
# comes from a trusted source.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Take column 1 of predict_proba for every passenger.
# NOTE(review): presumably this is the probability of Survived == 1 - confirm
# against model.classes_.
prob = list(model.predict_proba(df)[:, 1])

# Label a passenger as survived when that probability is strictly above 0.5.
predictions = [int(p > 0.5) for p in prob]

# Pair each prediction with its PassengerId and write the submission file.
df_out = pd.DataFrame({'PassengerId': passengerIds, 'Survived': predictions})
df_out.to_csv('py_git_submission_rf.csv', index=False)

In [24]:
# Print the PassengerIds of every passenger predicted to survive.
survivor_mask = df_out['Survived'] == 1
print(df_out.loc[survivor_mask, 'PassengerId'].tolist())

[900, 904, 906, 907, 913, 914, 916, 918, 920, 924, 926, 931, 935, 936, 938, 940, 941, 944, 945, 951, 955, 957, 961, 964, 966, 969, 972, 978, 980, 981, 982, 988, 989, 992, 996, 1004, 1005, 1006, 1009, 1011, 1012, 1014, 1017, 1019, 1020, 1022, 1033, 1042, 1045, 1048, 1051, 1052, 1053, 1054, 1057, 1060, 1065, 1067, 1068, 1070, 1071, 1074, 1076, 1078, 1084, 1086, 1088, 1091, 1092, 1093, 1095, 1098, 1100, 1105, 1108, 1109, 1110, 1112, 1114, 1115, 1116, 1117, 1119, 1123, 1128, 1130, 1131, 1132, 1133, 1138, 1140, 1141, 1142, 1148, 1150, 1154, 1155, 1164, 1165, 1167, 1173, 1174, 1175, 1176, 1188, 1191, 1197, 1199, 1200, 1204, 1205, 1206, 1207, 1208, 1215, 1216, 1218, 1222, 1225, 1231, 1235, 1237, 1241, 1242, 1246, 1248, 1253, 1254, 1256, 1260, 1263, 1266, 1267, 1274, 1275, 1277, 1283, 1284, 1287, 1289, 1292, 1294, 1300, 1301, 1303, 1304, 1306, 1309]
