In [18]:
import pandas as pd
import numpy as np
import pickle
import redis

In [3]:
# load test dataset
df_test = pd.read_csv('data/test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
redis_conn = redis.StrictRedis(host='redis', port=6379)

In [6]:
unpacked_object = pickle.loads(redis_conn.get('titanic_challenge'))
unpacked_object

Apply the same methods os data preprocessing to format the dataset

In [7]:
df_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [8]:
df_test.Age.fillna(df_test.Age.mode()[0], inplace=True)
df_test.Fare.fillna(df_test.Fare.median(), inplace=True)

In [9]:
df_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

In [10]:
dummy_embarked = pd.get_dummies(df_test.Embarked)
dummy_sex = pd.get_dummies(df_test.Sex)
df_test = pd.concat([df_test, dummy_sex, dummy_embarked], axis = 1)

In [11]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [15]:
age_scaler = MinMaxScaler().fit(df_test.Age.values.reshape(len(df_test), 1))
fare_scaler = StandardScaler().fit(df_test.Fare.values.reshape(len(df_test), 1))
pclass_scaler = StandardScaler().fit(df_test.Pclass.values.reshape(len(df_test), 1))

In [16]:
df_test.Age = age_scaler.transform(df_test.Age.values.reshape(len(df_test), 1))
df_test.Fare = fare_scaler.transform(df_test.Fare.values.reshape(len(df_test), 1))
df_test.Pclass = pclass_scaler.transform(df_test.Pclass.values.reshape(len(df_test), 1))

In [19]:
# Convert SibSp and Parch to only one column: HasRelatives
df_test['HasRelatives'] = np.where(
    (df_test.SibSp > 1) | (df_test.Parch > 1), 1, 0
)
df_test.sample(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,female,male,C,Q,S,HasRelatives
187,1079,0.873482,"Davies, Mr. Joseph",male,0.221944,2,0,A/4 48873,-0.493455,,S,0,1,0,0,1,1
57,949,0.873482,"Abelseth, Mr. Olaus Jorgensen",male,0.327443,0,0,348122,-0.500626,F G63,S,0,1,0,0,1,0
335,1227,-1.50512,"Maguire, Mr. John Edward",male,0.39338,0,0,110469,-0.171674,C106,S,0,1,0,0,1,0
388,1280,0.873482,"Canavan, Mr. Patrick",male,0.274693,0,0,364858,-0.498833,,Q,0,1,0,1,0,0
347,1239,0.873482,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,0.498879,0,0,2688,-0.508169,,C,1,0,1,0,0,0
9,901,0.873482,"Davies, Mr. John Samuel",male,0.274693,2,0,A/4 48871,-0.204838,,S,0,1,0,0,1,1
382,1274,0.873482,"Risien, Mrs. Samuel (Emma)",female,0.274693,0,0,364498,-0.377829,,S,1,0,0,0,1,0
34,926,-1.50512,"Mock, Mr. Philipp Edmund",male,0.39338,1,0,13236,0.397493,C78,C,0,1,1,0,0,0
358,1250,0.873482,"O'Keefe, Mr. Patrick",male,0.274693,0,0,368402,-0.498833,,Q,0,1,0,1,0,0
407,1299,-1.50512,"Widener, Mr. George Dunton",male,0.657128,1,1,113503,3.153697,C80,C,0,1,1,0,0,0


In [20]:
df_test_predict = df_test.drop(['PassengerId', 'Sex','Embarked', 'Cabin', 'Name', 'Ticket', 'SibSp', 'Parch'], axis='columns')

In [21]:
prediction = unpacked_object.predict(df_test_predict)
prediction

Feature names must be in the same order as they were in fit.



array([1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,

In [22]:
output = df_test
output['Survived'] = prediction
output

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,female,male,C,Q,S,HasRelatives,Survived
0,892,0.873482,"Kelly, Mr. James",male,0.452723,0,0,330911,-0.497413,,Q,0,1,0,1,0,0,1
1,893,0.873482,"Wilkes, Mrs. James (Ellen Needs)",female,0.617566,1,0,363272,-0.512278,,S,1,0,0,0,1,0,0
2,894,-0.315819,"Myles, Mr. Thomas Francis",male,0.815377,0,0,240276,-0.464100,,Q,0,1,0,1,0,0,1
3,895,0.873482,"Wirz, Mr. Albert",male,0.353818,0,0,315154,-0.482475,,S,0,1,0,0,1,0,1
4,896,0.873482,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,0.287881,1,1,3101298,-0.417492,,S,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0.873482,"Spector, Mr. Woolf",male,0.274693,0,0,A.5. 3236,-0.493455,,S,0,1,0,0,1,0,1
414,1306,-1.505120,"Oliva y Ocana, Dona. Fermina",female,0.512066,0,0,PC 17758,1.314435,C105,C,1,0,1,0,0,0,0
415,1307,0.873482,"Saether, Mr. Simon Sivertsen",male,0.505473,0,0,SOTON/O.Q. 3101262,-0.507796,,S,0,1,0,0,1,0,1
416,1308,0.873482,"Ware, Mr. Frederick",male,0.274693,0,0,359309,-0.493455,,S,0,1,0,0,1,0,1


In [23]:
output.to_csv('data/output.csv', columns = ['PassengerId', 'Survived'], index = False)