In [4]:
import urllib.request
import os

In [11]:
url="http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
filepath="data/titanic3.xls"
if not os.path.isfile(filepath):
    result=urllib.request.urlretrieve(url,filepath)
    print('downloaded:',result)

In [12]:
import numpy
import pandas as pd

In [13]:
all_df=pd.read_excel(filepath)

In [14]:
all_df[:2]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


In [15]:
cols=['survived','name','pclass','sex','age','sibsp','parch','fare','embarked']
all_df=all_df[cols]

In [16]:
all_df[:2]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
0,1,"Allen, Miss. Elisabeth Walton",1,female,29.0,0,0,211.3375,S
1,1,"Allison, Master. Hudson Trevor",1,male,0.9167,1,2,151.55,S


In [17]:
df=all_df.drop(['name'], axis=1)

In [18]:
all_df.isnull().sum()

survived      0
name          0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [20]:
fare_mean=df['fare'].mean()
df['fare'] = df['fare'].fillna(fare_mean)

In [21]:
age_mean=df['age'].mean()
df['age'] = df['age'].fillna(fare_mean)

In [22]:
df['sex']=df['sex'].map({'female':0,'male':1}).astype(int)

In [24]:
x_OneHot_df=pd.get_dummies(data=df, columns=["embarked"])

In [25]:
x_OneHot_df[:2]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S
0,1,1,0,29.0,0,0,211.3375,0.0,0.0,1.0
1,1,1,1,0.9167,1,2,151.55,0.0,0.0,1.0


In [26]:
ndarray =x_OneHot_df.values

In [27]:
ndarray.shape

(1309, 10)

In [29]:
ndarray[1300:1304]

array([[  1.        ,   3.        ,   0.        ,  15.        ,
          1.        ,   0.        ,  14.4542    ,   1.        ,
          0.        ,   0.        ],
       [  0.        ,   3.        ,   1.        ,  45.5       ,
          0.        ,   0.        ,   7.225     ,   1.        ,
          0.        ,   0.        ],
       [  0.        ,   3.        ,   1.        ,  33.29547928,
          0.        ,   0.        ,   7.225     ,   1.        ,
          0.        ,   0.        ],
       [  0.        ,   3.        ,   1.        ,  33.29547928,
          0.        ,   0.        ,  14.4583    ,   1.        ,
          0.        ,   0.        ]])

In [31]:
Label =ndarray[:,0]
Features=ndarray[:,1:]

In [32]:
Label[:2]

array([ 1.,  1.])

In [33]:
Features[:2]

array([[   1.    ,    0.    ,   29.    ,    0.    ,    0.    ,  211.3375,
           0.    ,    0.    ,    1.    ],
       [   1.    ,    1.    ,    0.9167,    1.    ,    2.    ,  151.55  ,
           0.    ,    0.    ,    1.    ]])

In [34]:
from sklearn import preprocessing

In [35]:
minmax_scale=preprocessing.MinMaxScaler(feature_range=(0,1))

In [36]:
scaledFeatures=minmax_scale.fit_transform(Features)

In [37]:
scaledFeatures[:2]

array([[ 0.        ,  0.        ,  0.36116884,  0.        ,  0.        ,
         0.41250333,  0.        ,  0.        ,  1.        ],
       [ 0.        ,  1.        ,  0.00939458,  0.125     ,  0.22222222,
         0.2958059 ,  0.        ,  0.        ,  1.        ]])

In [38]:
msk=numpy.random.rand(len(all_df))<0.8
train_df=all_df[msk]
test_df=all_df[~msk]

In [62]:
print('total:',len(all_df),'train:',len(train_df),'test:',len(test_df))

total: 1309 train: 1038 test: 271


In [63]:
def PreprocessData(raw_df):
    df=raw_df.drop(['name'],axis=1)
    age_mean =df['age'].mean()
    df['age']=df['age'].fillna(age_mean)
    fare_mean =df['fare'].mean()
    df['fare']=df['fare'].fillna(fare_mean)
    df['sex']=df['sex'].map({'female':0,'male':1}).astype(int)
    x_OneHot_df=pd.get_dummies(data=df, columns=["embarked"])
    
    ndarray =x_OneHot_df.values
    Features=ndarray[:,1:]
    Label =ndarray[:,0]
    
    minmax_scale=preprocessing.MinMaxScaler(feature_range=(0,1))
    scaledFeatures=minmax_scale.fit_transform(Features)
    
    return scaledFeatures, Label

In [64]:
train_Features, train_Label=PreprocessData(train_df)
test_Features,test_Label=PreprocessData(test_df)

In [65]:
train_Features[:2]

array([[ 0.        ,  0.        ,  0.3598329 ,  0.        ,  0.        ,
         0.41250333,  0.        ,  0.        ,  1.        ],
       [ 0.        ,  1.        ,  0.00732301,  0.125     ,  0.22222222,
         0.2958059 ,  0.        ,  0.        ,  1.        ]])

In [66]:
train_Label[:2]

array([ 1.,  1.])

In [67]:
import numpy
import pandas as pd
from sklearn import preprocessing
numpy.random.seed(10)

In [68]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [53]:
model=Sequential()

In [69]:
model.add(Dense(units=40, input_dim=9,kernel_initializer='uniform',activation='relu'))

In [70]:
model.add(Dense(units=30,kernel_initializer='uniform',activation='relu'))

In [71]:
model.add(Dense(units=1,kernel_initializer='uniform',activation='sigmoid'))

In [76]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [77]:
train_history=model.fit(x=train_Features,y=train_Label,validation_split=0.1,epochs=30,batch_size=30,verbose=2)

Train on 934 samples, validate on 104 samples
Epoch 1/30
0s - loss: 0.6749 - acc: 0.5964 - val_loss: 0.6128 - val_acc: 0.7500
Epoch 2/30
0s - loss: 0.6747 - acc: 0.5964 - val_loss: 0.6155 - val_acc: 0.7500
Epoch 3/30
0s - loss: 0.6747 - acc: 0.5964 - val_loss: 0.6142 - val_acc: 0.7500
Epoch 4/30
0s - loss: 0.6746 - acc: 0.5964 - val_loss: 0.6143 - val_acc: 0.7500
Epoch 5/30
0s - loss: 0.6748 - acc: 0.5964 - val_loss: 0.6137 - val_acc: 0.7500
Epoch 6/30
0s - loss: 0.6746 - acc: 0.5964 - val_loss: 0.6116 - val_acc: 0.7500
Epoch 7/30
0s - loss: 0.6749 - acc: 0.5964 - val_loss: 0.6100 - val_acc: 0.7500
Epoch 8/30
0s - loss: 0.6744 - acc: 0.5964 - val_loss: 0.6177 - val_acc: 0.7500
Epoch 9/30
0s - loss: 0.6747 - acc: 0.5964 - val_loss: 0.6146 - val_acc: 0.7500
Epoch 10/30
0s - loss: 0.6749 - acc: 0.5964 - val_loss: 0.6131 - val_acc: 0.7500
Epoch 11/30
0s - loss: 0.6747 - acc: 0.5964 - val_loss: 0.6161 - val_acc: 0.7500
Epoch 12/30
0s - loss: 0.6747 - acc: 0.5964 - val_loss: 0.6134 - val_acc

In [78]:
import matplotlib.pyplot as plt
def show_train_history(train_history,train,validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [80]:
show_train_history(train_history,'acc','val_acc')

In [81]:
scores = model.evaluate(x=test_Features, 
                        y=test_Label)

 32/271 [==>...........................] - ETA: 0s

In [82]:
scores[1]

0.64206642110409329

In [83]:
Jack = pd.Series([0 ,'Jack',3, 'male'  , 23, 1, 0,  5.0000,'S'])
Rose = pd.Series([1 ,'Rose',1, 'female', 20, 1, 0, 100.0000,'S'])

In [84]:
JR_df = pd.DataFrame([list(Jack),list(Rose)],  
                  columns=['survived', 'name','pclass', 'sex', 
                   'age', 'sibsp','parch', 'fare','embarked'])

In [85]:
all_df=pd.concat([all_df,JR_df])

In [86]:
all_df[-2:]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
0,0,Jack,3,male,23.0,1,0,5.0,S
1,1,Rose,1,female,20.0,1,0,100.0,S


In [87]:
all_Features,Label=PreprocessData(all_df)

In [88]:
all_probability=model.predict(all_Features)

In [89]:
all_probability[:10]

array([[ 0.41284981],
       [ 0.41284981],
       [ 0.41284981],
       [ 0.41284981],
       [ 0.41284981],
       [ 0.41284981],
       [ 0.41284981],
       [ 0.41284981],
       [ 0.41284981],
       [ 0.41284981]], dtype=float32)

In [90]:
pd=all_df
pd.insert(len(all_df.columns),
          'probability',all_probability)

In [91]:
pd[-2:]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked,probability
0,0,Jack,3,male,23.0,1,0,5.0,S,0.41285
1,1,Rose,1,female,20.0,1,0,100.0,S,0.41285
