In [30]:
# --- Imports ---------------------------------------------------------------
import joblib  # used to write the trained model to a file
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier  # the machine learning algorithm
from sklearn.model_selection import train_test_split  # train / test split helper

# --- pandas configuration --------------------------------------------------
# Opt in to the future pandas behaviour that disables silent downcasting
pd.set_option("future.no_silent_downcasting", True)

# Switch off the pandas chained-assignment warning
pd.options.mode.chained_assignment = None


In [56]:
# Load the real passenger records. Missing values appear as NaN ("Not a Number")
# in the frame; they are handled in the cleaning steps that follow.
train_csv = "titanic_train.csv"
data = pd.read_csv(train_csv)
data.head()

Unnamed: 0.1,Unnamed: 0,row.names,pclass,survived,name,age,embarked,home.dest,room,ticket,boat,sex
0,998,999,3rd,1,"McCarthy, Miss Katie",,,,,,,female
1,179,180,1st,0,"Millet, Mr Francis Davis",65.0,Southampton,"East Bridgewater, MA",,,(249),male
2,556,557,2nd,0,"Sjostedt, Mr Ernst Adolf",59.0,Southampton,"Sault St Marie, ON",,,,male
3,174,175,1st,0,"McCaffry, Mr Thomas Francis",46.0,Cherbourg,"Vancouver, BC",,,(292),male
4,1232,1233,3rd,0,"Strilic, Mr Ivan",,,,,,,male


In [58]:
# Show the column labels of the loaded frame (quick sanity check)
data.columns

Index(['Unnamed: 0', 'row.names', 'pclass', 'survived', 'name', 'age',
       'embarked', 'home.dest', 'room', 'ticket', 'boat', 'sex'],
      dtype='object')

In [60]:
# Median of the recorded ages; used later to fill in the missing ones
median_age = data["age"].median()
print(f"Median age is {median_age}")

Median age is 29.0


In [62]:
# Some passengers have no recorded age; substitute the median so every row has a numeric age
data = data.assign(age=data["age"].fillna(median_age))
data["age"].head()

0    29.0
1    65.0
2    59.0
3    46.0
4    29.0
Name: age, dtype: float64

In [64]:
# Feature selection for the model: passenger class, age and sex
# (motivated by the "women and children first, then first class" priority).
# .copy() makes data_inputs an independent frame, so the column assignments in
# later cells modify it directly instead of writing to a selection of `data`
# (the pattern that triggers pandas' SettingWithCopyWarning).
data_inputs = data[["pclass", "age", "sex"]].copy()
data_inputs.head()

Unnamed: 0,pclass,age,sex
0,3rd,29.0,female
1,1st,65.0,male
2,2nd,59.0,male
3,1st,46.0,male
4,3rd,29.0,male


In [66]:
# Target labels the model learns to predict: 1 = lived, 0 = died.
# Kept as a one-column DataFrame.
# Note: as a "Hello world" example this data set is very small; real-world
# projects (say, sports data) typically feed in far larger volumes of data.
expected_output = data.loc[:, ["survived"]]
expected_output.head()

Unnamed: 0,survived
0,1
1,0
2,0
3,0
4,0


In [68]:
# The estimator needs numeric features, so encode the class labels as integers.
# One mapping replaces the three separate replace() calls, and the explicit cast
# gives the column an integer dtype instead of leaving it as `object`
# (with "future.no_silent_downcasting" enabled, replace() alone does not downcast).
# The cast raises if a label outside the mapping (or a missing value) is present,
# rather than letting it flow on to the model.
pclass_codes = {"1st": 1, "2nd": 2, "3rd": 3}
data_inputs["pclass"] = data_inputs["pclass"].replace(pclass_codes).astype(int)
data_inputs.head()

Unnamed: 0,pclass,age,sex
0,3,29.0,female
1,1,65.0,male
2,2,59.0,male
3,1,46.0,male
4,3,29.0,male


In [70]:
# Encode sex numerically for the model: "female" -> 0, every other value -> 1.
# The comparison reads the untouched `data["sex"]` column rather than
# `data_inputs["sex"]`, so re-running this cell is safe: comparing the
# already-encoded 0/1 column against "female" would turn every row into 1.
data_inputs["sex"] = np.where(data["sex"] == "female", 0, 1)
data_inputs.head()

Unnamed: 0,pclass,age,sex
0,3,29.0,0
1,1,65.0,1
2,2,59.0,1
3,1,46.0,1
4,3,29.0,1


In [72]:
# Split the rows into a training set and a held-out test set.
# Arguments: the input features first, then the expected outputs;
# test_size is the fraction of rows placed in the test set, and
# random_state fixes the shuffle so the split is the same on every run.
(
    inputs_train,
    inputs_test,
    expected_output_train,
    expected_output_test,
) = train_test_split(
    data_inputs,
    expected_output,
    test_size=0.33,
    random_state=42,
)

print(inputs_train.head())
print(expected_output_train.head())

    pclass   age  sex
618      3  19.0    1
169      3  29.0    1
830      1  54.0    1
140      3  29.0    1
173      2  28.0    1
     survived
618         0
169         0
830         1
140         0
173         0


In [74]:
# Moment of truth: set up the machine learning model.
# `rf` is short for "random forest"; this only builds the (still untrained) classifier object.
# n_estimators is the number of trees in the forest (100 is the default).
# random_state seeds the forest's internal randomness so the trained model and the
# accuracy computed from it are reproducible; without it every run can give a
# slightly different score.
rf = RandomForestClassifier(n_estimators=100, random_state=42)


In [76]:
# Train the forest on the training rows; the one-column label frame is
# flattened to a 1-D array before being passed in
rf.fit(inputs_train, expected_output_train.to_numpy().ravel())

In [78]:
# Test how accurate the trained model is (like testing a dog on its tricks):
# feed in the held-out inputs and compare the predictions with the expected output
accuracy = rf.score(inputs_test, expected_output_test)
print(f"Accuracy = {accuracy * 100}%")

Accuracy = 78.61842105263158%


In [80]:
# Save the trained model to disk for future use (compress=9 requests the strongest compression)
model_path = "titanic_model1"
joblib.dump(rf, model_path, compress=9)

['titanic_model1']