## Random forest combines many decision trees to create predictions; each tree is trained on a random sample of the rows and features, so the trees are largely uncorrelated  
## This usually works better than a single decision tree

In [1]:
import os
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Switch the working directory to the project folder so that the relative
# "data/..." paths used by later cells resolve.
# NOTE(review): this absolute path is machine-specific.
os.chdir("C:/Users/ab/Documents/GitHub/data-science-lectures-monte")

In [3]:
# Load the Titanic training data into a DataFrame (the path is relative to the
# current working directory).
df = pd.read_csv("data/titanic_train.csv")

In [4]:
# Preview the first rows to see which columns are available.
print(df.head())

   passenger_id  pclass                                               name  \
0          1216       3                                 Smyth, Miss. Julia   
1           699       3                                    Cacic, Mr. Luka   
2          1267       3  Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...   
3           449       2              Hocking, Mrs. Elizabeth (Eliza Needs)   
4           576       2                                    Veal, Mr. James   

      sex   age  sibsp  parch  ticket     fare cabin embarked boat  body  \
0  female   NaN      0      0  335432   7.7333   NaN        Q   13   NaN   
1    male  38.0      0      0  315089   8.6625   NaN        S  NaN   NaN   
2  female  30.0      1      1  345773  24.1500   NaN        S  NaN   NaN   
3  female  54.0      1      3   29105  23.0000   NaN        S    4   NaN   
4    male  40.0      0      0   28221  13.0000   NaN        S  NaN   NaN   

                  home.dest  survived  
0                       NaN       

In [5]:
# Features used by the model: sex, age, pclass, and sibsp.
columns = ["sex", "age", "pclass", "sibsp"]

# Encode sex as a numeric 0/1 column (male -> 0, female -> 1). Series.map
# rebuilds the whole column with a numeric dtype instead of writing integers
# into a column of strings. The 0 -> 0 and 1 -> 1 entries keep this cell safe
# to re-run on an already encoded frame; any other value becomes NaN.
df["sex"] = df["sex"].map({"male": 0, "female": 1, 0: 0, 1: 1})

# Fill missing ages with the median age.
df["age"] = df["age"].fillna(df["age"].median())

In [6]:
# Split the frame into the feature matrix (x) and the target column (y).
x, y = df[columns], df["survived"]

In [7]:
# Create the random forest (1000 trees) and fit it on the training features.
# random_state pins the randomness used while building the trees so the model,
# and the numbers printed from it, are reproducible from run to run.
rf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf.fit(x, y)

# NOTE: this is the accuracy on the same rows the model was fitted on, so it
# overstates how well the model will do on unseen passengers.
print(rf.score(x, y))

0.9011764705882352


In [8]:
# Extremely high accuracy, the highest we've seen -- but it is measured on the
# training data itself, so it is an optimistic estimate.
# Now we predict the training values and place them against the actual y values
# to see the false positives and false negatives
# (confusion_matrix rows are the actual classes, columns the predicted classes).

y_pred = rf.predict(x)
print(confusion_matrix(y, y_pred))

[[502  35]
 [ 49 264]]


In [9]:
# Load the test set and apply the same cleaning steps as the training data.
test = pd.read_csv("data/titanic_test.csv")

# Encode sex as a numeric 0/1 column (male -> 0, female -> 1). The 0 -> 0 and
# 1 -> 1 entries keep this cell safe to re-run; any other value becomes NaN.
test["sex"] = test["sex"].map({"male": 0, "female": 1, 0: 0, 1: 1})

# Impute missing ages with the TRAINING median rather than the test median, so
# the test rows go through exactly the transformation the model was fitted on.
# df["age"] was already filled with its own median, which leaves that median
# unchanged.
test["age"] = test["age"].fillna(df["age"].median())

In [10]:
# Keep every test row but only the feature columns the model was trained on.
x_test = test.loc[:, columns]

In [11]:
# Predict the survived label for every test passenger with the fitted forest
# and print the resulting array of predictions.
pred = rf.predict(x_test)
print(pred)

[0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 1
 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0
 1 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 1
 0 0 0 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 1 0 0 1 1 0 1 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0
 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1
 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 1 0 0 1
 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0
 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 1 0 1 0 1 0 0 0 1 0
 1 0 0 0 0 1 0 0 1 1 0 0 1 0 1]


In [12]:
# Build the submission table: one row per test passenger with its predicted
# label. Passing a dict of columns names each column directly and raises if
# the two columns differ in length, instead of silently truncating like zip().
dataset = pd.DataFrame(
    {
        "passenger_id": test["passenger_id"].to_list(),
        "survived": pred.tolist(),
    }
)

In [13]:
# Sanity-check the first rows of the submission table.
print(dataset.head())

   passenger_id  survived
0           295         0
1          1150         0
2            89         1
3          1063         0
4          1020         0


In [14]:
# Move into the submissions folder and write the predictions there as a CSV.
# index=False leaves the DataFrame index out of the file.
# NOTE(review): the absolute path is machine-specific, and the working
# directory stays changed for any code that runs after this cell.
os.chdir("C:/Users/ab/Documents/GitHub/data-science-lectures-monte/submissions")
dataset.to_csv("submission_random_forest.csv", index=False)

## The accuracy ends up being 65%, which is low, probably because we didn't choose the right variables  
## It could also be due to the number of decision trees created