# Random Forest vs Logistic Regression

In [1]:
from pandas import DataFrame, Series
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split


In [2]:
#Read in Titanic Data
titanic = pd.read_csv("../../datasets/titanic/train.csv")

titanic_only = pd.get_dummies(titanic,columns=['Sex','Pclass','Embarked'],drop_first=True)

In [3]:
#Drop columns we don't care about (yet) or have missing values (Models don't like missing values)
titanic_only.drop(['PassengerId','Name','Ticket','Age','Cabin'],axis=1,inplace=True)

In [4]:
#Train Test Splitting
local_train, local_test = train_test_split(titanic_only,test_size=0.2,random_state=123)
local_train.shape
local_test.shape

local_train_y = local_train["Survived"]
local_train_x = local_train.drop(["Survived"],axis=1)
local_test_y = local_test["Survived"]
local_test_x = local_test.drop("Survived",axis=1)

In [5]:
#The Logistic Regression Model
clf = sm.Logit(local_train_y,local_train_x)
result = clf.fit()
preds = result.predict(local_test_x)

Optimization terminated successfully.
         Current function value: 0.497509
         Iterations 6


In [6]:
#Accuracy of Logistic Model
np.mean((preds > 0.5) == local_test_y)

0.8044692737430168

In [7]:
#The Random Forest Model
clf = RandomForestClassifier(n_estimators=100)
clf.fit(local_train_x,local_train_y)
preds = clf.predict_proba(local_test_x)
preds

array([[ 0.69      ,  0.31      ],
       [ 0.86251132,  0.13748868],
       [ 0.97333333,  0.02666667],
       [ 0.88283333,  0.11716667],
       [ 0.7875    ,  0.2125    ],
       [ 1.        ,  0.        ],
       [ 0.02741703,  0.97258297],
       [ 0.12902858,  0.87097142],
       [ 0.75916667,  0.24083333],
       [ 0.63      ,  0.37      ],
       [ 0.88      ,  0.12      ],
       [ 0.18483333,  0.81516667],
       [ 0.03      ,  0.97      ],
       [ 0.87476399,  0.12523601],
       [ 0.035     ,  0.965     ],
       [ 0.16      ,  0.84      ],
       [ 0.01      ,  0.99      ],
       [ 1.        ,  0.        ],
       [ 0.77      ,  0.23      ],
       [ 0.97016667,  0.02983333],
       [ 0.6       ,  0.4       ],
       [ 0.        ,  1.        ],
       [ 0.81926004,  0.18073996],
       [ 0.98      ,  0.02      ],
       [ 0.99      ,  0.01      ],
       [ 0.99      ,  0.01      ],
       [ 1.        ,  0.        ],
       [ 0.01      ,  0.99      ],
       [ 1.        ,

In [8]:
#Check order of classes
clf.classes_

array([0, 1])

In [9]:
#Accuracy of Logistic Model
preds.shape
np.mean((preds[:,1] > 0.5) == local_test_y)

0.82681564245810057