In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns

### Donner Party

In [2]:
donner = pd.read_csv("donner.csv")

In [3]:
donner.head()

Unnamed: 0,Age,Sex,Status
0,23,Male,Died
1,40,Female,Survived
2,40,Male,Survived
3,30,Male,Died
4,28,Male,Died


Since both categorical variables are binary, we can encode with one column zeros and ones. We also need to create an intercept column to pass to the model.

In [4]:
donner["Female"] = donner["Sex"].apply(lambda x: 1 if x=="Female" else 0)
donner["Survived"] = donner["Status"].apply(lambda x: 1 if x=="Survived" else 0)
donner["intercept"] = 1.0

In [5]:
donner.head()

Unnamed: 0,Age,Sex,Status,Female,Survived,intercept
0,23,Male,Died,0,0,1.0
1,40,Female,Survived,1,1,1.0
2,40,Male,Survived,0,1,1.0
3,30,Male,Died,0,0,1.0
4,28,Male,Died,0,0,1.0


Run a logistic regression model to predict survival based on the predictor age.

In [6]:
train_cols = ["Age", "intercept"]
logit_age = sm.Logit(donner["Survived"], donner[train_cols])
result_age = logit_age.fit()
result_age.summary()

Optimization terminated successfully.
         Current function value: 0.625452
         Iterations 6


0,1,2,3
Dep. Variable:,Survived,No. Observations:,45.0
Model:,Logit,Df Residuals:,43.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 16 Nov 2016",Pseudo R-squ.:,0.08954
Time:,20:27:45,Log-Likelihood:,-28.145
converged:,True,LL-Null:,-30.913
,,LLR p-value:,0.01863

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Age,-0.0665,0.032,-2.063,0.039,-0.130 -0.003
intercept,1.8185,0.999,1.820,0.069,-0.140 3.777


Add sex as a predictor.

In [7]:
train_cols = ["Age", "Female", "intercept"]
logit_all = sm.Logit(donner["Survived"], donner[train_cols])
result_all = logit_all.fit()
result_all.summary()

Optimization terminated successfully.
         Current function value: 0.569514
         Iterations 6


0,1,2,3
Dep. Variable:,Survived,No. Observations:,45.0
Model:,Logit,Df Residuals:,42.0
Method:,MLE,Df Model:,2.0
Date:,"Wed, 16 Nov 2016",Pseudo R-squ.:,0.171
Time:,20:27:45,Log-Likelihood:,-25.628
converged:,True,LL-Null:,-30.913
,,LLR p-value:,0.005066

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Age,-0.0782,0.037,-2.097,0.036,-0.151 -0.005
Female,1.5973,0.756,2.114,0.034,0.117 3.078
intercept,1.6331,1.110,1.471,0.141,-0.543 3.809


### Birdkeepers and Lung Cancer

In [28]:
bird = pd.read_csv("birds.csv")

In [9]:
bird.head()

Unnamed: 0,LC,FM,SS,BK,AG,YR,CD
0,LungCancer,Male,Low,Bird,37,19,12
1,LungCancer,Male,Low,Bird,41,22,15
2,LungCancer,Male,High,NoBird,43,19,15
3,LungCancer,Male,Low,Bird,46,24,15
4,LungCancer,Male,Low,Bird,49,31,20


In [29]:
bird = bird.join(pd.get_dummies(bird["LC"], prefix="LC"))
bird = bird.join(pd.get_dummies(bird["FM"], prefix="FM"))
bird = bird.join(pd.get_dummies(bird["SS"], prefix="SS"))
bird = bird.join(pd.get_dummies(bird["BK"], prefix="BK"))
bird["intercept"] = 1.0

In [31]:
bird.head()

Unnamed: 0,LC,FM,SS,BK,AG,YR,CD,LC_LungCancer,LC_NoCancer,FM_Female,FM_Male,SS_High,SS_Low,BK_Bird,BK_NoBird,intercept
0,LungCancer,Male,Low,Bird,37,19,12,1,0,0,1,0,1,1,0,1.0
1,LungCancer,Male,Low,Bird,41,22,15,1,0,0,1,0,1,1,0,1.0
2,LungCancer,Male,High,NoBird,43,19,15,1,0,0,1,1,0,0,1,1.0
3,LungCancer,Male,Low,Bird,46,24,15,1,0,0,1,0,1,1,0,1.0
4,LungCancer,Male,Low,Bird,49,31,20,1,0,0,1,0,1,1,0,1.0


In [34]:
train_cols = ["FM_Female", "SS_High", "BK_Bird", "AG", "YR", "CD", "intercept"]
logit = sm.Logit(bird["LC_LungCancer"], bird[train_cols])
result = logit.fit()
result.summary()

Optimization terminated successfully.
         Current function value: inf
         Iterations 7


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


0,1,2,3
Dep. Variable:,LC_LungCancer,No. Observations:,147.0
Model:,Logit,Df Residuals:,140.0
Method:,MLE,Df Model:,6.0
Date:,"Wed, 16 Nov 2016",Pseudo R-squ.:,-inf
Time:,20:48:23,Log-Likelihood:,-inf
converged:,True,LL-Null:,-17376.0
,,LLR p-value:,1.0

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
FM_Female,0.5613,0.531,1.057,0.291,-0.480 1.602
SS_High,0.1054,0.469,0.225,0.822,-0.813 1.024
BK_Bird,1.3626,0.411,3.313,0.001,0.557 2.169
AG,-0.0398,0.035,-1.120,0.263,-0.109 0.030
YR,0.0729,0.026,2.751,0.006,0.021 0.125
CD,0.0260,0.026,1.019,0.308,-0.024 0.076
intercept,-1.9374,1.804,-1.074,0.283,-5.474 1.599
