In [632]:
import pandas as pd
import numpy as np

loc = 'titanic3.xls'
dfraw=pd.read_excel(loc)

In [633]:
dfraw

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48.0000,0,0,19952,26.5500,E12,S,3,,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0000,1,0,13502,77.9583,D7,S,10,,"Hudson, NY"
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0000,0,0,112050,0.0000,A36,S,,,"Belfast, NI"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0000,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0000,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"


# Data Cleaning

### This dataset needs dummies for pclass

In [634]:
dfraw['pclassone'] = np.where((dfraw['pclass']==1),1,0)
dfraw['pclasstwo'] = np.where((dfraw['pclass']==2),1,0)

### There are no missing values for pclass

In [635]:
len(dfraw[dfraw["pclass"].isnull()])

0

### This dataset needs some dummies for sex

In [636]:
dfraw['sexnum'] = np.where((dfraw["sex"]=="male"),1,0)

### Some research about the titanic tells me that cabins correspond to different levels of the ship, and the higher your class was the higher your cabin. To avoid collinearity, and to make my life easier, I am making a decision to leave cabin out as a predictor

In [637]:
len(dfraw[dfraw["home.dest"].isnull()])

564

### Similar things could be said about home destination, and since there are 564 missing destinations, this seems like a good variable to leave alone

In [638]:
len(dfraw[dfraw["embarked"].isnull()])

2

### Embarked has only two missing ports of departure, so dummies will be codes for the ports we have

In [639]:
pd.pivot_table(dfraw, index = "embarked", values = "survived")

Unnamed: 0_level_0,survived
embarked,Unnamed: 1_level_1
C,0.555556
Q,0.357724
S,0.332604


### Here we can see there are three categories, so we need three dummies with the default being the two people who didn't have a port of departure

In [640]:
dfraw['portc'] = np.where((dfraw['embarked']=="C"),1,0)
dfraw['portq'] = np.where((dfraw['embarked']=="Q"),1,0)
dfraw['ports'] = np.where((dfraw['embarked']=="S"),1,0)

In [641]:
dfraw.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,pclassone,pclasstwo,sexnum,portc,portq,ports
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO",1,0,0,0,0,1
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON",1,0,1,0,0,1
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",1,0,0,0,0,1
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",1,0,1,0,0,1
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",1,0,0,0,0,1


In [642]:
# our df has so many columns its hard to see what we're doing
dfraw[['embarked', "ports", "portc", "portq"]].head(20)

Unnamed: 0,embarked,ports,portc,portq
0,S,1,0,0
1,S,1,0,0
2,S,1,0,0
3,S,1,0,0
4,S,1,0,0
5,S,1,0,0
6,S,1,0,0
7,S,1,0,0
8,S,1,0,0
9,C,0,1,0


### Unsurpisingly 'body', which I believe is the person's weight, has so many missing values that it doesn't correlate with anything.

In [643]:
# There are 1188 missing observations under body
len(dfraw[dfraw["body"].isnull()])

1188

In [644]:
# out of a 1309 observation dataset
len(dfraw)

1309

### Is the survival probability for those with a missing bodyweight different than those with a recorded bodyweight?

In [645]:
dfraw["hasbody"] = np.where((dfraw["body"].isnull()),0,1)

pd.pivot_table(dfraw, index = "survived", values="hasbody")

Unnamed: 0_level_0,hasbody
survived,Unnamed: 1_level_1
0,0.149567
1,0.0


### None of the survivors had their body weights recorded, so it seems unlikely that bodyweight observations could be usefull in predicting survival.

In [646]:
len(dfraw[dfraw["age"].isnull()])

263

### age has 263 missing observations

In [647]:
dfraw["hasage"] = np.where(dfraw["age"].isnull(),0,1)
dfraw.loc[dfraw["age"].isnull()]

pd.pivot_table(dfraw, index = 'survived', values = "hasage")

Unnamed: 0_level_0,hasage
survived,Unnamed: 1_level_1
0,0.765142
1,0.854


### 76.5% of those who didn't survive had an age and 85.4% of those who did survive didn't have an age. Given that well over half of those who did and didn't make it both had a recorded age, it seems unlikely that removing age or replacing age with a mean age would be too problematic for our analysis

In [648]:
dfraw["meanage"] = dfraw.age.fillna(dfraw.age.mean())

### I've decided to add a new variable that replaces NaN observations for age with the mean age

### Fare has one nan, so lets drop it

In [649]:
#len(dfraw[dfraw["fare"].isnull()])

In [650]:
#dfraw = dfraw.dropna(subset = ["fare"], )

In [651]:
#len(dfraw[dfraw["fare"].isnull()])

In [652]:
len(dfraw[dfraw["sibsp"].isnull()])

0

In [653]:
len(dfraw[dfraw["parch"].isnull()])

0

### sibsp and parch have no NaN observations

In [654]:
len(dfraw[dfraw["sibsp"].isnull()])

0

In [655]:
len(dfraw[dfraw["parch"].isnull()])

0

### sibsp and parch have no NaN observations

### I'd also like to make a dummy variable for "hasfam" short for "has family" because having family on board might give you more will to live.

In [656]:
dfraw['hasfam'] = np.where(((dfraw["sibsp"] > 0) & (dfraw["parch"] > 0)),1,0)

### After the cleaning we have these new columns

In [657]:
dfraw

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,...,pclassone,pclasstwo,sexnum,portc,portq,ports,hasbody,hasage,meanage,hasfam
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,...,1,0,0,0,0,1,0,1,29.000000,0
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,...,1,0,1,0,0,1,0,1,0.916700,1
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,...,1,0,0,0,0,1,0,1,2.000000,1
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,...,1,0,1,0,0,1,1,1,30.000000,1
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,...,1,0,0,0,0,1,0,1,25.000000,1
5,1,1,"Anderson, Mr. Harry",male,48.0000,0,0,19952,26.5500,E12,...,1,0,1,0,0,1,0,1,48.000000,0
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0000,1,0,13502,77.9583,D7,...,1,0,0,0,0,1,0,1,63.000000,0
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0000,0,0,112050,0.0000,A36,...,1,0,1,0,0,1,0,1,39.000000,0
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0000,2,0,11769,51.4792,C101,...,1,0,0,0,0,1,0,1,53.000000,0
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0000,0,0,PC 17609,49.5042,,...,1,0,1,1,0,0,1,1,71.000000,0


### The one variable I have overlooked so far is ticket, but I'm not sure I can cleanly extract what information is contained there, so I am simply leaving it out for the time being.

# Preliminary Analysis

### The variables we are left with to correlate with survived are: pclassone, pclasstwo, pclassthree(implied), sexnum, meanage, sibsp, parch, fare, portc, portq, noport(implied), ports and hasfam

In [658]:
dfraw[["survived", "pclassone", "pclasstwo", "sexnum", "meanage", "sibsp", "parch", "fare", "portc", "portq", "ports", "hasfam"]].corr()

Unnamed: 0,survived,pclassone,pclasstwo,sexnum,meanage,sibsp,parch,fare,portc,portq,ports,hasfam
survived,1.0,0.279449,0.05079,-0.528693,-0.050199,-0.027825,0.08266,0.244265,0.182123,-0.016071,-0.154558,0.066122
pclassone,0.279449,1.0,-0.296526,-0.107371,0.362587,-0.034256,-0.013033,0.600031,0.325722,-0.166101,-0.187353,-0.038107
pclasstwo,0.05079,-0.296526,1.0,-0.028862,-0.014193,-0.052419,-0.010057,-0.121384,-0.134675,-0.121973,0.197973,0.017504
sexnum,-0.528693,-0.107371,-0.028862,1.0,0.057398,-0.109609,-0.213125,-0.185523,-0.066564,-0.088651,0.119504,-0.138735
meanage,-0.050199,0.362587,-0.014193,0.057398,1.0,-0.190747,-0.130872,0.171892,0.076179,-0.012718,-0.064267,-0.220648
sibsp,-0.027825,-0.034256,-0.052419,-0.109609,-0.190747,1.0,0.373587,0.160238,-0.048396,-0.048678,0.075198,0.616986
parch,0.08266,-0.013033,-0.010057,-0.213125,-0.130872,0.373587,1.0,0.221539,-0.008635,-0.100943,0.073258,0.636915
fare,0.244265,0.600031,-0.121384,-0.185523,0.171892,0.160238,0.221539,1.0,0.286269,-0.130059,-0.172683,0.184786
portc,0.182123,0.325722,-0.134675,-0.066564,0.076179,-0.048396,-0.008635,0.286269,1.0,-0.164166,-0.775441,-0.023282
portq,-0.016071,-0.166101,-0.121973,-0.088651,-0.012718,-0.048678,-0.100943,-0.130059,-0.164166,1.0,-0.489874,-0.088844


### The correlations look non-zero for the most part. I will leave all of these variables in when running the first regression

# Regression Analysis

### Run a lineary probability model using the above coefficients

In [659]:
import statsmodels.formula.api as sm
reg1 = sm.ols(formula="survived ~ pclassone + pclasstwo + sexnum + meanage + sibsp + parch + fare + portc + portq + ports + hasfam", data =  dfraw).fit()
reg1.summary()

0,1,2,3
Dep. Variable:,survived,R-squared:,0.373
Model:,OLS,Adj. R-squared:,0.368
Method:,Least Squares,F-statistic:,70.23
Date:,"Fri, 12 Oct 2018",Prob (F-statistic):,2.7800000000000003e-123
Time:,23:41:31,Log-Likelihood:,-606.25
No. Observations:,1308,AIC:,1237.0
Df Residuals:,1296,BIC:,1299.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.9430,0.277,3.406,0.001,0.400,1.486
pclassone,0.3125,0.036,8.670,0.000,0.242,0.383
pclasstwo,0.1471,0.029,5.157,0.000,0.091,0.203
sexnum,-0.4949,0.024,-20.991,0.000,-0.541,-0.449
meanage,-0.0054,0.001,-5.821,0.000,-0.007,-0.004
sibsp,-0.0575,0.013,-4.347,0.000,-0.083,-0.032
parch,-0.0224,0.017,-1.353,0.176,-0.055,0.010
fare,0.0002,0.000,0.676,0.499,-0.000,0.001
portc,-0.1008,0.275,-0.366,0.714,-0.641,0.439

0,1,2,3
Omnibus:,52.08,Durbin-Watson:,1.813
Prob(Omnibus):,0.0,Jarque-Bera (JB):,57.746
Skew:,0.515,Prob(JB):,2.89e-13
Kurtosis:,3.001,Cond. No.,3330.0


### The coefficient of fare seems rather small, and perhaps is not suiting our analysis well. Instead lets make it a log fare and see what a percent increase in fare amount does to your chances of survival on the titanic

In [660]:
# np.log(df['norm'])
dfraw["logfare"] = np.log(dfraw["fare"])

  


### Apparently someone paid a zero dollar fare, that doesn't help

In [661]:
dfraw.loc[dfraw["fare"]==0]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,...,pclasstwo,sexnum,portc,portq,ports,hasbody,hasage,meanage,hasfam,logfare
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,A36,...,0,1,0,0,1,0,1,39.0,0,-inf
70,1,0,"Chisholm, Mr. Roderick Robert Crispin",male,,0,0,112051,0.0,,...,0,1,0,0,1,0,0,29.881135,0,-inf
125,1,0,"Fry, Mr. Richard",male,,0,0,112058,0.0,B102,...,0,1,0,0,1,0,0,29.881135,0,-inf
150,1,0,"Harrison, Mr. William",male,40.0,0,0,112059,0.0,B94,...,0,1,0,0,1,1,1,40.0,0,-inf
170,1,1,"Ismay, Mr. Joseph Bruce",male,49.0,0,0,112058,0.0,B52 B54 B56,...,0,1,0,0,1,0,1,49.0,0,-inf
223,1,0,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0.0,,...,0,1,0,0,1,0,0,29.881135,0,-inf
234,1,0,"Reuchlin, Jonkheer. John George",male,38.0,0,0,19972,0.0,,...,0,1,0,0,1,0,1,38.0,0,-inf
363,2,0,"Campbell, Mr. William",male,,0,0,239853,0.0,,...,1,1,0,0,1,0,0,29.881135,0,-inf
384,2,0,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0.0,,...,1,1,0,0,1,0,0,29.881135,0,-inf
410,2,0,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0.0,,...,1,1,0,0,1,0,0,29.881135,0,-inf


In [662]:
dfraw = dfraw.replace([-np.inf,], 0)

In [663]:
dfraw.fare.loc[dfraw["logfare"]==0] 

7       0.0
70      0.0
125     0.0
150     0.0
170     0.0
223     0.0
234     0.0
363     0.0
384     0.0
410     0.0
473     0.0
528     0.0
581     0.0
896     0.0
898     0.0
963     0.0
1254    0.0
Name: fare, dtype: float64

In [664]:
dfraw

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,...,pclasstwo,sexnum,portc,portq,ports,hasbody,hasage,meanage,hasfam,logfare
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,...,0,0,0,0,1,0,1,29.000000,0,5.353456
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,...,0,1,0,0,1,0,1,0.916700,1,5.020916
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,...,0,0,0,0,1,0,1,2.000000,1,5.020916
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,...,0,1,0,0,1,1,1,30.000000,1,5.020916
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,...,0,0,0,0,1,0,1,25.000000,1,5.020916
5,1,1,"Anderson, Mr. Harry",male,48.0000,0,0,19952,26.5500,E12,...,0,1,0,0,1,0,1,48.000000,0,3.279030
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0000,1,0,13502,77.9583,D7,...,0,0,0,0,1,0,1,63.000000,0,4.356174
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0000,0,0,112050,0.0000,A36,...,0,1,0,0,1,0,1,39.000000,0,0.000000
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0000,2,0,11769,51.4792,C101,...,0,0,0,0,1,0,1,53.000000,0,3.941178
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0000,0,0,PC 17609,49.5042,,...,0,1,1,0,0,1,1,71.000000,0,3.902058


### It appears we have successfully replaced all of the -infinities with zero, so that we now have a proper variable for logfare, lets regress a second time

In [665]:
reg2 = sm.ols(formula="survived ~ pclassone + pclasstwo + sexnum + meanage + sibsp + parch + logfare + portc + portq + ports + hasfam", data =  dfraw).fit()
reg2.summary()

0,1,2,3
Dep. Variable:,survived,R-squared:,0.374
Model:,OLS,Adj. R-squared:,0.369
Method:,Least Squares,F-statistic:,70.52
Date:,"Fri, 12 Oct 2018",Prob (F-statistic):,1.03e-123
Time:,23:41:32,Log-Likelihood:,-605.24
No. Observations:,1308,AIC:,1234.0
Df Residuals:,1296,BIC:,1297.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.8623,0.282,3.062,0.002,0.310,1.415
pclassone,0.2746,0.044,6.188,0.000,0.188,0.362
pclasstwo,0.1326,0.030,4.388,0.000,0.073,0.192
sexnum,-0.4908,0.024,-20.679,0.000,-0.537,-0.444
meanage,-0.0054,0.001,-5.798,0.000,-0.007,-0.004
sibsp,-0.0640,0.014,-4.581,0.000,-0.091,-0.037
parch,-0.0281,0.017,-1.654,0.098,-0.061,0.005
logfare,0.0302,0.019,1.573,0.116,-0.007,0.068
portc,-0.0901,0.275,-0.328,0.743,-0.630,0.449

0,1,2,3
Omnibus:,51.895,Durbin-Watson:,1.815
Prob(Omnibus):,0.0,Jarque-Bera (JB):,57.53
Skew:,0.514,Prob(JB):,3.22e-13
Kurtosis:,2.994,Cond. No.,1680.0


### The p value is much improved, but it's still not significant. We can also try a squared fare

In [666]:
dfraw["faresquare"] = dfraw["fare"]*dfraw["fare"]

In [667]:
reg3 = sm.ols(formula="survived ~ pclassone + pclasstwo + sexnum + meanage + sibsp + parch + faresquare + portc + portq + ports + hasfam", data =  dfraw).fit()
reg3.summary()

0,1,2,3
Dep. Variable:,survived,R-squared:,0.374
Model:,OLS,Adj. R-squared:,0.368
Method:,Least Squares,F-statistic:,70.29
Date:,"Fri, 12 Oct 2018",Prob (F-statistic):,2.2700000000000003e-123
Time:,23:41:32,Log-Likelihood:,-606.05
No. Observations:,1308,AIC:,1236.0
Df Residuals:,1296,BIC:,1298.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.9490,0.277,3.427,0.001,0.406,1.492
pclassone,0.3180,0.031,10.139,0.000,0.257,0.380
pclasstwo,0.1485,0.028,5.228,0.000,0.093,0.204
sexnum,-0.4954,0.024,-21.066,0.000,-0.542,-0.449
meanage,-0.0054,0.001,-5.832,0.000,-0.007,-0.004
sibsp,-0.0566,0.013,-4.303,0.000,-0.082,-0.031
parch,-0.0218,0.016,-1.334,0.183,-0.054,0.010
faresquare,6.213e-07,6.68e-07,0.931,0.352,-6.88e-07,1.93e-06
portc,-0.1049,0.275,-0.381,0.703,-0.645,0.435

0,1,2,3
Omnibus:,52.133,Durbin-Watson:,1.811
Prob(Omnibus):,0.0,Jarque-Bera (JB):,57.804
Skew:,0.515,Prob(JB):,2.81e-13
Kurtosis:,3.005,Cond. No.,912000.0


### This specification isn't yield results that are as good as I would have hoped, lets stick with logfare, and this time remove the port of embarkation

In [668]:
reg4 = sm.ols(formula="survived ~ pclassone + pclasstwo + sexnum + meanage + sibsp + parch + logfare + hasfam", data =  dfraw).fit()
reg4.summary()

0,1,2,3
Dep. Variable:,survived,R-squared:,0.369
Model:,OLS,Adj. R-squared:,0.365
Method:,Least Squares,F-statistic:,95.11
Date:,"Fri, 12 Oct 2018",Prob (F-statistic):,2.1e-124
Time:,23:41:32,Log-Likelihood:,-610.51
No. Observations:,1308,AIC:,1239.0
Df Residuals:,1299,BIC:,1286.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6851,0.055,12.411,0.000,0.577,0.793
pclassone,0.2838,0.044,6.456,0.000,0.198,0.370
pclasstwo,0.1206,0.030,4.073,0.000,0.063,0.179
sexnum,-0.4949,0.023,-21.104,0.000,-0.541,-0.449
meanage,-0.0055,0.001,-5.878,0.000,-0.007,-0.004
sibsp,-0.0684,0.014,-4.907,0.000,-0.096,-0.041
parch,-0.0316,0.017,-1.861,0.063,-0.065,0.002
logfare,0.0389,0.019,2.041,0.041,0.002,0.076
hasfam,0.1022,0.045,2.249,0.025,0.013,0.191

0,1,2,3
Omnibus:,50.264,Durbin-Watson:,1.81
Prob(Omnibus):,0.0,Jarque-Bera (JB):,55.542
Skew:,0.505,Prob(JB):,8.69e-13
Kurtosis:,2.982,Cond. No.,204.0


### Most of our predictors are now calibrated to be significant within this linear probability model. Our adjusted r-squared is still a bit low, but we can continue to look for predictors within the data. One specification we can try is including an agesquare coefficient

In [669]:
dfraw["agesquare"] = dfraw["meanage"]*dfraw["meanage"]

In [670]:
reg5 = sm.ols(formula="survived ~ pclassone + pclasstwo + sexnum + meanage + agesquare + sibsp + parch + logfare + hasfam", data =  dfraw).fit()
reg5.summary()

0,1,2,3
Dep. Variable:,survived,R-squared:,0.372
Model:,OLS,Adj. R-squared:,0.368
Method:,Least Squares,F-statistic:,85.39
Date:,"Fri, 12 Oct 2018",Prob (F-statistic):,1.61e-124
Time:,23:41:33,Log-Likelihood:,-607.89
No. Observations:,1308,AIC:,1236.0
Df Residuals:,1298,BIC:,1288.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.7797,0.069,11.307,0.000,0.644,0.915
pclassone,0.2765,0.044,6.282,0.000,0.190,0.363
pclasstwo,0.1171,0.030,3.956,0.000,0.059,0.175
sexnum,-0.4957,0.023,-21.173,0.000,-0.542,-0.450
meanage,-0.0118,0.003,-4.038,0.000,-0.017,-0.006
agesquare,9.521e-05,4.17e-05,2.283,0.023,1.34e-05,0.000
sibsp,-0.0689,0.014,-4.951,0.000,-0.096,-0.042
parch,-0.0329,0.017,-1.942,0.052,-0.066,0.000
logfare,0.0389,0.019,2.045,0.041,0.002,0.076

0,1,2,3
Omnibus:,48.704,Durbin-Watson:,1.806
Prob(Omnibus):,0.0,Jarque-Bera (JB):,53.573
Skew:,0.496,Prob(JB):,2.33e-12
Kurtosis:,3.017,Cond. No.,9680.0


### This did nothing for us...

# Regression Results & Modeling

In [685]:
reg4.params

Intercept    0.685105
pclassone    0.283839
pclasstwo    0.120577
sexnum      -0.494875
meanage     -0.005455
sibsp       -0.068376
parch       -0.031563
logfare      0.038885
hasfam       0.102212
dtype: float64

In [686]:
betas = reg4.params

In [687]:
test = dfraw.loc[1226,["pclassone", "pclasstwo", "sexnum", "meanage", "sibsp", "parch", "logfare", "hasfam"]]

In [688]:
test

pclassone          0
pclasstwo          0
sexnum             1
meanage           19
sibsp              0
parch              0
logfare      2.06633
hasfam             0
Name: 1226, dtype: object

In [689]:
predict(test)

-0.5181752244822657

In [690]:
def predict(x):
    y=x*betas
    z=y.sum()
    return(z)

In [691]:
dfraw["chanceofsurvival"] = 0
for i, row in enumerate(dfraw.values):
                       dfraw.loc[i,["chanceofsurvival"]] = predict(dfraw.loc[i,["pclassone", "pclasstwo", "sexnum", "meanage", "sibsp", "parch", "logfare", "hasfam"]])

In [679]:
dfraw

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,...,portq,ports,hasbody,hasage,meanage,hasfam,logfare,faresquare,agesquare,chanceofsurvival
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,...,0,1,0,1,29.000000,0,5.353456,44663.538906,841.000000,0.333807
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,...,0,1,0,1,0.916700,1,5.020916,22967.402500,0.840339,-0.050086
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,...,0,1,0,1,2.000000,1,5.020916,22967.402500,4.000000,0.438879
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,...,0,1,1,1,30.000000,1,5.020916,22967.402500,900.000000,-0.208744
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,...,0,1,0,1,25.000000,1,5.020916,22967.402500,625.000000,0.313407
5,1,1,"Anderson, Mr. Harry",male,48.0000,0,0,19952,26.5500,E12,...,0,1,0,1,48.000000,0,3.279030,704.902500,2304.000000,-0.345383
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0000,1,0,13502,77.9583,D7,...,0,1,0,1,63.000000,0,4.356174,6077.496539,3969.000000,0.041172
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0000,0,0,112050,0.0000,A36,...,0,1,0,1,39.000000,0,0.000000,0.000000,1521.000000,-0.423792
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0000,2,0,11769,51.4792,C101,...,0,1,0,1,53.000000,0,3.941178,2650.108033,2809.000000,0.011211
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0000,0,0,PC 17609,49.5042,,...,0,0,1,1,71.000000,0,3.902058,2450.665818,5041.000000,-0.446628


In [680]:
dfraw.sort_values("chanceofsurvival", ascending = False)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,...,portq,ports,hasbody,hasage,meanage,hasfam,logfare,faresquare,agesquare,chanceofsurvival
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.5500,C22 C26,...,0,1,0,1,2.000000,1,5.020916,22967.402500,4.0000,0.438879
193,1,1,"Madill, Miss. Georgette Alexandra",female,15.0,0,1,24160,211.3375,B5,...,0,1,0,1,15.000000,0,5.353456,44663.538906,225.0000,0.378618
195,1,1,"Maioni, Miss. Roberta",female,16.0,0,0,110152,86.5000,B79,...,0,1,0,1,16.000000,0,4.460144,7482.250000,256.0000,0.369989
55,1,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0000,B96 B98,...,0,1,0,1,14.000000,1,4.787492,14400.000000,196.0000,0.364339
73,1,1,"Cleaver, Miss. Alice",female,22.0,0,0,113781,151.5500,,...,0,1,0,1,22.000000,0,5.020916,22967.402500,484.0000,0.359063
97,1,1,"Douglas, Mrs. Frederick Charles (Mary Helene B...",female,27.0,1,1,PC 17558,247.5208,B58 B60,...,0,0,0,1,27.000000,1,5.511495,61266.546433,729.0000,0.353136
190,1,1,"Longley, Miss. Gretchen Fiske",female,21.0,0,0,13502,77.9583,D9,...,0,1,0,1,21.000000,0,4.356174,6077.496539,441.0000,0.338669
24,1,1,"Bird, Miss. Ellen",female,29.0,0,0,PC 17483,221.7792,C97,...,0,1,0,1,29.000000,0,5.401682,49186.013553,841.0000,0.335682
302,1,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,,...,0,0,0,1,35.000000,0,6.238967,262481.209173,1225.0000,0.335509
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,...,0,1,0,1,29.000000,0,5.353456,44663.538906,841.0000,0.333807


In [681]:
dfraw["somechance"] = np.where((dfraw['chanceofsurvival']>0),1,0)

In [682]:
dfraw[["survived", "somechance"]].head()

Unnamed: 0,survived,somechance
0,1,1
1,1,0
2,0,1
3,0,0
4,0,1


# Results

In [683]:
pd.pivot_table(dfraw, values="survived", index="somechance")

Unnamed: 0_level_0,survived
somechance,Unnamed: 1_level_1
0,0.257089
1,0.908367


### 90.8% of the people who my model predicted had some non-zero chance of survival, actually survived

In [684]:
pd.pivot_table(dfraw, values="somechance", index="survived")

Unnamed: 0_level_0,somechance
survived,Unnamed: 1_level_1
0,0.02843
1,0.456


### Of the people who did not survive, my model incorrectly predicted 2.8% of those people surviving

### of the people who did survive, my model only predicted 45.6% of their survival. So my model was only able to correctly predict 45% of the survivors