In [50]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import os
import statsmodels.formula.api as smf

In [51]:
os.chdir(os.path.join('..','data'))
df = pd.read_csv(r'2002FemPreg.csv',low_memory=False)
df = df[df['outcome']==1] # live births
df = df.dropna(subset=['agepreg','totalwgt_lb'])

resp = pd.read_csv('respondent.csv')
df2 = df[df['prglngth']>30]
df_joined = df2.join(resp,on='caseid',rsuffix='_r')

In [52]:
df.head()

Unnamed: 0.1,Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,0,1,1,,,,,6.0,,1.0,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,1,2,,,,,6.0,,1.0,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.875
2,2,2,1,,,,,5.0,,3.0,...,0,0,0,7226.30174,8567.54911,12999.54226,2,12,,9.125
3,3,2,2,,,,,6.0,,1.0,...,0,0,0,7226.30174,8567.54911,12999.54226,2,12,,7.0
4,4,2,3,,,,,6.0,,1.0,...,0,0,0,7226.30174,8567.54911,12999.54226,2,12,,6.1875


In [53]:
r_squared = {}
for column in df_joined.columns:
    try:
        if df_joined[column].var() < 1e-7:
            continue
            
        formula = 'prglngth ~ '+column
        model = smf.ols(formula,data=df_joined)
        results = model.fit()
        
        if model.nobs < len(df_joined)/2:
            continue
            
    except(ValueError,TypeError,SyntaxError):
        continue
        
    
    r_squared[column] = results.rsquared  

In [54]:
d = {'a':1,'b':20,'c':4,'d':10}
#lambda function for returning the value given the dict
sorted(d.items(),key=lambda x:x[1],reverse=True)

[('b', 20), ('d', 10), ('c', 4), ('a', 1)]

In [55]:
l = [1,2,3,4,5]
double = lambda x: [y*2 for y in x]
double(l)

[2, 4, 6, 8, 10]

In [56]:
# this is how the sorted(dictionary)
# the dictionary is converted to list by the sorted function
for i in list(d.items()):
    print(i[1])

1
20
4
10


In [57]:
r_sqr_results = sorted(r_squared.items(),key=lambda x:x[1],reverse=True)
r_sqr_results[0:20]

[('prglngth', 1.0),
 ('wksgest', 0.8875643082722343),
 ('mosgest', 0.17405830733526173),
 ('totalwgt_lb', 0.12445743148120203),
 ('birthwgt_lb', 0.1208198963399102),
 ('lbw1', 0.10488286362212795),
 ('prglngth_i', 0.0217754416128203),
 ('nbrnaliv', 0.004657644489844848),
 ('mardat02_i', 0.0031025050394146714),
 ('oldwp07_i', 0.0028972314327386783),
 ('oldwr07_i', 0.0028972314327386783),
 ('wantrp07_i', 0.0028972314327386783),
 ('parts12', 0.002790872597017824),
 ('rmarout07_i', 0.0027722899307861537),
 ('condomr_i', 0.002485395350625197),
 ('anynurse', 0.0024709815467778284),
 ('bfeedwks', 0.0024037317639812317),
 ('mon12prt', 0.0023447326245984446),
 ('pregend1', 0.002162320014431729),
 ('intr_ec3', 0.002097864646328107)]

In [58]:
formula1 = ('prglngth ~ birthord+paydu==1+totincr+race==2+nbrnaliv>1')
results1 = smf.ols(formula1,data=df_joined).fit()

In [59]:
results1.summary()

0,1,2,3
Dep. Variable:,prglngth,R-squared:,0.009
Model:,OLS,Adj. R-squared:,0.008
Method:,Least Squares,F-statistic:,10.17
Date:,"Fri, 30 Nov 2018",Prob (F-statistic):,1.02e-09
Time:,18:32:47,Log-Likelihood:,-11185.0
No. Observations:,5415,AIC:,22380.0
Df Residuals:,5409,BIC:,22420.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,38.9358,0.088,440.768,0.000,38.763,39.109
paydu == 1[T.True],0.0761,0.056,1.355,0.175,-0.034,0.186
race == 2[T.True],0.1404,0.055,2.543,0.011,0.032,0.249
nbrnaliv > 1[T.True],-1.2183,0.203,-5.987,0.000,-1.617,-0.819
birthord,-0.0430,0.025,-1.736,0.083,-0.092,0.006
totincr,-0.0110,0.007,-1.530,0.126,-0.025,0.003

0,1,2,3
Omnibus:,977.766,Durbin-Watson:,1.661
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3785.444
Skew:,-0.858,Prob(JB):,0.0
Kurtosis:,6.72,Cond. No.,76.6


Exercise 11.2 
----

The Trivers-Willard hypothesis suggests that for many mammals the sex ratio depends on \maternal condition"; that is, factors like the
mother’s age, size, health, and social status. 

See https://en.wikipedia.org/wiki/Trivers-Willard_hypothesis

As an exercise, use a data mining approach to test the other variables in the
pregnancy and respondent files. Can you find any factors with a substantial
effect

In [60]:
df_joined['boy'] = (df_joined['babysex']==1).astype(int)
df_joined.head()

Unnamed: 0.1,Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,...,basewgt_r,adj_mod_basewgt_r,finalwgt_r,secu_r,sest_r,cmintvw_r,cmlstyr,screentime,intvlngth,boy
0,0,1,1,,,,,6.0,,1.0,...,2335.279149,2846.79949,4744.19135,2.0,18.0,1233.0,1221.0,16:30:59,64.294,1
1,1,1,2,,,,,6.0,,1.0,...,2335.279149,2846.79949,4744.19135,2.0,18.0,1233.0,1221.0,16:30:59,64.294,0
2,2,2,1,,,,,5.0,,3.0,...,2335.279149,2846.79949,4744.19135,2.0,18.0,1234.0,1222.0,18:19:09,75.149167,1
3,3,2,2,,,,,6.0,,1.0,...,2335.279149,2846.79949,4744.19135,2.0,18.0,1234.0,1222.0,18:19:09,75.149167,0
4,4,2,3,,,,,6.0,,1.0,...,2335.279149,2846.79949,4744.19135,2.0,18.0,1234.0,1222.0,18:19:09,75.149167,0


In [61]:
def data_mining(df):
    r_squared = []
    for column in df.columns:
        try:
            if df[column].var() < 1e-7:
                continue
            formula = 'boy ~ agepreg+'+column
            model = smf.logit(formula,data=df)
            nobs = len(model.endog)
            if nobs < len(df)/2:
                continue
                
            results = model.fit()
        except:
            continue

        r_squared.append((column,results.prsquared))
        
    return r_squared

In [62]:
r_squared = data_mining(df_joined)

Optimization terminated successfully.
         Current function value: 0.693022
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692998
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692931
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692861
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.693014
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692900
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.692810
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.693020
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.693000
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692988
  

Optimization terminated successfully.
         Current function value: 0.692960
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.693014
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.693001
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.693017
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692744
         Iterations 6
         Current function value: 0.692860
         Iterations: 35
Optimization terminated successfully.
         Current function value: 0.692987
         Iterations 4




Optimization terminated successfully.
         Current function value: 0.692814
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.692987
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.693017
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692987
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.693021
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.693004
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.693022
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692999
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692993
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692867
  



Optimization terminated successfully.
         Current function value: 0.692114
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692918
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692916
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692750
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692671
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692907
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692896
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692682
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692688
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692694
  

Optimization terminated successfully.
         Current function value: 0.692620
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692548
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692661
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692679
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692851
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692748
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692629
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692374
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692656
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692627
  

Optimization terminated successfully.
         Current function value: 0.692687
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692589
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692662
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692503
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692679
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692537
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692680
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692449
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692676
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692617
  

Optimization terminated successfully.
         Current function value: 0.692506
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692924
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692923
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692924
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692923
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692870
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692887
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692462
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692898
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692916
  



Optimization terminated successfully.
         Current function value: 0.692901
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692853
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692841
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692907
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692601
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692614
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692258
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692910
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692910
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692877
  



Optimization terminated successfully.
         Current function value: 0.692889
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692897
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692897
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692908
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692882
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692838
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692871
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692818
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692790
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.692905
  

Optimization terminated successfully.
         Current function value: 0.692353
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692733
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692770
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692769
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692764
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692738
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692748
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692522
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692923
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692897
  



Optimization terminated successfully.
         Current function value: 0.692602
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692828
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692848
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692892
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692818
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692921
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692732
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692860
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692924
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692870
  



Optimization terminated successfully.
         Current function value: 0.692766
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692924
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692707
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.692819
         Iterations 4
         Current function value: 0.692550
         Iterations: 35
         Current function value: 0.692674
         Iterations: 35




Optimization terminated successfully.
         Current function value: 0.692493
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692766
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692924
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692707
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.692819
         Iterations 4
         Current function value: 0.692550
         Iterations: 35




         Current function value: 0.692674
         Iterations: 35
Optimization terminated successfully.
         Current function value: 0.692886
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692418
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.692870
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692813
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692776
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692669
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.692757
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.692897
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692916
         Iterations 3
Optimization ter

In [63]:
r_squared_filter = [x for x in r_squared if x[1]<=1.0]

In [64]:
sorted(r_squared_filter, key = lambda x:x[1], reverse=True)

[('totalwgt_lb', 0.009696855253233383),
 ('birthwgt_lb', 0.009534379428548956),
 ('patch', 0.0015451370299872647),
 ('oldwp02_i', 0.0012606541807305138),
 ('methhist261', 0.0012448102014324114),
 ('methhist271', 0.0011859388695731887),
 ('hasbabes', 0.0011764519339452217),
 ('lbw1', 0.001159305832779678),
 ('cmendmc', 0.0011237869824899382),
 ('selfinc', 0.0011069816061936022),
 ('methhist281', 0.0010887790023952348),
 ('intent', 0.001020812412501182),
 ('fmarout5', 0.0010203622658081501),
 ('interest', 0.001007651343876148),
 ('momworkd', 0.0009712403801300784),
 ('numbabes', 0.0009693421966596727),
 ('parity_r', 0.0009693421966596727),
 ('endo', 0.0009520750014559987),
 ('rmarout6', 0.0009389322102135722),
 ('bothbiol', 0.0008769336168821251),
 ('methhist251', 0.0008519332105711985),
 ('cohstat', 0.0008427842399699736),
 ('wthparnw', 0.0008390002942512131),
 ('pmarpreg', 0.0008238618192750735),
 ('evmarcoh', 0.0008038864361120668),
 ('monsx1197', 0.0008014991257738746),
 ('fmethod1',

Exercise 11.3
-----

Poisson Regression

the quantity you want to predict is a count

to predict how many children a woman has born; in the NSFG
dataset, this variable is called numbabes.

Suppose you meet a woman who is 35 years old, black, and a college graduate
whose annual household income exceeds $75,000. How many children would
you predict she has born?

In [65]:
formula3 = 'numbabes ~ age_r + C(race) + educat + totincr'
results3 = smf.poisson(formula3,data=df_joined).fit()

Optimization terminated successfully.
         Current function value: 1.395475
         Iterations 6


In [66]:
results3.summary()

0,1,2,3
Dep. Variable:,numbabes,No. Observations:,5415.0
Model:,Poisson,Df Residuals:,5409.0
Method:,MLE,Df Model:,5.0
Date:,"Fri, 30 Nov 2018",Pseudo R-squ.:,0.1067
Time:,18:33:27,Log-Likelihood:,-7556.5
converged:,True,LL-Null:,-8459.5
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.1989,0.083,-14.369,0.000,-1.362,-1.035
C(race)[T.2],-0.0354,0.028,-1.244,0.213,-0.091,0.020
C(race)[T.3],-0.1724,0.051,-3.370,0.001,-0.273,-0.072
age_r,0.0616,0.002,39.341,0.000,0.058,0.065
educat,-0.0064,0.005,-1.318,0.188,-0.016,0.003
totincr,-0.0516,0.003,-16.524,0.000,-0.058,-0.045


In [67]:
# prediction 
# age: 35, race = 1.0, college_grad= 16, household income = 14
results3.predict(pd.DataFrame([1,2,1,2]))

PatsyError: Error evaluating factor: NameError: name 'age_r' is not defined
    numbabes ~ age_r + C(race) + educat + totincr
               ^^^^^

Exercise 11.4
-----

In [72]:
formula4 = 'rmarital ~ age_r + C(race) + totincr+educat'
results4 = smf.mnlogit(formula,data=df_joined).fit()
results4.summary()

  return eXB/eXB.sum(1)[:,None]
  oldparams) > tol)):


Optimization terminated successfully.
         Current function value: nan
         Iterations 4


  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


0,1,2,3
Dep. Variable:,prglngth,No. Observations:,5415.0
Model:,MNLogit,Df Residuals:,5383.0
Method:,MLE,Df Model:,16.0
Date:,"Fri, 30 Nov 2018",Pseudo R-squ.:,
Time:,18:52:30,Log-Likelihood:,
converged:,True,LL-Null:,-9366.8
,,LLR p-value:,

prglngth=32,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,,,,,,
intvlngth,,,,,,
prglngth=33,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,,,,,,
intvlngth,,,,,,
prglngth=34,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,,,,,,
intvlngth,,,,,,
prglngth=35,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,,,,,,
