# Stoneburner, Kurt
- ## DSC 530 - Week 09

In [1]:
import json

#//*** Load preg_dict from JSON: NSFG pregnancy-file variable name (lower case) -> description string.
#//*** NOTE(review): the path is relative to the current working directory, and a later cell
#//*** calls os.chdir(), so re-running this cell afterwards resolves the file against the new directory.
with open('preg_dict.json') as json_file: 
    preg_dict = json.load(json_file) 

    

#//*** The variable dictionary was downloaded from the NSFG site:
#//*** ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NSFG/stata/2002FemPreg.dct
#//*** The file is formatted for Stata, and I could not find an existing way to import that Stata format into Python.
#//*** It is plain text, so I wrote the parser below to convert it to a dictionary and export it as JSON.
#//*** Exercise 11-1 limits us to variables a co-worker would know, so a description of each variable is helpful.
#//*** The parser is kept inside a string literal for reference only; it does not run with this cell.
'''
import re
import json

preg_dict = {}
f = open("2002FemPreg.dct","r")
for x in f:
    if "_column" in x:
        loop_line = re.sub ("...\%....",":",x[50:].strip()).split(":")
        preg_dict[ loop_line[0].lower() ] = loop_line[1]
f.close()

preg_string = json.dumps(preg_dict,indent=4)

print(f"{preg_string}")

f = open("preg_dict.json", "w")
f.write(preg_string)
f.close()
'''
"" 

''

In [2]:
# //****************************************************************************************
# //*** Set Working Directory to thinkstats folder.
# //*** This pseudo-relative path call should work on all Stoneburner localized projects.
# //*** The "coding" part of the current path is swapped for the ThinkStats2 code folder.
# //****************************************************************************************
import os
import sys

# //*** os.path.join picks the separator for the host OS (the old literal "ThinkStats2\\code"
# //*** only resolved on Windows).
workingPath = os.getcwd().replace("coding", os.path.join("ThinkStats2", "code"))

# //*** Guard the insert so re-running this cell does not stack duplicate sys.path entries.
if workingPath not in sys.path:
    sys.path.insert(1, workingPath)
os.chdir(workingPath)

In [3]:
# //*** Imports and Load Data
import nsfg
import thinkstats2
import thinkplot
import first
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import scipy
import statsmodels.api as sm
import statsmodels.formula.api as smf


- ## Chapter 11, Exercise 1

**Exercise:** Suppose one of your co-workers is expecting a baby and you are participating in an office pool to predict the date of birth. Assuming that bets are placed during the 30th week of pregnancy, what variables could you use to make the best prediction? You should limit yourself to variables that are known before the birth, and likely to be available to the people in the pool.

**Response:**

I conclude that nbrnaliv, race, hispanic, and birthord are statistically significant predictors of pregnancy length, but together they explain very little of its variance (R-Squared: **0.006**).

R-Squared improves to **0.012** when using variables: nbrnaliv > 1, race == 2, hispanic == 2, birthord > 1

Which means:

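    nbrnaliv > 1  - More than one baby was born alive, i.e. a multiple birth (we assume there is a live birth; the office pool is ruined on a different outcome)
    
    race == 2     - When race is White (per the race coding noted in the code comments: 1 = black, 2 = white, 3 = other), pregnancies tend to be a bit longer
    
    hispanic == 2 - Non-Hispanic pregnancies tend to be a bit longer 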

In [4]:
import first

#//*** Load the pregnancy frames, then keep only live births whose recorded
#//*** length exceeds 30 weeks (bets are placed during week 30).
live, firsts, others = first.MakeFrames()
live = live.loc[live['prglngth'] > 30]

In [5]:
#//*** Zero-fill missing values before correlating
live = live.replace(np.nan, 0)

#//*** Correlate prglngth with every column, keeping only correlations that are
#//*** defined (not NaN) and larger than .01 in magnitude
cor_val = []
index = []
for col in live.columns:
    loop_val = np.corrcoef(live['prglngth'], live[col])[0, 1]
    if math.isnan(loop_val) or abs(loop_val) <= .01:
        continue
    index.append(col)
    cor_val.append(loop_val)

#//*** Sorted ascending, then split by sign
cors = pd.Series(data=cor_val, index=index).sort_values()
cor_pos = cors[cors.gt(0)]
neg_cors = cors[cors.lt(0)]

#//*** Description fragments that disqualify a variable: anything a co-worker
#//*** could not reasonably know during week 30. A variable is dropped when any
#//*** of these fragments appears inside its description.
exclude_strings = [
    "IMPUTATION FLAG",
    "CYCLE 4 VERSION",
    "CM FOR R'S MOST RECENT COMPLETED PREGNANCY",
    "INFORMAL MARITAL STATUS",
    "INFORMAL MARITAL STATUS AT PREGNANCY OUTCOME - 6 CATEGORIES",
    "CM DATE OF CONCEPTION",
    "CM FOR PREGNANCY END DATE (REGARDLESS OF OUTCOME)",
    "WHETHER PREGNANCY ENDED BEFORE R'S 1ST MARRIAGE (PREMARITALLY)",
    "WHETHER R RECEIVED PUBLIC ASSISTANCE IN 2001",
    "EDUCATION (COMPLETED YEARS OF SCHOOLING)",
    "BC-5 GESTATIONAL LENGTH OF PREGNANCY IN WEEKS",
    "DURATION OF COMPLETED PREGNANCY IN WEEKS",
    "LOW BIRTHWEIGHT - BABY 1",
    "GESTATIONAL LENGTH OF COMPLETED PREGNANCY (IN MONTHS)",
    "BC-1 HOW PREGNANCY ENDED - 1ST MENTION",
    "BC-5 GESTATIONAL LENGTH OF PREGNANCY IN MONTHS",
    "SCRAMBLED VERSION OF THE STRATUM",
    "CM FOR R'S FIRST COMPLETED PREGNANCY",
    "CM FOR R'S MOST RECENT LIVE BIRTH",
    "CM FOR DATE OF BEGINNING OF PREGNANCY INTERVAL",
    "CM FOR BABY'S OR BABIES' DATE OF BIRTH (DELIVERY DATE)",
    "CM FOR DATE OF END OF PREGNANCY INTERVAL",
    "PREGNANCY ORDER (NUMBER)",
    "EG-16 RIGHT BEF PREG, DID THE FATHER WANT R TO HAVE BABY AT ANY TIME IN FUTURE?",
    "FORMAL MARITAL STATUS AT PREGNANCY OUTCOME",
    "BH-1 WHETHER R BREASTFED THIS CHILD AT ALL - 2ND FROM THIS PREG",
    "BD-2 SEX OF 2ND LIVEBORN BABY FROM THIS PREGNANCY",
    "BG-7 IS R STILL LEGAL MOTHER OF CHILD - 2ND FROM THIS PREGNANCY",
    "AGE (IN MOS) WHEN R'STOPPED NURSING CHILD - 2ND FROM THIS PREG",
    "EG-5 REASON NOT USING/HAD STOPPED USING METHOD BEC. WANTED PREG?",
    "BG-2 WHETHER CHILD IS STILL ALIVE - 1ST FROM THIS PREGNANCY",
    "BH-1 WHETHER R BREASTFED THIS CHILD AT ALL - 1ST FROM THIS PREG",
    "BG-1 WHETHER CHILD LIVES WITH R - 2ND FROM THIS PREGNANCY",
    "BG-1 WHETHER CHILD LIVES WITH R - 1ST FROM THIS PREGNANCY",
    "AGE (IN MOS) WHEN 1ST SUPPLEMENTED - 2ND FROM THIS PREG",
    "BH-2 HAS R BEGUN SUPPLEMENTATION FOR CHILD - 2ND FROM THIS PREG",
    "CHECK ON WHETHER CHILD MATCHES BIO CHILD IN HH ROSTER - 1ST",
    "EG-24 (UNINTENDED PREG) REASON DIDN'T USE METHOD - 2ND MENTION",
    "BH-4 HAS R'STOPPED BREASTFEEDING CHILD - 2ND FROM THIS PREG",
    "DURATION OF BREASTFEEDING IN WEEKS",
    "BC-7 DK FOLLOWUP FOR GESTATIONAL LENGTH OF A LIVEBIRTH",
    "CM FOR CHLD'S DATE OF DEATH - 1ST FROM THIS PREGNANCY",
    "CM FOR CHLD'S DATE OF DEATH - 3RD FROM THIS PREGNANCY",
    "EG-21 HOW HARD TRYING TO GET/AVOID PREGNANCY (0-10)",
    "BG-2 WHETHER CHILD IS STILL ALIVE - 3RD FROM THIS PREGNANCY",
    "CM FOR DATE CHILD STOPPED LIVING W/R - 1ST FROM THIS PREGNANCY",
    "BD-7 PLACE WHERE R GAVE BIRTH",
    "EG-1 USE ANY METHOD IN PREGNANCY INTERVAL?",
    "EG-24 (UNINTENDED PREG) REASON DIDN'T USE METHOD - 3RD MENTION",
    "BG-2 WHETHER CHILD IS STILL ALIVE - 2ND FROM THIS PREGNANCY",
    "PAYMENT FOR DELIVERY",
    "WANTEDNESS OF PREGNANCY - RESPONDENT - CYCLE 5 VERSION",
    "EG-6 RIGHT BEF PREG, WANT TO HAVE BABY AT ANY TIME IN FUTURE?",
    "CM FOR DATE CHILD STOPPED LIVING W/R - 2ND FROM THIS PREGNANCY",
    "BD-8 PAYMENT FOR DELIVERY - 2ND MENTION",
    "EG-12B RIGHT BEF. PREG, THINK MIGHT EVER WANT TO HAVE BABY W/THAT PARTNER?",
    "EG-22 HOW MUCH WANTED TO GET/AVOID PREGNANCY (0-10)",
    "EG-23 (UNINTENDED PREG)",
    "NUMBER OF WEEKS PREGNANT AT FIRST PRENATAL CARE",
    "BG-7 IS R STILL LEGAL MOTHER OF CHILD - 1ST FROM THIS PREGNANCY",
    "BG-6 LEGAL AGREEMENT FOR WHERE CHILD LIVES - 1ST FROM THIS PREG",
    "BE-4 R'SMOKED AT ALL AFTER R KNEW SHE WAS PREGNANT",
    "EG-8 VERIFY DIDN'T WANT BABY AT ANY TIME IN FUTURE",
    "BD-8 PAYMENT FOR DELIVERY - 1ST MENTION",
    "BG-5 WHERE CHILD LIVES NOW - 2ND FROM THIS PREGNANCY",
    "CM FOR CHLD'S DATE OF DEATH - 2ND FROM THIS PREGNANCY",
    "EG-7 PROBABLY WANT BABY AT ANY TIME OR NOT?",
    "EG-18A WAS R LIVING W/FATHER OF PREG AT BEGINNING OF PREG",
    "BG-1 WHETHER CHILD LIVES WITH R - 3RD FROM THIS PREGNANCY",
    "BG-5 WHERE CHILD LIVES NOW - 1ST FROM THIS PREGNANCY",
    "BF-4 WEEKS OF MATERNITY LEAVE TAKEN FOR THIS PREGNANCY",
    "BD-2 SEX OF 1ST LIVEBORN BABY FROM THIS PREGNANCY",
    "AGE (IN MOS) WHEN CHILD LAST LIVED W/R-1ST FROM THIS PREGNANCY",
    "AGE (IN MOS) WHEN CHILD LAST LIVED W/R - 2ND FROM THIS PREGNANCY",
    "EG-11 CHOOSE MONS OR YRS FOR HOW MUCH SOONER BECAME PREG THAN WANTED",
    "EG-9 RIGHT BEFORE PREG, WANT TO HAVE BABY AT ANY TIME IN FUTURE? (2ND ASKING)",
    "BD-4 IS BABY LOW BIRTHWEIGHT- 1ST BABY FROM THIS PREGNANCY",
    "EG-12A RIGHT BEFORE PREG, WANT TO HAVE BABY WITH THAT PARTNER?",
    "BH-5 AGE (MOS/WKS/DAY) WHEN STOPPED BREASTFEEDING - 1ST FROM THIS PREG",
    "EG-4 METHOD(S) USING WHEN BECAME PREG - 1ST MENTION",
    "BH-3 UNITS (MOS/WKS/DAYS) FOR FRSTEATD_N - 1ST FROM THIS PREG",
    "EG-2 BEFORE YOU BECAME PREG, STOP USING ALL METHODS?",
    "BH-5 UNITS (MOS/WKS/DAYS) FOR AGEQTNUR_N - 1ST FROM THIS PREG",
    "GESTATIONAL LENGTH OF COMPLETED PREGNANCY (IN WEEKS)",
    "OPEN INTERVAL",
    "EG-20 WHEN DID R TELL FATHER OF PREG ABOUT PREGNANCY",
    "BH-3 AGE (MOS/WKS/DAY) WHEN 1ST SUPPLEMENTED - 1ST FROM THIS PREG",
    "AGE (IN MOS) WHEN R'STOPPED NURSING CHILD - 1ST FROM THIS PREG",
    "BC-3 WAS THIS A MULTIPLE BIRTH",
]

#//*** Codebook notes for the candidate variables:
#nbrnaliv : "BC-2 NUMBER OF BABIES BORN ALIVE FROM THIS PREGNANCY" : 0 = NA, 1 = 1, 2 = 2, 3-6 = 3-6, 9 = NA
#birthord : "BIRTH ORDER"
#fmarital : "FORMAL MARITAL STATUS AT PREGNANCY OUTCOME" : [1. Married, 2. Divorced, 3. Widowed, 4. Separated, 5. Never Married]
#agecon : "AGE AT TIME OF CONCEPTION" : # / 100 = years old at conception
#religion : "Current religious affiliation" : 1. No Religion 2. Catholic 3. Protestant 4. Other Religion
#metro : "PLACE OF RESIDENCE (METROPOLITAN / NONMETROPOLITAN)" - MSA CENTRAL CITY, MSA OTHER, NOT MSA
#wantresp : 1. LATER, OVERDUE 2. RIGHT TIME 3. TOO SOON, MISTIMED 4. DIDN'T CARE, INDIFFERENT 5. UNWANTED 6. DON'T KNOW, NOT SURE
#laborfor : 1. Full time, 2. part time, 3. temp, 4. working maternity leave, 5. not working but looking, 6. school,
#           7. keeping house, 8. caring for family, 9. Other
#race : 1. black, 2. white, 3. other
#hieduc : 5. 9th grade, 6. 10th, 7. 11th, 8. 12th no Diploma, 9. High School, 10. Some College, 11. assoc degree, 12. bachelor, 13. masters, 14. phd, 15. professional
#hispanic : 1. Hispanic 2. non-Hispanic
#poverty : # / 100 percentage of poverty level
#multbrth : 0 = Inapplicable 1 = yes 5 = no. <--- Not relevant, same as NBRNALIV > 1



from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

#//*** Walk the correlated columns in sorted order. Keep a column when it has a
#//*** description in preg_dict and that description contains none of the
#//*** exclusion fragments.
preg_test = {}
preg_cols = []
for preg_key in cors.index:
    if preg_key not in preg_dict:
        continue
    description = preg_dict[preg_key]
    if any(fragment in description for fragment in exclude_strings):
        continue
    #print(f"{round(cors[preg_key],4)} {preg_key} : {description} ")
    preg_test[preg_key] = description
    preg_cols.append(preg_key)

#//*** Interesting and statistically significant.
#//*** We cannot justify asking about poverty level and how many babies were born alive
#del preg_test["poverty"]
#del preg_test["nbrnaliv"]

print(f"These are our initial variables for regresion they meet the criteria of having a correlation of greater than .01\nAnd would likely be known (or assumed) by a co-worker.")
for name, label in preg_test.items():
    print(f"{name} : {label}")

  c /= stddev[:, None]
  c /= stddev[None, :]


These are our initial variables for regresion they meet the criteria of having a correlation of greater than .01
And would likely be known (or assumed) by a co-worker.
nbrnaliv : "BC-2 NUMBER OF BABIES BORN ALIVE FROM THIS PREGNANCY"
birthord : "BIRTH ORDER"
feelinpg : "EG-13 HAPPINESS TO BE PREG. SCALE (1-10)"
getprena : "BE-6 ANY PRENATAL CARE FOR THIS PREGNANCY"
fmarital : "FORMAL MARITAL STATUS"
workpreg : "BF-1 R WORKED AT ALL DURING THIS PREGNANCY"
laborfor : "LABOR FORCE STATUS"
agecon : "AGE AT TIME OF CONCEPTION"
religion : "CURRENT RELIGIOUS AFFILIATION"
metro : "PLACE OF RESIDENCE (METROPOLITAN / NONMETROPOLITAN)"
timingok : "EG-10 BECOME PREG TOO SOON, RIGHT TIME, OR LATER THAN YOU WANTED?"
race : "RACE"
hieduc : "HIGHEST COMPLETED YEAR OF SCHOOL OR DEGREE"
hispanic : "HISPANIC ORIGIN"
poverty : "POVERTY LEVEL INCOME"


In [6]:
def evaluate_variable_values_model(df,dependent_col, explanatory_col, **input_dict):
    """Print r-squared and the first p-value of single-variable OLS models.

    For a 'quantitative' variable one model ``dependent ~ explanatory > x`` is
    fitted for every unique value ``x`` of the explanatory column. For a
    'categorical' variable the model ``dependent ~ C(explanatory)`` does not
    depend on ``x``, so it is fitted once and the same result is reported on
    each line.

    Parameters
    ----------
    df : DataFrame holding both columns.
    dependent_col : name of the dependent column.
    explanatory_col : name of the explanatory column.
    _type : keyword, 'quantitative' or 'categorical'. When omitted a reminder
        is printed and no models are fitted.
    ignorevals : keyword, values of the explanatory column to skip
        (default: none).

    Returns
    -------
    None. All results are printed.
    """
    _type = input_dict.get('_type', "")
    #//*** Treat a missing or None ignorevals as "skip nothing"
    ignorevals = input_dict.get('ignorevals') or []

    if '_type' not in input_dict:
        print("You must specify a categorical value as _type = quantitative/categorical")

    print(explanatory_col)
    unique_values = np.sort( df[ explanatory_col].unique() )

    print(unique_values)

    #//*** The categorical formula is identical for every value, so its fit is cached
    categorical_fit = None

    #//*** Build a regression model for each value
    for x in unique_values:
        if x in ignorevals:
            continue

        if _type == 'quantitative':
            patsy = f"{dependent_col}~{explanatory_col} > {x}"
            print(f"{patsy}")
            rsquared,pval = qmodel_patsy_ols(df,patsy,summary=False,pvalue=False,getrsquared=True,getpvalue=True)
            print(f"{x} > r2: {rsquared}  p:  {pval.values[0]}")

        elif _type == 'categorical':
            patsy = f"{dependent_col}~C({explanatory_col})"
            print(f"{patsy}")
            if categorical_fit is None:
                categorical_fit = qmodel_patsy_ols(df,patsy,summary=False,pvalue=False,getrsquared=True,getpvalue=True)
            rsquared,pval = categorical_fit
            print(f"{x} > r2: {rsquared}  p:  {pval.values[0]}")
                
            
                


In [18]:
#//*** Quick regression model results using patsy formulas. Should probably standardize on patsy for its flexibility
def qmodel_patsy_ols(df,formula,**input_dict):
    """Fit an OLS model from a patsy formula and optionally print or return results.

    Parameters
    ----------
    df : data passed to ``smf.ols`` as ``data``.
    formula : patsy formula string, e.g. ``"prglngth ~ nbrnaliv + birthord"``.
    summary : keyword, print the full model summary when truthy.
    pvalue : keyword, print ``term : p-value`` for every non-intercept term.
    getrsquared : keyword, include ``model.rsquared`` in the return value.
    getpvalue : keyword, include the non-intercept p-values (a pandas Series
        indexed by term name) in the return value.

    Returns
    -------
    ``[rsquared, pvalues]`` when both are requested, the single requested
    object when only one is, and ``None`` when neither is.
    """
    display_summary = input_dict.get('summary', False)
    display_pval = input_dict.get('pvalue', False)
    getpvalue = input_dict.get('getpvalue', False)
    getrsquared = input_dict.get('getrsquared', False)

    # NB. unlike sm.OLS, the formula interface adds the "Intercept" term itself
    model = smf.ols(formula=formula, data=df).fit()

    #//*** p-values without the intercept. errors='ignore' keeps this a Series
    #//*** even when the formula has no intercept (e.g. "y ~ x - 1").
    term_pvalues = model.pvalues.drop(index='Intercept', errors='ignore')

    if display_summary:
        print("==========================")
        print("Q model Quick Display")
        print("==========================")
        print_model = model.summary()
        print(f"{print_model}")
    if display_pval:
        print("==========================")
        print("Q model P Values")
        print("==========================")
        for x,y in term_pvalues.items():
            print(f"{x} : {y}")

    output = []
    if getrsquared:
        output.append(model.rsquared)
    if getpvalue:
        output.append(term_pvalues)

    #//*** If more than one output variable, output a list
    if len(output) > 1:
        return output
    if len(output) == 1:
        #//*** Single elements, just return the element
        return output[0]
    return None
        
        
        

In [19]:
def build_patsy_from_list(input_dependent,input_explanatory):
    """Return the patsy formula ``"<dependent> ~ a + b + ..."``.

    The explanatory terms are joined with " + " in the order given. An empty
    list yields ``"<dependent> ~ "``.
    """
    right_hand_side = " + ".join(f"{term}" for term in input_explanatory)
    return f"{input_dependent} ~ {right_hand_side}"
#//*** Backward elimination: fit the model, and while the largest p-value is
#//*** above .05 drop the term(s) holding it and refit. Stop as soon as the
#//*** largest remaining p-value no longer exceeds .05.

p_preg_cols = []

signficant_cols = list(preg_cols)

#//*** At most one pass per starting column
for _ in range(len(preg_cols)):
    patsy = build_patsy_from_list('prglngth', signficant_cols)
    pvals = qmodel_patsy_ols(live, patsy, summary=False, pvalue=False, getpvalue=True)
    worst_pval = pvals.max()
    if not worst_pval > .05:
        break
    for term in pvals[pvals == worst_pval].index:
        signficant_cols.remove(term)
print(signficant_cols)

#//*** Manually build attributes. Categorical/quantitative and values to ignore
variable_attribs = {
    'nbrnaliv': {"_type": 'quantitative', "ignorevals": [5]},
    'birthord': {"_type": 'quantitative', "ignorevals": []},
    'multbrth': {"_type": 'quantitative', "ignorevals": [5]},
    'feelinpg': {"_type": 'quantitative', "ignorevals": [98, 99]},
    'race': {"_type": 'categorical', "ignorevals": []},
    'hispanic': {"_type": 'categorical', "ignorevals": []},
}

#//*** Report the per-value models for every surviving column
for column in signficant_cols:
    #print(f"{column} : {np.sort(live[column].unique())}")
    evaluate_variable_values_model(live, 'prglngth', column, **variable_attribs[column])
    



['nbrnaliv', 'birthord', 'feelinpg', 'race', 'hispanic']
nbrnaliv
[0. 1. 2. 3. 4. 5.]
prglngth~nbrnaliv > 0.0
0.0 > r2: 4.933061036482833e-05  p:  0.5080212650091466
prglngth~nbrnaliv > 1.0
1.0 > r2: 0.009438615360047087  p:  4.410632390795493e-20
prglngth~nbrnaliv > 2.0
2.0 > r2: 1.5676267584252557e-05  p:  0.7090478369455608
prglngth~nbrnaliv > 3.0
3.0 > r2: 4.183426342896368e-06  p:  0.847149447936208
prglngth~nbrnaliv > 4.0
4.0 > r2: 2.3230787211669934e-06  p:  0.8857847580133991
birthord
[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
prglngth~birthord > 1.0
1.0 > r2: 0.0010656421532341254  p:  0.0020891091939612655
prglngth~birthord > 2.0
2.0 > r2: 0.00048208291158002847  p:  0.03850323373959511
prglngth~birthord > 3.0
3.0 > r2: 0.0001934527147764431  p:  0.18990989235171432
prglngth~birthord > 4.0
4.0 > r2: 0.0004095406642246058  p:  0.05647200863881619
prglngth~birthord > 5.0
5.0 > r2: 0.0003757516552552742  p:  0.06770238313962132
prglngth~birthord > 6.0
6.0 > r2: 0.0008998339649277

Based on the modeling analysis these parameters are significant:

**'prglngth\~nbrnaliv > 1.0'** r2: 0.009438615360047087  p:  4.410632390795493e-20

**'prglngth\~birthord > 1.0'** r2: 0.0010656421532341254  p:  0.0020891091939612655

In [46]:
#//*** Look at our model so far: the raw variables first, then the same
#//*** variables as "> 1" indicator attributes.
for patsy in (
    "prglngth ~ nbrnaliv + birthord",
    "prglngth ~ nbrnaliv > 1 + birthord > 1",
):
    rsquared, pval = qmodel_patsy_ols(
        live, patsy,
        summary=False, pvalue=False, getrsquared=True, getpvalue=True,
    )
    print(f"{patsy} r2: {rsquared} Pvals: {pval.values}")


prglngth ~ nbrnaliv + birthord r2: 0.0054032362276768 Pvals: [1.11128137e-09 2.70002423e-03]
prglngth ~ nbrnaliv > 1 + birthord > 1 r2: 0.010228357038520652 Pvals: [1.48147026e-19 7.78207280e-03]


**Based on results below.**

**Remove feelinpg variable since the coefficient is quite small and when treated as a categorical, the p-values are all inconsistent.**

**Furthermore, race == 2 and hispanic == 2 appear to be significant.**

In [58]:
#//*** Test other values: add race, hispanic and feelinpg as categorical terms
#//*** on top of the two indicator attributes, and show the full summary.
patsy = "prglngth ~ nbrnaliv > 1+ birthord > 1 + C(race) + C(hispanic) + C(feelinpg)"
rsquared, pval = qmodel_patsy_ols(
    live, patsy,
    summary=True, pvalue=False, getrsquared=True, getpvalue=True,
)
print(f"{patsy}\nr2: {rsquared}\nPvals: {pval.values}\n")

Q model Quick Display
                            OLS Regression Results                            
Dep. Variable:               prglngth   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     4.902
Date:                Sat, 31 Oct 2020   Prob (F-statistic):           1.11e-10
Time:                        15:04:08   Log-Likelihood:                -18257.
No. Observations:                8884   AIC:                         3.655e+04
Df Residuals:                    8866   BIC:                         3.668e+04
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept   

I conclude that nbrnaliv, race, hispanic, and birthord are statistically significant predictors of prglngth, but together they explain very little of its variance (R-Squared: 0.006).

R-Squared improves to 0.012 when using variables: nbrnaliv > 1, race == 2, hispanic == 2, birthord > 1

Which means:

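    nbrnaliv > 1  - More than one baby was born alive, i.e. a multiple birth (we assume there is a live birth; the office pool is ruined on a different outcome)
    
    race == 2     - When race is White (per the race coding noted in the code comments: 1 = black, 2 = white, 3 = other), pregnancies tend to be a bit longer
    
    hispanic == 2 - Non-Hispanic pregnancies tend to be a bit longer 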

In [70]:
#//*** Final comparison: the four raw variables first, then the same four as
#//*** indicator attributes. Only the printed summaries are used.
for patsy in (
    "prglngth ~ nbrnaliv + race + hispanic + birthord ",
    "prglngth ~ nbrnaliv > 1 + race == 2 + hispanic == 2 + birthord > 1 ",
):
    rsquared, pval = qmodel_patsy_ols(
        live, patsy,
        summary=True, pvalue=False, getrsquared=True, getpvalue=True,
    )
    #print(f"{patsy}\nr2: {rsquared}\nPvals:\n{pval}\n")


Q model Quick Display
                            OLS Regression Results                            
Dep. Variable:               prglngth   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     14.36
Date:                Sat, 31 Oct 2020   Prob (F-statistic):           1.08e-11
Time:                        15:12:36   Log-Likelihood:                -18270.
No. Observations:                8884   AIC:                         3.655e+04
Df Residuals:                    8879   BIC:                         3.659e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     39.3091      0.1

- ## Chapter 11, Exercise 2

**Exercise:** The Trivers-Willard hypothesis suggests that for many mammals the sex ratio depends on “maternal condition”; that is, factors like the mother’s age, size, health, and social status. See https://en.wikipedia.org/wiki/Trivers-Willard_hypothesis

Some studies have shown this effect among humans, but results are mixed. In this chapter we tested some variables related to these factors, but didn’t find any with a statistically significant effect on sex ratio.

As an exercise, use a data mining approach to test the other variables in the pregnancy and respondent files. Can you find any factors with a substantial effect?



In [None]:
# //*** Build the frame for the data-mining exercise with the `regression` helper module.
# //*** NOTE(review): JoinFemResp presumably merges respondent-file columns onto the
# //*** pregnancy records in `live` - confirm against regression.py.
import regression
join = regression.JoinFemResp(live)

- ## Chapter 11, Exercise 3

**Exercise:** If the quantity you want to predict is a count, you can use Poisson regression, which is implemented in StatsModels with a function called `poisson`. It works the same way as `ols` and `logit`. As an exercise, let’s use it to predict how many children a woman has born; in the NSFG dataset, this variable is called `numbabes`.

Suppose you meet a woman who is 35 years old, black, and a college graduate whose annual household income exceeds $75,000. How many children would you predict she has born?



In [None]:
# //*** CODE HERE

Now we can predict the number of children for a woman who is 35 years old, black, and a college
graduate whose annual household income exceeds $75,000

In [None]:
#//*** Pairwise collinearity screen over the candidate variables.
#//*** The sub-frame is built here so this cell does not depend on a later cell having run.
preg_df = live[['prglngth'] + list(preg_test.keys())]

#//*** Correlate every column with every other column and report pairs whose
#//*** correlation magnitude exceeds .05. NaN correlations fail the comparison
#//*** and are skipped. Each pair is printed in both orders.
for x in preg_df.columns:
    for y in preg_df.columns:
        if x != y:
            loop_val = np.corrcoef(preg_df[x], preg_df[y])[0][1]

            if abs(loop_val) > .05:
                print(f"{x} : {y} - {loop_val}")

#//*** variance_inflation_factor takes a full design matrix plus a column index,
#//*** not a pair of columns, so it is left to the dedicated VIF cell.



In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from patsy import dmatrices

#//*** Reference recipe for the calls used below:
#y, X = dmatrices('rating ~ points+assists+rebounds', data=df, return_type='dataframe')
#vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

#//*** Build sub dataframe of just the candidates, dependent variable first.
temp_list = ['prglngth', *preg_test.keys()]
preg_df = live[temp_list]

print(f"{preg_df}")

#//*** Formula "prglngth~a+b+..." over every candidate variable
patsy_dependent = 'prglngth'
patsy_explanatory = "+".join(preg_test.keys())
patsy = f"{patsy_dependent}~{patsy_explanatory}"

#//*** Test for multicollinearity: one variance inflation factor per design-matrix
#//*** column, using 'prglngth' as the response variable.
y, X = dmatrices(patsy, data=preg_df, return_type='dataframe')
vif = pd.DataFrame({
    'VIF': [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    'variable': X.columns,
})

print(f"{vif}")


In [None]:
#//*** DELETE THIS...Tucking it aside for the moment
#//*** Quick regression model results
def qmodel_ols(dependent,explanatory,**input_dict):
    """Fit ``sm.OLS(dependent, explanatory + constant)`` and print or return results.

    Parameters
    ----------
    dependent : dependent values passed to ``sm.OLS``.
    explanatory : explanatory values; ``sm.add_constant`` is applied first.
    summary : keyword, print the full model summary when truthy.
    pvalue : keyword, print ``term : p-value`` for every term except 'const'.
    getrsquared : keyword, include ``model.rsquared`` in the return value.
    getpvalue : keyword, include the p-values without 'const' (a pandas
        Series) in the return value.

    Returns
    -------
    ``[rsquared, pvalues]`` when both are requested, the single requested
    object when only one is, and ``None`` when neither is.
    """
    display_summary = input_dict.get('summary', False)
    display_pval = input_dict.get('pvalue', False)
    getpvalue = input_dict.get('getpvalue', False)
    getrsquared = input_dict.get('getrsquared', False)

    explanatory = sm.add_constant(explanatory) # adding a constant
    model = sm.OLS(dependent, explanatory).fit()

    #//*** p-values without the constant term. errors='ignore' covers the case
    #//*** where no 'const' label is present in the fitted model.
    term_pvalues = model.pvalues.drop(index='const', errors='ignore')

    if display_summary:
        print("==========================")
        print("Q model Quick Display")
        print("==========================")
        print_model = model.summary()
        print(f"{print_model}")
    if display_pval:
        print("==========================")
        print("Q model P Values")
        print("==========================")
        for x,y in term_pvalues.items():
            print(f"{x} : {y}")

    output = []
    if getrsquared:
        output.append(model.rsquared)
    if getpvalue:
        output.append(term_pvalues)

    #//*** More than one requested value comes back as a list, a single one bare
    if len(output) > 1:
        return output
    if len(output) == 1:
        return output[0]
    return None
        
        
        