# Stoneburner, Kurt
- ## DSC 530 - Week 11

In [1]:
def CleanData(resp):
    """Cleans respondent data.

    resp: DataFrame
    """
    resp.cmdivorcx.replace([9998, 9999], np.nan, inplace=True)

    resp['notdivorced'] = resp.cmdivorcx.isnull().astype(int)
    resp['duration'] = (resp.cmdivorcx - resp.cmmarrhx) / 12.0
    resp['durationsofar'] = (resp.cmintvw - resp.cmmarrhx) / 12.0

    month0 = pd.to_datetime('1899-12-15')
    dates = [month0 + pd.DateOffset(months=cm) 
             for cm in resp.cmbirth]
    resp['decade'] = (pd.DatetimeIndex(dates).year - 1900) // 10

In [2]:
# //*****************************************
# //*** Build a probability mass function
# //*****************************************
# //*** Returns Series as a PMF
# //*****************************************
def build_pmf(input_series):
    output_series = input_series.copy()
    total_values = input_series.sum()
    for value,freq in output_series.items():
        #print(f"{value} {freq} {total_values} {freq/total_values}")
        output_series.loc[value] = freq/total_values
    return output_series

# //*** Build a Cumulative Distribution Function from a Probability Mass Function
# //*** Returns a Series
def build_cdf(input_series):
    # //*** If input is not panda or pd series, try to convert it
    if not isinstance(input_series,pd.core.series.Series):
        input_series = pd.Series(input_series)
        
    # //*** If input is np.Array
    output_series = input_series.copy()
    cumulative_value = 0
    for value,freq in output_series.items():
        #print(f"{value} {freq} {cumulative_value} {freq + cumulative_value}")
        cumulative_value = freq + cumulative_value
        output_series.loc[value] = cumulative_value
    return output_series

In [3]:
# //****************************************************************************************
# //*** Set Working Directory to thinkstats folder.
# //*** This pseudo-relative path call should work on all Stoneburner localized projects. 
# //****************************************************************************************

import os
import sys
workingPath = os.getcwd().replace("coding", "ThinkStats2\\code")
sys.path.insert(1, workingPath)
os.chdir(workingPath)

In [4]:
# //*** Imports and Load Data
import nsfg
import thinkstats2
import thinkplot
import first
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

resp6 = nsfg.ReadFemResp(dct_file='2002FemResp.dct',dat_file='2002FemResp.dat.gz')
resp7 = nsfg.ReadFemResp(dct_file='2006_2010_FemRespSetup.dct',dat_file='2006_2010_FemResp.dat.gz')

CleanData(resp6)
married6 = resp6[resp6.evrmarry==1]

CleanData(resp7)
married7 = resp7[resp7.evrmarry==1]

#//*** Combine 2002 and 2010 datasets
resp = pd.concat([resp6,resp7], sort=False)

**Exercise:**    In NSFG Cycles 6 and 7, the variable `cmdivorcx` contains the date of divorce for the respondent’s first marriage, if applicable, encoded in century-months.

Compute the duration of marriages that have ended in divorce, and the duration, so far, of marriages that are ongoing. Estimate the hazard and survival curve for the duration of marriage.

Use resampling to take into account sampling weights, and plot data from several resamples to visualize sampling error.

Consider dividing the respondents into groups by decade of birth, and possibly by age at first marriage.

In [47]:

#resp['notdivorced'] = resp.cmdivorcx.isnull().astype(int)
#resp['duration'] = (resp.cmdivorcx - resp.cmmarrhx) / 12.0
#resp['durationsofar'] = (resp.cmintvw - resp.cmmarrhx) / 12.0

#//*** There are a number of coding errors with durationsofar. A few marriages and divorces are listed 
#//*** are negative lengths. Drop these rows
resp.drop(index=resp[ resp['durationsofar'] <= 0 ].index, inplace=True)
resp.drop(index=resp[ resp['duration'] <= 0 ].index, inplace=True)



#//*** Not divorced attribute generate NaN for duration of marriage. 
#//*** Replace the NaN with durationsofar. This generates a length for all marriages

resp['duration'].fillna(resp['durationsofar'], inplace=True)

#print(round(resp6[resp6.evrmarry==1][[ 'duration','notdivorced','durationsofar' ]],1))

#print(round(resp6[resp6.evrmarry==1][resp6['duration'] > 0 ],1))
#//*** Generate histogram of durations. Rounded Continuous values to tenths for better value_counting / binning
duration_hist = round(resp[resp.evrmarry==1]['duration'],1).value_counts().sort_index()

#//*** Build CDF from PMF
duration_cdf = build_cdf( build_pmf(duration_hist) )
#print(f"{duration_cdf}")

duration_survival = 1 - duration_cdf
#print(f"{duration_survival}")

print(f"{duration_survival[-1]}")

for index,value in duration_survival[:-1].items():
    #print(f"{index} : {value}")
    hazard = ( duration_survival[index] - duration_survival[(index+1)] / duration_survival[index])
    print(f"{index} : {value} : {hazard}")
    

print(f"done")


KeyError: -1.0

- ## Chapter X, Exercise X



In [6]:
# //*** CODE HERE

- ## Chapter X, Exercise X



In [7]:
# //*** CODE HERE

- ## Chapter X, Exercise X



In [8]:
# //*** CODE HERE