# Stoneburner, Kurt
- ## DSC 530 - Week 11

In [1]:
def CleanData(resp):
    """Cleans respondent data.

    resp: DataFrame
    """
    resp.cmdivorcx.replace([9998, 9999], np.nan, inplace=True)

    resp['notdivorced'] = resp.cmdivorcx.isnull().astype(int)
    resp['duration'] = (resp.cmdivorcx - resp.cmmarrhx) / 12.0
    resp['durationsofar'] = (resp.cmintvw - resp.cmmarrhx) / 12.0

    month0 = pd.to_datetime('1899-12-15')
    dates = [month0 + pd.DateOffset(months=cm) 
             for cm in resp.cmbirth]
    resp['decade'] = (pd.DatetimeIndex(dates).year - 1900) // 10
    
    resp.cmmarrhx.replace([9997, 9998, 9999], np.nan, inplace=True)
    resp['agemarry'] = (resp.cmmarrhx - resp.cmbirth) / 12.0
    resp['age'] = (resp.cmintvw - resp.cmbirth) / 12.0

In [2]:
# //*****************************************
# //*** Build a probability mass function
# //*****************************************
# //*** Returns Series as a PMF
# //*****************************************
def build_pmf(input_series):
    output_series = input_series.copy()
    total_values = input_series.sum()
    for value,freq in output_series.items():
        #print(f"{value} {freq} {total_values} {freq/total_values}")
        output_series.loc[value] = freq/total_values
    return output_series

# //*** Build a Cumulative Distribution Function from a Probability Mass Function
# //*** Returns a Series
def build_cdf(input_series):
    # //*** If input is not panda or pd series, try to convert it
    if not isinstance(input_series,pd.core.series.Series):
        input_series = pd.Series(input_series)
        
    # //*** If input is np.Array
    output_series = input_series.copy()
    cumulative_value = 0
    for value,freq in output_series.items():
        #print(f"{value} {freq} {cumulative_value} {freq + cumulative_value}")
        cumulative_value = freq + cumulative_value
        output_series.loc[value] = cumulative_value
    return output_series

In [3]:
# //****************************************************************************************
# //*** Set Working Directory to thinkstats folder.
# //*** This pseudo-relative path call should work on all Stoneburner localized projects. 
# //****************************************************************************************

import os
import sys
workingPath = os.getcwd().replace("coding", "ThinkStats2\\code")
sys.path.insert(1, workingPath)
os.chdir(workingPath)

In [4]:
# //*** Imports and Load Data
import nsfg
import thinkstats2
import thinkplot
import first
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

resp6 = nsfg.ReadFemResp(dct_file='2002FemResp.dct',dat_file='2002FemResp.dat.gz')
resp7 = nsfg.ReadFemResp(dct_file='2006_2010_FemRespSetup.dct',dat_file='2006_2010_FemResp.dat.gz')

CleanData(resp6)

CleanData(resp7)

#//*** Combine 2002 and 2010 datasets
resp = pd.concat([resp6,resp7], sort=False)

**Exercise:**    In NSFG Cycles 6 and 7, the variable `cmdivorcx` contains the date of divorce for the respondent’s first marriage, if applicable, encoded in century-months.

Compute the duration of marriages that have ended in divorce, and the duration, so far, of marriages that are ongoing. Estimate the hazard and survival curve for the duration of marriage.

Use resampling to take into account sampling weights, and plot data from several resamples to visualize sampling error.

Consider dividing the respondents into groups by decade of birth, and possibly by age at first marriage.

In [5]:

#resp['notdivorced'] = resp.cmdivorcx.isnull().astype(int)
#resp['duration'] = (resp.cmdivorcx - resp.cmmarrhx) / 12.0
#resp['durationsofar'] = (resp.cmintvw - resp.cmmarrhx) / 12.0

#//*** There are a number of coding errors with durationsofar. A few marriages and divorces are listed 
#//*** are negative lengths. Drop these rows
resp.drop(index=resp[ resp['durationsofar'] <= 0 ].index, inplace=True)
resp.drop(index=resp[ resp['duration'] <= 0 ].index, inplace=True)



#//*** Not divorced attribute generate NaN for duration of marriage. 
#//*** Replace the NaN with durationsofar. This generates a length for all marriages

resp['duration'].fillna(resp['durationsofar'], inplace=True)

#//*** Generate histogram of durations. Rounded Continuous values to tenths for better value_counting / binning
#duration_hist = round(resp[resp.evrmarry==1]['duration'],1).value_counts().sort_index()

#//*** Build CDF from PMF
#duration_cdf = build_cdf( build_pmf(duration_hist) )

#//*** Build the Compimentary Survival Values
#duration_survival = 1 - duration_cdf

#for key,value in duration_survival.items():
#    print(f"{key} : {value}")
    
#hazard_all_subjects = build_hazard_function(duration_survival)

#print(hazard_all_subjects)




In [6]:
#//*** Build hazard function from a cdf
#//*** Hazard Function: (1-CDF(x)) - (1-CDF(x+1)) / (1-CDF(x))
#//*** Difference between a value and the next value (ie the difference in two values) divided by the first value
#//*** This is a percentage of difference between sequential survivor function (1-CDF[x]) values
#//**** Returns a dictionary of hazard values
def build_hazard_function(sf):
    #//*** Convert cdf to survival function
    #sf = 1 - pd.Series(input_cdf)
    
    #//*** output dictionary
    out_dict= {}
    
    for index,value in enumerate(sf):
        #//*** Skip the last value since it generates an out of array error.
        #//*** Had troubles parsing a series len(-1) so did it this way
        if index < len(sf)-1:
            if sf.iloc[index] != 0:
                #//*** Value = Hazard Value
                out_dict[value] = ( sf.iloc[index] - sf.iloc[index+1] / sf.iloc[index] )
            else:
                out_dict[value] = 0
    return out_dict


In [34]:
#//*** Build a Kaplan Meier Survival Curve to estimate a survival function
#//*** For an estimated/predicted lifetime

def build_kaplan_meier_survival_curve(input_complete, input_ongoing):
    #//*** Build Histogram Dictionary of Complete and ongoing
    #//*** No reason to import a Counter Library for Basic stuff
    #//*** Usual histogram method of value_counts and sort_index, combined with a for items loop
    #//*** to convert to a dictionary
    #from collections import Counter    
    #hist_complete_counter = Counter(input_complete)
    #hist_ongoing_counter =  Counter(input_ongoing)
    hist_combined_list = []    

    hist_complete = {}
    for index, value in input_complete.value_counts().sort_index().items():
        #//*** Add Unique value to the combined list
        if index not in hist_complete.keys():
            hist_combined_list.append(index)

        #//*** Add to Dictionary
        hist_complete[index] = value

    hist_ongoing = {}
    for index, value in input_ongoing.value_counts().sort_index().items():
        #//*** Add Unique value to the combined list
        if index not in hist_complete.keys():
            hist_combined_list.append(index)

        #//*** Add to Dictionary
        hist_ongoing[index] = value

    #hist_combined_list = list(hist_complete_counter | hist_ongoing_counter)
    hist_combined_list.sort()
    at_risk = len(input_complete) + len(input_ongoing)
    

    #//*** Create an empty Survival curve using the hist_comibined_list as a key
    #//*** All values will be represented in the Survival Curve.
    #//*** If something is missed it will be represented as a NaN
    survival_curve = pd.Series(index=hist_combined_list,dtype='float')

    #//*** Go through each value in the ongoing and complete dictionaries
    #//*** Get the count of each, or zero if not found
    for x in hist_combined_list:
        

        #ended = hist_complete_counter[x]
        #censored = hist_ongoing_counter[x]
        
        if x in hist_complete.keys():
            ended = hist_complete[x]
        else:
            ended = 0

        if x in hist_ongoing.keys():
            censored = hist_ongoing[x]
        else:
            censored = 0

        #//*** Calculate the percentage of ended vs the remaining at risk
        survival_curve[x] = ended / at_risk

        
        #//*** Reduced the at_risk total by the totals found
        at_risk -= ended + censored

    return survival_curve



In [30]:
def EstimateHazardFunction(complete, ongoing, label='', verbose=False):
    from collections import Counter    
    """Estimates the hazard function by Kaplan-Meier.

    http://en.wikipedia.org/wiki/Kaplan%E2%80%93Meier_estimator

    complete: list of complete lifetimes
    ongoing: list of ongoing lifetimes
    label: string
    verbose: whether to display intermediate results
    """
    if np.sum(np.isnan(complete)):
        raise ValueError("complete contains NaNs")
    if np.sum(np.isnan(ongoing)):
        raise ValueError("ongoing contains NaNs")
    
    hist_complete = Counter(complete)
    hist_ongoing = Counter(ongoing)

    ts = list(hist_complete | hist_ongoing)
    ts.sort()

    at_risk = len(complete) + len(ongoing)
    

    lams = pd.Series(index=ts,dtype='float')
    for t in ts:
        ended = hist_complete[t]
        censored = hist_ongoing[t]

        lams[t] = ended / at_risk
        if verbose:
            print(t, at_risk, ended, censored, lams[t])

        at_risk -= ended + censored
        


    return lams

In [35]:
#//*** Estimate the hazard function by Kaplan-Meier
#//*** Generates Risk for each specific Value. 
#//**** Round Ages 
complete = round(resp[resp.evrmarry==1]['agemarry'].dropna(),1)
ongoing = round(resp[resp.evrmarry==0]['age'],1)


survival_curve = build_kaplan_meier_survival_curve(complete,ongoing)
book_curve = EstimateHazardFunction(complete,ongoing)
print(f"{survival_curve.sum()}")
print(f"{book_curve.sum()}")
#kaplan_meier_hazard_function = build_hazard_function(survival_curve)

#print(kaplan_meier_hazard_function)


1.7593503947287112
1.7593503947287112


- ## Chapter X, Exercise X



In [9]:
# //*** CODE HERE

- ## Chapter X, Exercise X



In [10]:
# //*** CODE HERE

- ## Chapter X, Exercise X



In [11]:
# //*** CODE HERE