In [19]:
%matplotlib notebook
import pandas as pd
import numpy as np
from scipy import stats
import os

from sys import argv
import urllib
import zipfile


In [16]:
# Create a funciton that will read any file for a particular year by
# 1. Reading the SAS program and searching for the input statement to get the variable names and formats
# 2. Taking infomration from the SAS file and passing them into a function to read the flat file (.dat)
def ReadFile(year,filename):
    
    print(f"year = {year}, filename = {filename}")

    datapath = os.path.join('..','data',f'{year}')
    sasfile2read=os.path.join(datapath,f'{filename}.sas')
    datfile2read=os.path.join(datapath,f'{filename}.dat')
    
    f = open(sasfile2read, "r") 

    line_starting_input = 0
    line_ending_input = 0
    line_number = -1

    input_lines = []

    for x in f:

        line_number = line_number + 1
    
        # remove carriage returns
        x = x.replace("\n","")
        
        # The input lines start at the input statement that is not part of a comment and does not contain a semi-colon
        if x.upper().find('INPUT') > -1 and x.upper().find(';') == -1 and x.upper().find('*') == -1:
            line_starting_input = line_number

        # The input statement ends with the first semi-colon after it starts
        if line_starting_input > 0 and line_ending_input == 0 and x.find(';') > -1:
            line_ending_input = line_number
        
        # If this current line of code is...
        # 1. between the start and stop of the input 
        # 2. Not a comment line  (Note that the programs we are looking at have only full-line comments with no code)
        # 3. Not a blank line
        # Then we save it as one of the code lines
        if line_starting_input > 0 and line_number >= line_starting_input and \
            (line_ending_input == 0 or line_number <= line_ending_input) and \
            x.upper().find('*') == -1 and x.replace(" ","") != '':
            input_lines.append(x)
    
    f.close
          
    # create lists for variable names, starting position, ending position, and character flag
    varnames=[]
    starts=[]
    ends=[]
    charflags=[]
    widths=[]
    varcount = -1

    # loop through the input lines and pull the variable information into the lists
    for line in input_lines:

        # take off leading and trailing spaces
        line = line.strip()

        # The dash tells us how many variables are specified in the line of code
        # This particular codes has either 0 (count=0), 1 (count = 1), or 2 (count = 2)
        if line.count('-') > 0:
        
            templist = line.split()
       
            for word in templist:
            
                # remove any spaces or semi-colons in the word
                word = word.replace(' ','')
                word = word.replace(';','')
                        
                # Test to see if the word starts with a letter or underscore. If so, this is a variable name.
                if word[0].isalpha()==True or word[0]== "_":
                                
                    # Increment the number of variables
                    varcount = varcount + 1
                
                    # Store the variable name with no spaces
                    varnames.append(word)
                
                    # Set the type as numeric. We will reset this if we see a $ before the start/end
                    charflags.append(False)
                
                    # Set indicator to say if we have determined the start
                    found_start = False
                
                # Test to see if the word is $.  If so, change the character flag for this variable
                elif word == '$':
                    charflags[varcount] = True
           
                # If this is numeric, then it is either the start or stop place.
                elif word.isnumeric()==True:
                                
                    # If we have not yet found a start then this must be the start
                    # Otherwise it must be the end. Set the end and caluclate the width
                    if found_start == False:
                    
                        starts.append(word)
                        found_start = True                    
                    
                    else:
                        ends.append(word)
                        widths.append(int(ends[varcount]) - int(starts[varcount]) + 1)

    # Use the lists we created to read the data from the flat file (fwf=fixed width file)
    dfout = pd.read_fwf(datfile2read, widths=widths, names=varnames,parse_dates=True)

    # convert columns to string if they were specified that way in the sas code
    for i in range(len(charflags)):
        if charflags[i]==True:
            dfout[varnames[i]]=dfout[varnames[i]].astype(str)
    
    # output the dataset
    return dfout

In [20]:
# Test on 2014
samchild2014=ReadFile(2014,'samchild')
samchild2014.head()

year = 2014, filename = samchild


Unnamed: 0,RECTYPE,SRVY_YR,HHX,INTV_QRT,INTV_MON,FMX,FPX,WTIA_SC,WTFA_SC,REGION,...,RSCL5_H5,RSCL6,CSHFLU12,CSHFLUNM,CSHFLUM1,CSHFLUY1,CSHSPFL1,CSHFLUM2,CSHFLUY2,CSHSPFL2
0,40,2014,13,1,1,1,3,49029,6039,4,...,,,2,,,,,,,
1,40,2014,20,1,2,1,4,21905,3145,4,...,,,1,1.0,9.0,2013.0,2.0,,,
2,40,2014,25,1,1,1,4,38460,5497,2,...,2.0,0.0,2,,,,,,,
3,40,2014,29,1,1,1,5,3742,592,2,...,0.0,0.0,1,1.0,10.0,2013.0,1.0,,,
4,40,2014,34,1,2,1,2,49141,6553,1,...,2.0,0.0,2,,,,,,,
