In [3]:
%matplotlib notebook
import pandas as pd
import numpy as np
from scipy import stats
import os

In [6]:
datapath = os.path.join('..','data','2014')
sasfile2read=os.path.join(datapath,'samchild.sas')
datfile2read=os.path.join(datapath,'samchild.dat')

In [7]:
f = open(sasfile2read, "r") 

line_starting_input = 0
line_ending_input = 0
line_number = -1

input_lines = []

for x in f:

    line_number = line_number + 1
    
    # remove carriage returns
    x = x.replace("\n","")
        
    # The input lines start at the input statement that is not part of a comment and does not contain a semi-colon
    if x.upper().find('INPUT') > -1 and x.upper().find(';') == -1 and x.upper().find('*') == -1:
        line_starting_input = line_number

    # The input statement ends with the first semi-colon after it starts
    if line_starting_input > 0 and line_ending_input == 0 and x.find(';') > -1:
        line_ending_input = line_number
        
    # If this current line of code is...
    # 1. between the start and stop of the input 
    # 2. Not a comment line  (Note that the programs we are looking at have only full-line comments with no code)
    # 3. Not a blank line
    # Then we save it as one of the code lines
    if line_starting_input > 0 and line_number >= line_starting_input and \
        (line_ending_input == 0 or line_number <= line_ending_input) and \
        x.upper().find('*') == -1 and x.replace(" ","") != '':
        input_lines.append(x)
    

print(f"Input Statement starts at line {line_starting_input}")
print(f"Input Statement ends at line {line_ending_input}")
print(f"Total Lines: {line_number}")
f.close

#print(input_lines)

for line in input_lines:
    print(f"Code Line:  {line}")
          

Input Statement starts at line 646
Input Statement ends at line 803
Total Lines: 1244
Code Line:     INPUT
Code Line:        RECTYPE       1 -   2    SRVY_YR       3 -   6
Code Line:        HHX      $    7 -  12    INTV_QRT     13 -  13
Code Line:        INTV_MON     14 -  15    FMX      $   16 -  17
Code Line:        FPX      $   18 -  19    WTIA_SC      20 -  26 .1
Code Line:        WTFA_SC      27 -  32
Code Line:        REGION       33 -  33    STRAT_P      34 -  36
Code Line:        PSU_P        37 -  38
Code Line:        SEX          39 -  39    HISPAN_I     40 -  41
Code Line:        RACERPI2     42 -  43    MRACRPI2     44 -  45
Code Line:        MRACBPI2     46 -  47    AGE_P        48 -  49
Code Line:        CSRESPNO $   50 -  51    CSRELTVP     52 -  52
Code Line:        LATEINTC     53 -  53
Code Line:        FDRN_FLG     54 -  54
Code Line:        TOTOZ_P      55 -  57    BWTGRM_P     58 -  61
Code Line:        CHGHT_TC     62 -  63    CWGHT_TC     64 -  66
Code Line:     

In [8]:
# create lists for variable names, starting position, ending position, and character flag
varnames=[]
starts=[]
ends=[]
charflags=[]
widths=[]
varcount = -1

# loop through the input lines and pull the variable information into the lists
for line in input_lines:

    # take off leading and trailing spaces
    line = line.strip()

    # The dash tells us how many variables are specified in the line of code
    # This particular codes has either 0 (count=0), 1 (count = 1), or 2 (count = 2)
    if line.count('-') > 0:
        
        templist = line.split()
       
        for word in templist:
            
            # remove any spaces or semi-colons in the word
            word = word.replace(' ','')
            word = word.replace(';','')
                        
            # Test to see if the word starts with a letter or underscore. If so, this is a variable name.
            if word[0].isalpha()==True or word[0]== "_":
                                
                # Increment the number of variables
                varcount = varcount + 1
                
                # Store the variable name with no spaces
                varnames.append(word)
                
                # Set the type as numeric. We will reset this if we see a $ before the start/end
                charflags.append(False)
                
                # Set indicator to say if we have determined the start
                found_start = False
                
            # Test to see if the word is $.  If so, change the character flag for this variable
            elif word == '$':
                charflags[varcount] = True
           
            # If this is numeric, then it is either the start or stop place.
            elif word.isnumeric()==True:
                                
                # If we have not yet found a start then this must be the start
                # Otherwise it must be the end. Set the end and caluclate the width
                if found_start == False:
                    
                    starts.append(word)
                    found_start = True                    
                    
                else:
                    ends.append(word)
                    widths.append(int(ends[varcount]) - int(starts[varcount]) + 1)


In [14]:
vlen = len(varnames)
clen = len(charflags)
slen = len(starts)
elen = len(ends)
wlen = len(widths)

print(f"Varcount should be one less than the length of the lists (accounting for 0 index) = {varcount}")
print(f"First 5 of {vlen} varnames = {varnames[0]},{varnames[1]},{varnames[2]},{varnames[3]},{varnames[4]}")
print(f"First 5 of {clen} charflags = {charflags[0]}, {charflags[1]}, {charflags[2]}, {charflags[3]}, {charflags[4]}")
print(f"First 5 of {slen} starts = {starts[0]}, {starts[1]}, {starts[2]}, {starts[3]}, {starts[4]},")
print(f"First 5 of {elen} ends = {ends[0]}, {ends[1]}, {ends[2]}, {ends[3]}, {ends[4]}")
print(f"First 5 of {wlen} widths = {widths[0]}, {widths[1]}, {widths[2]}, {widths[3]}, {widths[4]}")

Varcount should be one less than the length of the lists (accounting for 0 index) = 245
First 5 of 246 varnames = RECTYPE,SRVY_YR,HHX,INTV_QRT,INTV_MON
First 5 of 246 charflags = False, False, True, False, False
First 5 of 246 starts = 1, 3, 7, 13, 14,
First 5 of 246 ends = 2, 6, 12, 13, 15
First 5 of 246 widths = 2, 4, 6, 1, 2


In [10]:
# Use the lists we created to read the data from the flat file (fwf=fixed width file)
df_sc_2014 = pd.read_fwf(datfile2read, widths=widths, names=varnames,parse_dates=True)
df_sc_2014.head() 

Unnamed: 0,RECTYPE,SRVY_YR,HHX,INTV_QRT,INTV_MON,FMX,FPX,WTIA_SC,WTFA_SC,REGION,...,RSCL5_H5,RSCL6,CSHFLU12,CSHFLUNM,CSHFLUM1,CSHFLUY1,CSHSPFL1,CSHFLUM2,CSHFLUY2,CSHSPFL2
0,40,2014,13,1,1,1,3,49029,6039,4,...,,,2,,,,,,,
1,40,2014,20,1,2,1,4,21905,3145,4,...,,,1,1.0,9.0,2013.0,2.0,,,
2,40,2014,25,1,1,1,4,38460,5497,2,...,2.0,0.0,2,,,,,,,
3,40,2014,29,1,1,1,5,3742,592,2,...,0.0,0.0,1,1.0,10.0,2013.0,1.0,,,
4,40,2014,34,1,2,1,2,49141,6553,1,...,2.0,0.0,2,,,,,,,


In [13]:
# convert columns to string if they were specified that way in the sas code
for i in range(len(charflags)):
    if charflags[i]==True:
        df_sc_2014[varnames[i]]=df_sc_2014[varnames[i]].astype(str)
df_sc_2014.dtypes

RECTYPE       int64
SRVY_YR       int64
HHX          object
INTV_QRT      int64
INTV_MON      int64
FMX          object
FPX          object
WTIA_SC       int64
WTFA_SC       int64
REGION        int64
STRAT_P       int64
PSU_P         int64
SEX           int64
HISPAN_I      int64
RACERPI2      int64
MRACRPI2      int64
MRACBPI2      int64
AGE_P         int64
CSRESPNO     object
CSRELTVP      int64
LATEINTC      int64
FDRN_FLG      int64
TOTOZ_P       int64
BWTGRM_P      int64
CHGHT_TC    float64
CWGHT_TC    float64
BMI_SC      float64
AMR1R       float64
AODD1       float64
ADD2        float64
             ...   
CERHOS      float64
CERREA1R    float64
CERREA2R    float64
CERREA3R    float64
CERREA4R    float64
CERREA5R    float64
CERREA6R    float64
CERREA7R    float64
CERREA8R    float64
CHCHYR        int64
CHCHMOYR    float64
CHCHNOY2    float64
CHCNOYR2      int64
CSRGYR        int64
RSRGNOYR    float64
CMDLONGR      int64
RSCL2_C2    float64
RSCL2_E2    float64
RSCL3_E3    float64
