In [1]:
# Import necessary packages including the custom utility module (see utils.py)

import os
import pandas as pd
import utils

In [3]:
# Main data location.
# The original local path is kept as the default; set the SEER_DATA_DIR
# environment variable to use a different copy of the data without editing this cell.
data_dir = os.environ.get(
    'SEER_DATA_DIR',
    '/Users/junginpark/data/SEER_1973_2015_TEXTDATA/incidence',
)

# SAS file name
sas_filename = os.path.join(data_dir, 'read.seer.research.nov17.sas')

# Read SAS file and extract column information.
# Each column line of the SAS file defines how to interpret the TXT files.
# For example, below is one of the lines from the SAS file:
#
# @ 9   REG                  $char10. /* SEER registry */
#
# This gives us the following info:
# 1) Starting offset (1-based character position within a line). In this case, 9.
# 2) Width (number of characters). In this case, 10.
# 3) Short name (REG).
# 4) Long name (SEER registry).
#
# Below is one of the lines from the TXT files:
#
# 540000120000001537201 020731932   02022006C53908090380983311        9800...
#    from ^     to ^
#
# Since "REG" starts at the 9th character and is 10 characters long,
# the REG value of this line is "0000001537".

# Number of lines at the top of the SAS file that come before the column definitions.
SAS_HEADER_LINES = 5

# The list "columns" below will store the 4 column info
# (starting offset, width, short name, long name), one tuple per column.
columns = []

# Read the SAS file, throwing away the header lines (not needed).
# The file is closed as soon as its lines are in memory.
with open(sas_filename, 'r') as f:
    lines = f.readlines()[SAS_HEADER_LINES:]

for line in lines:
    # Clean up the line: remove surrounding white space, then a trailing
    # semi-colon (the last line of the SAS file has one), then white space again.
    line = line.strip().rstrip(';').strip()

    # "Tokenize" the line into individual words.
    tokens = line.split()

    # For instance, if the line is something like below,
    # @ 9   REG                  $char10. /* SEER registry */
    # 0-1---2--                  3------- 4- 5--- 6------- 7-
    # ^ these are token numbers.

    # Skip anything that is not a column definition (blank lines, a line that
    # only held ";", ...). Without this guard such a line raises IndexError below.
    if len(tokens) < 4 or tokens[0] != '@':
        continue

    # tokens[1] has the start offset. Convert it to integer and subtract 1 because
    # Python indexes start from 0 whereas SAS positions start from 1.
    start_offset = int(tokens[1]) - 1

    # tokens[2] has the short name. It is a string already so just copy it.
    short_name = tokens[2]

    # tokens[3] has the width in a special format (ex. "$char10.").
    # The slice [5:-1] drops the 5-character "$char" prefix and the trailing "."
    # so that only the digits ("10") are left to convert.
    width = int(tokens[3][5:-1])

    # tokens[4] and tokens[-1] are the comment markers "/*" and "*/".
    # Join the words in between with a space, so ['SEER', 'registry']
    # becomes 'SEER registry'.
    long_name = ' '.join(tokens[5:-1])

    # Store this column info.
    columns.append((start_offset, width, short_name, long_name))
        

In [7]:
# Collect the TXT file names for the "data_dir" directory with our own
# utility function "utils.get_all_files(...)".
txt_filenames = list(utils.get_all_files(data_dir, '*.TXT'))

# Keep the names in sorted order so the processing order is predictable,
# which makes debugging easier.
txt_filenames.sort()

# Peek at the first 5 file names.
txt_filenames[:5]

['/Users/junginpark/data/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/BREAST.TXT',
 '/Users/junginpark/data/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/COLRECT.TXT',
 '/Users/junginpark/data/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/DIGOTHR.TXT',
 '/Users/junginpark/data/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/FEMGEN.TXT',
 '/Users/junginpark/data/SEER_1973_2015_TEXTDATA/incidence/yr1973_2015.seer9/LYMYLEUK.TXT']

In [None]:
# Convert every fixed-width TXT file into a CSV file.
# Each TXT line is cut into fields using the layout stored in "columns"
# (start offset, width, short name, long name), the rows are collected into a
# Pandas DataFrame, and the DataFrame is written under "<data_dir>/csv" using
# the same relative path as the TXT file.

# Directory to put the new CSV files. It does not depend on the file being
# processed, so it is defined once, before the loop.
csv_dir = os.path.join(data_dir, 'csv')

for txt_filename in txt_filenames:
    print(f'Processing {txt_filename}...')

    # This will store the list of all the rows.
    data = []

    # Open up the TXT file and read it one line at a time, so the raw text of
    # the whole file is never held in memory next to the parsed rows.
    with open(txt_filename, 'r') as f:
        for line in f:
            # Drop the line terminator so it cannot leak into a field value
            # when a record is shorter than the layout expects.
            line = line.rstrip('\n')

            # When creating our own DataFrame, each row has to be a dictionary
            # whose keys are the column names. "short_name" is used as the key
            # (long_name is not used); the value is the slice of the line from
            # start_offset to start_offset + width.
            row = {
                short_name: line[start_offset:start_offset + width]
                for start_offset, width, short_name, _ in columns
            }
            data.append(row)

            # Print this message for every 100000 rows.
            if len(data) % 100000 == 0:
                print(f'> Read {len(data)} rows...')

    # Report the total. len(data) is used rather than the loop counter so that
    # an empty file reports 0 rows instead of failing or showing a stale count.
    print(f'> Read {len(data)} rows...')

    # Convert data (list of dictionaries) into DataFrame.
    df = pd.DataFrame(data)

    # Define the CSV filename: same path relative to data_dir, placed under
    # csv_dir, with the extension swapped for ".csv". Built with os.path so that
    # only the real directory prefix and the real extension are changed.
    relative_path = os.path.relpath(txt_filename, data_dir)
    csv_filename = os.path.join(csv_dir, os.path.splitext(relative_path)[0] + '.csv')

    # Create the CSV directory if it doesn't exist.
    os.makedirs(os.path.dirname(csv_filename), exist_ok=True)

    print(f'> Writing to {csv_filename}...')
    df.to_csv(csv_filename)

    print(f'> Done processing {txt_filename}')
