In [None]:
import numpy as np
import os
import pandas as pd

In [None]:
# Data loading
#
# Every subdirectory listed in `subdirs` is expected to contain a COLRECT.csv
# under `data_dir`; the files are read one by one and stacked row-wise.

data_dir = '/Users/junginpark/data/SEER_1973_2015_TEXTDATA/incidence/csv'
subdirs = ['yr1973_2015.seer9', 'yr1992_2015.sj_la_rg_ak', 'yr2000_2015.ca_ky_lo_nj_ga', 'yr2005.lo_2nd_half']

dfs = []
for subdir in subdirs:
    # Build the path from data_dir instead of repeating the absolute path,
    # so the input location is defined in exactly one place.
    csv_filename = os.path.join(data_dir, subdir, 'COLRECT.csv')
    dfs.append(pd.read_csv(csv_filename, low_memory=False))
    print(f'Number of rows: {dfs[-1].shape[0]} x columns: {dfs[-1].shape[1]}')

# ignore_index is not set, so each file keeps its own row labels and labels
# may repeat across the stacked frame.
df = pd.concat(dfs, axis=0)

display(df.head())

print(f'Raw input - number of rows: {df.shape[0]} x columns: {df.shape[1]}')

In [None]:
# Basic clean-up: drop the 'Unnamed: 0' column, then keep the first row for each PUBCSNUM value
df_cleaned = (
    df
    .drop(columns=['Unnamed: 0'])
    .drop_duplicates(subset='PUBCSNUM')
)

# Sanity check: every remaining PUBCSNUM value occurs exactly once
assert df_cleaned['PUBCSNUM'].is_unique

print(f'After cleaning - number of rows: {df_cleaned.shape[0]} x columns: {df_cleaned.shape[1]}')

In [None]:
# Read curation and run feature selection
curation = pd.read_excel('/Users/junginpark/data/SEER_1973_2015_TEXTDATA/inclusion.xlsx', sheet_name='Sheet2')

# NOTE: the column key below ends with a non-breaking space (\xa0) - presumably
# it mirrors the header exactly as stored in the sheet; verify if the sheet is re-exported.
feature_names = [str(name).strip().upper() for name in curation['SAS Variable Name\xa0'].values]
feature_types = [str(kind).strip() for kind in curation['Type'].values]

# Pair each feature with its declared type and keep those marked 'categorical'
categorical_features = [
    name
    for name, kind in zip(feature_names, feature_types)
    if kind == 'categorical'
]

print(f'{len(categorical_features)} categorical features among total {len(feature_names)} features')

# Restrict the frame to the curated columns, in the order they appear in the curation sheet
df_cleaned = df_cleaned[feature_names]
df_cleaned.head()

In [None]:
# Convert categorical features into numerics

def category_to_int(df, column):
    """Return a copy of ``df`` with 0/1 indicator columns for ``column`` appended.

    One indicator column is created per distinct value of ``df[column]``.
    The prefix passed to ``pd.get_dummies`` is ``column + '_'`` and the
    default separator is ``'_'``, so the new columns are named
    ``<column>__<value>`` (two underscores).

    The indicator dtype is pinned to ``uint8``: pandas >= 2.0 would otherwise
    produce ``bool`` columns, whereas earlier versions produced ``uint8``.
    Pinning keeps the result numeric on every pandas version.

    Args:
        df: Source DataFrame; it is not modified.
        column: Name of the column in ``df`` to encode.

    Returns:
        A new DataFrame holding all original columns (including ``column``
        itself; dropping it is left to the caller) followed by the
        indicator columns.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    indicators = pd.get_dummies(df[column], prefix=column + '_', dtype='uint8')
    return pd.concat([df, indicators], axis=1)

# Append indicator columns for one categorical feature at a time ...
df_converted = df_cleaned
for categorical_column in categorical_features:
    df_converted = category_to_int(df_converted, categorical_column)

# ... then discard the original categorical columns, leaving only the encoded form.
df_converted = df_converted.drop(columns=categorical_features)
df_converted.head()

In [None]:
# Sanity check - no NaN

# Per-column count of missing values; none of the counts may be non-zero.
nan_sum = df_converted.isna().sum()
assert not nan_sum.any()

In [None]:
# Write output: save the converted frame under data_dir, first as CSV, then as pickle
for output_basename, write_output in (
    ('COLRECT_converted.csv', df_converted.to_csv),
    ('COLRECT_converted.pickle', df_converted.to_pickle),
):
    output_filename = os.path.join(data_dir, output_basename)
    write_output(output_filename)