In [3]:
import numpy as np
import os
import pandas as pd
import utils

In [2]:
# Data loading and merging

data_dir = 'output'
pkl_filenames = sorted(utils.get_all_files(data_dir, 'yr*.pickle'))

dfs = []
for pkl_filename in pkl_filenames:
    # Load each colrect data
    dfs.append(pd.read_pickle(pkl_filename))
    print(f'Number of rows: {dfs[-1].shape[0]} x columns: {dfs[-1].shape[1]}')

# Concatenate dataframes (like UNION in SQL..)
df = pd.concat(dfs, axis=0)

display(df.head())

print(f'Raw input - number of rows: {df.shape[0]} x columns: {df.shape[1]}')

Number of rows: 554686 x columns: 133
Number of rows: 123092 x columns: 133
Number of rows: 344036 x columns: 133
Number of rows: 1185 x columns: 133


Unnamed: 0,ADJAJCCSTG,ADJM_6VALUE,ADJNM_6VALUE,ADJTM_6VALUE,AGE_1REC,AGE_DX,AJCC_STG,AJ_3SEER,ANNARBOR,AYASITERWHO,...,SURGSITF,TUMOR_1V,TUMOR_2V,TUMOR_3V,TYPE_FU,T_VALUE,VASINV,VSRTSADX,YEAR_DX,YR_BRTH
0,,,,,17,83,,,8,42,...,,9,9,9,2,,,9,1975,1892
1,,,,,17,80,,,8,42,...,,9,9,9,2,,,9,1977,1896
2,,,,,16,78,,,8,99,...,,9,9,9,2,,,9,1986,1908
3,,,,,16,75,20.0,20.0,8,42,...,,9,9,9,2,30.0,,9,1989,1914
4,,,,,15,70,,,8,42,...,,9,9,9,2,,,9,1973,1903


Raw input - number of rows: 1022999 x columns: 133


In [30]:
# Basic clean-up
df_cleaned = df
if 'Unnamed: 0' in df.columns:
    df_cleaned = df_cleaned.drop(columns=['Unnamed: 0'])
df_cleaned = df_cleaned.drop_duplicates(subset='PUBCSNUM')

# Sanity check
assert len(df_cleaned.PUBCSNUM.unique()) == len(df_cleaned.PUBCSNUM)

print(f'After cleaning - number of rows: {df_cleaned.shape[0]} x columns: {df_cleaned.shape[1]}')

After cleaning - number of rows: 971719 x columns: 133


In [31]:
# Select YEAR_DX >= 2005
df_cleaned['YEAR_DX'] = pd.to_numeric(df_cleaned.YEAR_DX)
df_cleaned = df_cleaned.loc[df_cleaned['YEAR_DX'] >= 2005]

print(f'After selecting (YEAR_DX >= 2005) - number of rows: {df_cleaned.shape[0]} x columns: {df_cleaned.shape[1]}')

# Remove death-indicating features
death_related = ['CODPUB', 'CODPUBKM', 'STAT_REC', 'VSRTSADX', 'ODTHCLASS']
df_cleaned = df_cleaned.drop(columns=death_related)

print(f'After removing death-related features - number of rows: {df_cleaned.shape[0]} x columns: {df_cleaned.shape[1]}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


After selecting (YEAR_DX >= 2005) - number of rows: 412948 x columns: 133
After removing death-related features - number of rows: 412948 x columns: 128


In [32]:
# Read curation and run feature selection
curation = pd.read_excel('inclusion.xlsx', sheet_name='Sheet2')

feature_names = [str(x).strip().upper() for x in curation['SAS Variable Name\xa0'].values]
feature_types = [str(x).strip() for x in curation['Type'].values]

categorical_features = [feature_names[i] for i in range(len(feature_names)) if feature_types[i] == 'categorical']

print(f'{len(categorical_features)} categorical features among total {len(feature_names)} features')

to_drop = list(set(feature_names) - set(death_related))
df_cleaned = df_cleaned[to_drop]

print('Writing curated dataframe...')
df_cleaned.to_csv('output/COLRECT_curated.csv')

display(df_cleaned.head())

38 categorical features among total 47 features
Writing curated dataframe...


Unnamed: 0,BEHO2V,REG,REPT_SRC,HISTREC,AGE_1REC,PUBCSNUM,DX_CONF,BENBORDCOUNT,PRIMSITE,MDXRECMP,...,AYASITERWHO,BEHO3V,RAC_RECY,SEX,ST_CNTY,HISTO3V,YR_BRTH,INTPRIM,ICCC3WHO,REC_NO
8,3,1502,1,0,17,7000080,1,0,C180,6,...,56,3,1,2,9009,8000,1933,1,122,1
9,3,1502,1,5,16,7000085,1,0,C199,2,...,42,3,1,1,9011,8210,1929,1,116,2
32,3,1502,1,5,18,7000321,1,0,C187,9,...,42,3,1,2,9009,8210,1926,1,116,1
44,3,1502,1,8,18,7000411,1,0,C187,7,...,42,3,1,2,9011,8480,1918,1,116,1
57,3,1502,1,5,17,7000510,1,0,C187,5,...,42,3,1,1,9001,8140,1929,1,116,2


In [33]:
# Filter NULL values

df_filtered = df_cleaned

if 'MAR_STAT' in df_filtered.columns:
    df_filtered = df_filtered.loc[df_filtered['MAR_STAT'] != '9']
if 'RACE1V' in df_filtered.columns:
    df_filtered = df_filtered.loc[df_filtered['RACE1V'] != '99']
if 'AGE_DX' in df_filtered.columns:
    df_filtered = df_filtered.loc[df_filtered['AGE_DX'] != '999']
if 'SEQ_NUM' in df_filtered.columns:
    df_filtered = df_filtered.loc[df_filtered['SEQ_NUM'] != '99']
if 'SEQ_NUM' in df_filtered.columns:
    df_filtered = df_filtered.loc[df_filtered['SEQ_NUM'] != '88']
if 'LATERAL' in df_filtered.columns:
    df_filtered = df_filtered.loc[df_filtered['LATERAL'] != '9']
if 'GRADE' in df_filtered.columns:
    df_filtered = df_filtered.loc[df_filtered['GRADE'] != '9']
if 'DX_CONF' in df_filtered.columns:
    df_filtered = df_filtered.loc[df_filtered['DX_CONF'] != '9']
if 'NO_SURG' in df_filtered.columns:
    df_filtered = df_filtered.loc[df_filtered['NO_SURG'] == '0']
if 'AGE_1REC' in df_filtered.columns:
    df_filtered = df_filtered.loc[df_filtered['AGE_1REC'] != '99']
if 'RAC_RECA' in df_filtered.columns:
    df_filtered = df_filtered.loc[df_filtered['RAC_RECA'] != '9']
if 'RAC_RECY' in df_filtered.columns:
    df_filtered = df_filtered.loc[df_filtered['RAC_RECY'] != '9']
if 'HST_STGA' in df_filtered.columns:
    df_filtered = df_filtered.loc[df_filtered['HST_STGA'] != '9']
if 'SRV_TIME_MON' in df_filtered.columns:
    df_filtered = df_filtered.loc[df_filtered['SRV_TIME_MON'] != '9999']
if 'MALIGCOUNT' in df_filtered.columns:
    df_filtered = df_filtered.loc[df_filtered['MALIGCOUNT'] != '99']
if 'BENBORDCOUNT' in df_filtered.columns:
    df_filtered = df_filtered.loc[df_filtered['BENBORDCOUNT'] != '99']

to_exclude = ['YR_BRTH', 'ICCC3WHO', 'ICCC3XWHO', 'NO_SURG']
to_exclude = list(set(df_filtered.columns).intersection(set(to_exclude)))
df_filtered = df_filtered.drop(columns=to_exclude, axis=1)

print(f'After filtering - number of rows: {df_filtered.shape[0]} x columns: {df_filtered.shape[1]}')

After filtering - number of rows: 277984 x columns: 38


In [34]:
print('Writing filtered dataframe...')
filtered_filename = os.path.join(data_dir, 'COLRECT_filtered.csv')
df_filtered.to_csv(filtered_filename)

filtered_filename = os.path.join(data_dir, 'COLRECT_filtered.pickle')
df_filtered.to_pickle(filtered_filename)

Writing filtered dataframe...


In [35]:
# Features to exclude for ML

to_exclude = ['SRV_TIME_MON_FLAG']
df_filtered = df_filtered.drop(columns=to_exclude, axis=1)

In [36]:
# Convert categorical features into numerics

def category_to_int(df, column):
    return pd.concat([df, pd.get_dummies(df[column], prefix=column + '_')], axis=1)

df_converted = df_filtered
to_exclude = []
for feature in categorical_features:
    if feature in df_converted.columns:
        to_exclude.append(feature)
        df_converted = category_to_int(df_converted, feature)

df_converted = df_converted.drop(columns=to_exclude, axis=1)
display(df_converted.head())

Unnamed: 0,PUBCSNUM,BENBORDCOUNT,MDXRECMP,AGE_DX,MALIGCOUNT,SRV_TIME_MON,YEAR_DX,REC_NO,REG__0000001501,REG__0000001502,...,AYASITERWHO__52,AYASITERWHO__55,AYASITERWHO__56,AYASITERWHO__99,INTPRIM__0,INTPRIM__1,INTPRIM__9,CSSCHEMA__25,CSSCHEMA__26,CSSCHEMA__36
9,7000085,0,2,77,4,47,2007,2,0,1,...,0,0,0,0,0,1,0,0,1,0
32,7000321,0,9,86,2,39,2012,1,0,1,...,0,0,0,0,0,1,0,1,0,0
44,7000411,0,7,88,2,38,2006,1,0,1,...,0,0,0,0,0,1,0,1,0,0
57,7000510,0,5,82,3,16,2011,2,0,1,...,0,0,0,0,0,1,0,1,0,0
73,7000635,0,5,83,2,67,2010,1,0,1,...,0,0,0,0,0,1,0,1,0,0


In [37]:
# Sanity check - no NaN

nan_sum = df_converted.isna().sum()
assert nan_sum.values.sum() == 0

In [38]:
# Write output
print('Writing pivoted dataframe...')
output_filename = os.path.join(data_dir, 'COLRECT_pivoted.csv')
df_converted.to_csv(output_filename)

output_filename = os.path.join(data_dir, 'COLRECT_pivoted.pickle')
df_converted.to_pickle(output_filename)

Writing pivoted dataframe...
