In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# Build one merged surgery table: start from the all-procedures extract, fold
# in each procedure-specific cohort extract, and record cohort membership in a
# 0/1 indicator column named after the cohort.
fpath = os.path.expanduser("~/dropbox/sts-data/outdated-format/mgh-all-features-labels.csv")
df = pd.read_csv(fpath, low_memory=False)

# Maps each cohort CSV path to the name of the indicator column it populates.
cohorts = {
    os.path.expanduser("~/dropbox/sts-data/outdated-format/mgh-avr-features-labels.csv"): "opavr",
    os.path.expanduser("~/dropbox/sts-data/outdated-format/mgh-others-features-labels.csv"): "opother",
}

# Columns that identify a row when the cohort file carries an MRN column.
key_cols = ['medrecn', 'surgdt']

# Columns compared to detect duplicate rows when the cohort file has no MRN
# column. Each column is listed once.
feature_cols = [
    'surgdt', 'gender', 'diabetes',
    'dialysis', 'hypertn', 'cva', 'infendty', 'chrlungd', 'immsupp', 'pvd',
    'cvd', 'prcvint', 'chf', 'carshock', 'resusc',
    'classnyh', 'medinotr', 'medster', 'numdisv', 'vdstena', 'vdstenm',
    'mtopd', 'status', 'opcab', 'opvalve', 'ethnicity', 'raceblack',
    'cvdtia', 'medgp', 'pocpciin', 'pocpci', 'incidenc',
    'racecaucasian', 'raceasian', 'racenativeam',
    'raceothernativepacific', 'cvdpcarsurg', 'medadp5days',
]

for fpath, cohort in cohorts.items():
    print(f"Parsing {cohort} at {fpath}")

    df_subset = pd.read_csv(fpath, low_memory=False)
    df_subset[cohort] = 1

    # Remove this column from the cohort extract before merging, if present.
    df_subset = df_subset.drop(columns='long-term-afib', errors='ignore')

    if 'medrecn' in df_subset:
        cols = key_cols

        # Boolean mask of rows in df whose (medrecn, surgdt) also occur in df_subset
        a_index = df.set_index(cols).index
        b_index = df_subset.set_index(cols).index
        mask = a_index.isin(b_index)

        # Replace the matching rows of df with the versions from df_subset.
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent (outer join on columns, original index labels kept).
        # NOTE(review): a row replaced here loses any indicator set by an
        # earlier cohort; confirm cohorts are mutually exclusive.
        df = pd.concat([df[~mask], df_subset])

    # No MRN, so append rows and match them on the feature columns instead
    else:
        cols = feature_cols
        df = pd.concat([df, df_subset])

        # Every row taking part in a duplicate group is flagged as a cohort
        # member, so the surviving (first) row keeps the flag.
        df.loc[df.duplicated(subset=cols, keep=False), cohort] = 1

    # Drop duplicate rows, keeping the first occurrence
    df = df.drop_duplicates(subset=cols, keep='first')

# Rows appended by a later cohort have no value in earlier cohort columns, so
# fill every indicator only after all cohorts are merged (missing -> 0).
# Assigning the result back avoids the chained in-place fillna, which does
# not reliably modify df under pandas Copy-on-Write.
flag_cols = list(cohorts.values())
df[flag_cols] = df[flag_cols].fillna(value=0)

# Drop rows with NaN MRNs (this includes unmatched rows from MRN-less cohort files)
df = df.dropna(subset=['medrecn'])

# Sort columns alphabetically and reset the row index
df = df.reindex(sorted(df.columns), axis=1)
df = df.reset_index(drop=True)

# Fix improperly encoded binary variables: recode the value 2 as 0
# (presumably 2 means "no" in the source encoding; TODO confirm).
for binary_col in ('opvalve', 'opcab'):
    df.loc[df[binary_col] == 2, binary_col] = 0

In [None]:
# Write the merged table to a single CSV; the row index is not saved.
fpath = os.path.expanduser("~/dropbox/sts-data/sts-mgh.csv")
df.to_csv(path_or_buf=fpath, index=False)
print(f"Saved {fpath}")