## Merge Datasets

So far, all EDA analysis was performed on individual forms such as B1 (bone mineral density measures) and various V1 forms (lifestyle, clinical notes, physical activity) with endpoint target data from form FAF23 (fracture endpoint assessment). We progressed from our divide and conquer strategy of data carpentry of the forms last week by outputting our cleaned forms to CSVs in /dsa/groups/casestudy2023su/team03/model_data/. This notebook includes the code used to join those cleaned forms with our target variables. 

In [1]:
import os
import pandas as pd
import numpy as np

dir_path = '/dsa/groups/casestudy2023su/team03/model_data/'

# Get all files in the directory
file_names = os.listdir(dir_path)

# Filter .csv files
csv_files = [file for file in file_names if file.endswith('.csv')]

# Initialize a list 
dfs = []

# Loop all csv files and read 
for csv_file in csv_files:
    df = pd.read_csv(os.path.join(dir_path, csv_file))
    
    #rename 'id' and 'Id' to 'ID'
    if 'Id' in df.columns:
        df = df.rename(columns={'Id': 'ID'})
    elif 'id' in df.columns:
        df = df.rename(columns={'id': 'ID'})
    
    dfs.append(df)

    # Print name and columns
    print(f"File: {csv_file}")
    print("Columns:", df.columns)
    print()

# Merge all dataframes on 'ID'
merged_df = dfs[0]
for df in dfs[1:]:
    merged_df = pd.merge(merged_df, df, on='ID', how='outer')

merged_df = merged_df.loc[:,~merged_df.columns.duplicated()]


File: BMD_results.csv
Columns: Index(['B1TRD', 'B1ITD', 'B1FND', 'B1WDD', 'B1THD', 'ID', 'B1L1D', 'B1L2D',
       'B1L3D', 'B1L4D', 'B1TLD', 'B1TBD', 'B1SBD', 'B1HDD', 'B1LAD', 'B1RAD',
       'B1LRD', 'B1RRD', 'B1TSD', 'B1LSD', 'B1PED', 'B1LLD', 'B1RLD'],
      dtype='object')

File: NP_results.csv
Columns: Index(['ID', 'NPSTAFF', 'NPHIPREP', 'NPHIPSD', 'NPSEAT', 'NPLFST', 'NPSEATLT',
       'NPLEFTNO', 'NPLEFT1', 'NPLEFT6', 'NPLEFT4', 'NPLEFT5', 'NPLEFT3',
       'NPLEFT8', 'NPLEFT2', 'NPLEFT7', 'NPLEFT9', 'NPRGTNO', 'NPRIGHT1',
       'NPRIGHT6', 'NPRIGHT4', 'NPRIGHT5', 'NPRIGHT3', 'NPRIGHT8', 'NPRIGHT2',
       'NPRIGHT7', 'NPRIGHT9', 'NPRMAX', 'NPLMAX', 'NPOMAX', 'NPREASL',
       'NPREASR', 'NPREASB'],
      dtype='object')

File: MH_results.csv
Columns: Index(['ID', 'MHDIAB', 'MHDIABT', 'MHHTHY', 'MHHTHYT', 'MHLTHY', 'MHLTHYT',
       'MHOSTEO', 'MHOSTEOT', 'MHSTRK', 'MHSTRKT', 'MHPARK', 'MHPARKT', 'MHBP',
       'MHBPT', 'MHMI', 'MHMIT', 'MHANGIN', 'MHANGINT', 'MHCHF', 'MHCHFT'

In [2]:
merged_df.columns = merged_df.columns.str.upper()

In [3]:
merged_df = merged_df.drop(['ID', 'STAFFID#', 'NPSTAFF'], axis=1)

In [4]:
merged_df

Unnamed: 0,B1TRD,B1ITD,B1FND,B1WDD,B1THD,B1L1D,B1L2D,B1L3D,B1L4D,B1TLD,...,AVGHEIGHT_CM,BMI,HEIGHTCHANGEFROM25,WEIGHTCHANGEFROM25,FAFXN,FAFXNT,GIEDUC,GIERACE,GISOC,GIAGE1
0,0.816111,1.224310,0.867423,0.784696,1.046450,1.135320,1.17017,1.244590,1.308290,1.219140,...,172.75,29.4211,-0.03,10.68970,1.0,0.0,7. Some Grad,1. WHITE,11. Management,67.0
1,0.879885,1.115910,0.740235,0.519868,0.977156,1.233880,1.26274,1.194060,1.192260,1.218600,...,176.60,31.8717,6.28,20.02170,0.0,0.0,7. Some Grad,1. WHITE,11. Management,67.0
2,0.690143,1.122940,0.691843,0.505692,0.938607,0.745454,0.88662,0.964876,0.944702,0.891893,...,178.45,26.0956,4.43,3.72175,3.0,3.0,5. Some College,1. WHITE,43. Office and Administrative Support,72.0
3,0.854573,1.141400,0.878307,0.648614,1.017220,1.169730,1.31559,1.261980,1.220290,1.241840,...,184.65,28.5667,3.31,36.16540,1.0,1.0,7. Some Grad,1. WHITE,11. Management,65.0
4,0.894594,1.257730,0.919494,0.585505,1.104740,1.036480,1.22224,1.176640,1.245910,1.173820,...,168.75,29.1117,-18.89,17.12940,0.0,0.0,5. Some College,1. WHITE,13. Business and Financial,78.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5989,0.775957,1.222590,0.802973,0.739685,1.013280,1.231880,1.28392,1.365480,1.376750,1.319220,...,165.80,19.7165,1.84,4.30510,0.0,0.0,6. College,3. ASIAN,41. Sales and Related,72.0
5990,0.958203,1.457130,0.940432,0.790663,1.173200,1.089150,1.04624,1.018070,1.087030,1.060090,...,168.95,28.7275,1.23,2.62175,0.0,0.0,5. Some College,1. WHITE,41. Sales and Related,66.0
5991,0.721554,1.059110,0.745348,0.536287,0.901139,0.925581,1.05576,1.040200,0.907316,0.977996,...,171.90,23.6890,-4.26,6.49740,0.0,0.0,5. Some College,3. ASIAN,37. Building and Grounds Cleaning and Maintenance,68.0
5992,0.754828,1.182750,0.747340,0.565667,0.974355,1.051750,1.00766,0.994808,1.067320,1.031840,...,180.95,29.0139,1.93,34.21890,0.0,0.0,6. College,1. WHITE,11. Management,70.0


<h3> Transform Target Variables into Binary Responses. </h3>

In [5]:
merged_df["FAFXN_BIN"] = merged_df.FAFXN.apply(lambda x: 1 if x > 0 else 0) 
merged_df["FAFXNT_BIN"] = merged_df.FAFXNT.apply(lambda x: 1 if x > 0 else 0) 

### Cleaning Data
In order to do my feature reduction, we're going to need to perform some more carpentry.  First, let's look at what non-numeric values we have and then and then determine whether  one-hot encoding is necessary.

In [6]:
alphanumeric_features = []

for column in merged_df.columns:
    if merged_df[column].dtype == object:
        alphanumeric_features.append(column)

for feature in alphanumeric_features:
    if feature in merged_df.columns:
        print("Feature:", feature)
        print(merged_df[feature].unique())
        print()
    else:
        print("Feature", feature, "does not exist in the DataFrame.")
        print()

Feature: FFFX50
['0: NO FRACTURE BEFORE BASELINE'
 '1: FRACTURE AFTER AGE 50, BEFORE BASELINE' '2: FRACTURE BEFORE AGE 50'
 nan]

Feature: GIEDUC
['7. Some Grad' '5. Some College' '3. Some Highschool' '4. High School'
 '6. College' '8. Grad School' '2. Elementary' '1. Some Elementary']

Feature: GIERACE
['1. WHITE' '5. OTHER' '2. AFRICAN AMERICAN' '4. HISPANIC' '3. ASIAN']

Feature: GISOC
['11. Management' '43. Office and Administrative Support'
 '13. Business and Financial' '51. Production'
 '49. Installation; Maintenance and Repair' '41. Sales and Related'
 '39. Personal Care and Service'
 '29. Healthcare Practitioners and Technical'
 '21. Community and Social Service' '17. Architecture and Engineering'
 '33. Protective Service' '53. Transportation and Material Moving'
 '47. Construction and Extraction'
 '19. Life; Physical; and Social Science' '55. Military Specific'
 '15. Computer and Mathematical' '25. Education; Training and Library'
 '27. Arts; Design; Entertainment; Sports and 

In [7]:
merged_df["HOWMANYTIMES/MORECEIVETESTOSTINJ"] = merged_df["HOWMANYTIMES/MORECEIVETESTOSTINJ"].replace(["2.0", "3.0", "4.0"], 1)
merged_df["PRESCRIPTMEDFORATLEASTPASTMONTH"] = merged_df["PRESCRIPTMEDFORATLEASTPASTMONTH"].astype(int)
merged_df["TESTOSTERONEINJECTIONSATLEAST1/MO"] = merged_df["TESTOSTERONEINJECTIONSATLEAST1/MO"].astype(int)
merged_df["TAKEACETAMINOPHENATLEAST3X/WEEK"] = merged_df["TAKEACETAMINOPHENATLEAST3X/WEEK"].astype(int)
merged_df["TAKEIBUPROFINATLEAST3X/WEEK"] = merged_df["TAKEIBUPROFINATLEAST3X/WEEK"].astype(int)
merged_df["TAKENAPROXENSODIUMATLEAST3X/WEEK"] = merged_df["TAKENAPROXENSODIUMATLEAST3X/WEEK"].astype(int)
merged_df["HAVEYOUEVERTAKENOSTEOPOROSISMEDS"] = merged_df["HAVEYOUEVERTAKENOSTEOPOROSISMEDS"].astype(int)
merged_df["HOWMANYMONTHSRECEIVEDTESTOSTINJ?"] = merged_df["HOWMANYMONTHSRECEIVEDTESTOSTINJ?"].astype(float)

We don't anticipate numbers greater than 1 indicating a significant different with HOWMANYTIMES/MORECEIVETESTOSTINJ so we're normalizing to binary.

In [8]:
merged_df = merged_df.fillna(0)

### Split the data into Independent and Dependent Variables

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X = merged_df.drop(["FAFXN_BIN","FAFXNT_BIN","FAFXN","FAFXNT"],axis=1)
y = merged_df[["FAFXN_BIN","FAFXNT_BIN","FAFXN","FAFXNT"]]

In [11]:
import re
X = pd.get_dummies(data=X,columns=["FFFX50","GIEDUC","GIERACE","GISOC"])
X = X.rename(columns = {"FFFX50_0: NO FRACTURE BEFORE BASELINE":"No Fracture", \
          "FFFX50_1: FRACTURE AFTER AGE 50, BEFORE BASELINE":"Fracture After 50",\
          "FFFX50_2: FRACTURE BEFORE AGE 50":"Fracture Before 50"})

X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [12]:
X.head()

Unnamed: 0,B1TRD,B1ITD,B1FND,B1WDD,B1THD,B1L1D,B1L2D,B1L3D,B1L4D,B1TLD,...,GISOC_37BuildingandGroundsCleaningandMaintenance,GISOC_39PersonalCareandService,GISOC_41SalesandRelated,GISOC_43OfficeandAdministrativeSupport,GISOC_45FarmingFishingandForestry,GISOC_47ConstructionandExtraction,GISOC_49InstallationMaintenanceandRepair,GISOC_51Production,GISOC_53TransportationandMaterialMoving,GISOC_55MilitarySpecific
0,0.816111,1.22431,0.867423,0.784696,1.04645,1.13532,1.17017,1.24459,1.30829,1.21914,...,0,0,0,0,0,0,0,0,0,0
1,0.879885,1.11591,0.740235,0.519868,0.977156,1.23388,1.26274,1.19406,1.19226,1.2186,...,0,0,0,0,0,0,0,0,0,0
2,0.690143,1.12294,0.691843,0.505692,0.938607,0.745454,0.88662,0.964876,0.944702,0.891893,...,0,0,0,1,0,0,0,0,0,0
3,0.854573,1.1414,0.878307,0.648614,1.01722,1.16973,1.31559,1.26198,1.22029,1.24184,...,0,0,0,0,0,0,0,0,0,0
4,0.894594,1.25773,0.919494,0.585505,1.10474,1.03648,1.22224,1.17664,1.24591,1.17382,...,0,0,0,0,0,0,0,0,0,0


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.2, random_state=42)

X_test, X_validation, y_test, y_validation = train_test_split(
...     X_test, y_test, test_size=0.5, random_state=42)

### Now we have cleaned datasets, ready to be used for analysis for all team members

In [14]:
# exporting merged datasets as CSV for team work

file_path = "/dsa/groups/casestudy2023su/team03/merged_data/"

merged_df.to_csv((file_path+"mros_merged.csv"), index=False)
os.chmod(file_path+"mros_merged.csv", 0o777)

X_train.to_csv((file_path+"X_train.csv"), index=False)
os.chmod(file_path+"X_train.csv", 0o777)

X_test.to_csv((file_path+"X_test.csv"), index=False)
os.chmod(file_path+"X_test.csv", 0o777)

X_validation.to_csv((file_path+"X_validation.csv"), index=False)
os.chmod((file_path+"X_validation"),0o777)

y_train.to_csv((file_path+"y_train.csv"), index=False)
os.chmod(file_path+"y_train.csv", 0o777)

y_test.to_csv((file_path+"y_test.csv"), index=False)
os.chmod(file_path+"y_test.csv", 0o777)

y_validation.to_csv((file_path+"y_validation.csv"), index=False)
os.chmod((file_path+"y_validation"),0o777)

PermissionError: [Errno 1] Operation not permitted: '/dsa/groups/casestudy2023su/team03/merged_data/mros_merged.csv'