## ETL Pipeline

To complete our data pipeline, we will be building on SpIn #4's work. This will read the files created by each form analysis, merge, clean, and transform them for our modeling needs.

In [1]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [2]:
import os
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [3]:


dir_path = '/dsa/groups/casestudy2023su/team03/model_data/'

# Get all files in the directory
file_names = os.listdir(dir_path)

# Filter .csv files
csv_files = [file for file in file_names if file.endswith('.csv')]

# Initialize a list 
dfs = []

# Loop all csv files and read 
for csv_file in csv_files:
    df = pd.read_csv(os.path.join(dir_path, csv_file))
    
    #rename 'id' and 'Id' to 'ID'
    if 'Id' in df.columns:
        df = df.rename(columns={'Id': 'ID'})
    elif 'id' in df.columns:
        df = df.rename(columns={'id': 'ID'})
    
    dfs.append(df)

    # Print name and columns
    print(f"File: {csv_file}")
    print("Columns:", df.columns)
    print()

# Merge all dataframes on 'ID'
merged_df = dfs[0]
for df in dfs[1:]:
    merged_df = pd.merge(merged_df, df, on='ID', how='outer')



File: BMD_results.csv
Columns: Index(['B1TRD', 'B1ITD', 'B1FND', 'B1WDD', 'B1THD', 'ID', 'B1L1D', 'B1L2D',
       'B1L3D', 'B1L4D', 'B1TLD', 'B1TBD', 'B1SBD', 'B1HDD', 'B1LAD', 'B1RAD',
       'B1LRD', 'B1RRD', 'B1TSD', 'B1LSD', 'B1PED', 'B1LLD', 'B1RLD'],
      dtype='object')

File: NP_results.csv
Columns: Index(['ID', 'NPSTAFF', 'NPHIPREP', 'NPHIPSD', 'NPSEAT', 'NPLFST', 'NPSEATLT',
       'NPLEFTNO', 'NPLEFT1', 'NPLEFT6', 'NPLEFT4', 'NPLEFT5', 'NPLEFT3',
       'NPLEFT8', 'NPLEFT2', 'NPLEFT7', 'NPLEFT9', 'NPRGTNO', 'NPRIGHT1',
       'NPRIGHT6', 'NPRIGHT4', 'NPRIGHT5', 'NPRIGHT3', 'NPRIGHT8', 'NPRIGHT2',
       'NPRIGHT7', 'NPRIGHT9', 'NPRMAX', 'NPLMAX', 'NPOMAX', 'NPREASL',
       'NPREASR', 'NPREASB'],
      dtype='object')

File: MH_results.csv
Columns: Index(['ID', 'MHDIAB', 'MHDIABT', 'MHHTHY', 'MHHTHYT', 'MHLTHY', 'MHLTHYT',
       'MHOSTEO', 'MHOSTEOT', 'MHSTRK', 'MHSTRKT', 'MHPARK', 'MHPARKT', 'MHBP',
       'MHBPT', 'MHMI', 'MHMIT', 'MHANGIN', 'MHANGINT', 'MHCHF', 'MHCHFT'

In [4]:
merged_df.columns = merged_df.columns.str.upper()

In [5]:
merged_df = merged_df.drop(['ID', 'STAFFID#', 'NPSTAFF'], axis=1)

<h3> Transform Target Variable into Binary Response. </h3>

In [6]:
merged_df["FAFXNT_BIN"] = merged_df.FAFXNT.apply(lambda x: 1 if x > 0 else 0) 
merged_df = merged_df.drop(["FAFXNT","FAFXN"],axis=1)

### Cleaning Data
In order to do my feature reduction, we're going to need to perform some more carpentry.  First, let's look at what non-numeric values we have and then and then determine whether  one-hot encoding is necessary.

In [7]:
alphanumeric_features = []

for column in merged_df.columns:
    if merged_df[column].dtype == object:
        alphanumeric_features.append(column)

for feature in alphanumeric_features:
    if feature in merged_df.columns:
        print("Feature:", feature)
        print(merged_df[feature].unique())
        print()
    else:
        print("Feature", feature, "does not exist in the DataFrame.")
        print()

Feature: FFFX50
['0: NO FRACTURE BEFORE BASELINE'
 '1: FRACTURE AFTER AGE 50, BEFORE BASELINE' '2: FRACTURE BEFORE AGE 50'
 nan]

Feature: GIEDUC
['7. Some Grad' '5. Some College' '3. Some Highschool' '4. High School'
 '6. College' '8. Grad School' '2. Elementary' '1. Some Elementary']

Feature: GIERACE
['1. WHITE' '5. OTHER' '2. AFRICAN AMERICAN' '4. HISPANIC' '3. ASIAN']

Feature: GISOC
['11. Management' '43. Office and Administrative Support'
 '13. Business and Financial' '51. Production'
 '49. Installation; Maintenance and Repair' '41. Sales and Related'
 '39. Personal Care and Service'
 '29. Healthcare Practitioners and Technical'
 '21. Community and Social Service' '17. Architecture and Engineering'
 '33. Protective Service' '53. Transportation and Material Moving'
 '47. Construction and Extraction'
 '19. Life; Physical; and Social Science' '55. Military Specific'
 '15. Computer and Mathematical' '25. Education; Training and Library'
 '27. Arts; Design; Entertainment; Sports and 

In [8]:
merged_df["HOWMANYTIMES/MORECEIVETESTOSTINJ"] = merged_df["HOWMANYTIMES/MORECEIVETESTOSTINJ"].replace(["2.0", "3.0", "4.0"], 1)
merged_df["PRESCRIPTMEDFORATLEASTPASTMONTH"] = merged_df["PRESCRIPTMEDFORATLEASTPASTMONTH"].astype(int)
merged_df["TESTOSTERONEINJECTIONSATLEAST1/MO"] = merged_df["TESTOSTERONEINJECTIONSATLEAST1/MO"].astype(int)
merged_df["TAKEACETAMINOPHENATLEAST3X/WEEK"] = merged_df["TAKEACETAMINOPHENATLEAST3X/WEEK"].astype(int)
merged_df["TAKEIBUPROFINATLEAST3X/WEEK"] = merged_df["TAKEIBUPROFINATLEAST3X/WEEK"].astype(int)
merged_df["TAKENAPROXENSODIUMATLEAST3X/WEEK"] = merged_df["TAKENAPROXENSODIUMATLEAST3X/WEEK"].astype(int)
merged_df["HAVEYOUEVERTAKENOSTEOPOROSISMEDS"] = merged_df["HAVEYOUEVERTAKENOSTEOPOROSISMEDS"].astype(int)
merged_df["HOWMANYMONTHSRECEIVEDTESTOSTINJ?"] = merged_df["HOWMANYMONTHSRECEIVEDTESTOSTINJ?"].astype(float)

We don't anticipate numbers greater than 1 indicating a significant different with HOWMANYTIMES/MORECEIVETESTOSTINJ so we're normalizing to binary.

In [9]:
merged_df = merged_df.fillna(0)

### Split the data into Independent and Dependent Variables

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X = merged_df.drop(["FAFXNT_BIN"],axis=1)
y = merged_df[["FAFXNT_BIN"]]

In [12]:
import re
X = pd.get_dummies(data=X,columns=["FFFX50","GIEDUC","GIERACE","GISOC"])
X = X.rename(columns = {"FFFX50_0: NO FRACTURE BEFORE BASELINE":"No Fracture", \
          "FFFX50_1: FRACTURE AFTER AGE 50, BEFORE BASELINE":"Fracture After 50",\
          "FFFX50_2: FRACTURE BEFORE AGE 50":"Fracture Before 50"})

X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [13]:
X.head()

Unnamed: 0,B1TRD,B1ITD,B1FND,B1WDD,B1THD,B1L1D,B1L2D,B1L3D,B1L4D,B1TLD,...,GISOC_37BuildingandGroundsCleaningandMaintenance,GISOC_39PersonalCareandService,GISOC_41SalesandRelated,GISOC_43OfficeandAdministrativeSupport,GISOC_45FarmingFishingandForestry,GISOC_47ConstructionandExtraction,GISOC_49InstallationMaintenanceandRepair,GISOC_51Production,GISOC_53TransportationandMaterialMoving,GISOC_55MilitarySpecific
0,0.816111,1.22431,0.867423,0.784696,1.04645,1.13532,1.17017,1.24459,1.30829,1.21914,...,0,0,0,0,0,0,0,0,0,0
1,0.879885,1.11591,0.740235,0.519868,0.977156,1.23388,1.26274,1.19406,1.19226,1.2186,...,0,0,0,0,0,0,0,0,0,0
2,0.690143,1.12294,0.691843,0.505692,0.938607,0.745454,0.88662,0.964876,0.944702,0.891893,...,0,0,0,1,0,0,0,0,0,0
3,0.854573,1.1414,0.878307,0.648614,1.01722,1.16973,1.31559,1.26198,1.22029,1.24184,...,0,0,0,0,0,0,0,0,0,0
4,0.894594,1.25773,0.919494,0.585505,1.10474,1.03648,1.22224,1.17664,1.24591,1.17382,...,0,0,0,0,0,0,0,0,0,0


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.2, random_state=42)

X_test, X_validation, y_test, y_validation = train_test_split(
...     X_test, y_test, test_size=0.5, random_state=42)

In [15]:
X_train.reset_index(inplace=True)
X_test.reset_index(inplace=True)
X_validation.reset_index(inplace=True)


### Common Transformations
After initial exploratory work on the ML Models, the team agreed that certain transformation steps that were common amongst each member should be carried out on the pipeline.

#### PCA performed on BMD values

In [16]:
pca = PCA(n_components=1)
scale = StandardScaler()

In [17]:
BMD_train = X_train.filter(regex=r"B1.+D")

BMD_validation = X_validation.filter(regex=r"B1.+D")

BMD_test = X_test.filter(regex=r"B1.+D")

In [18]:
BMD_test.shape

(599, 22)

In [19]:
BMD_train_scale = scale.fit_transform(BMD_train)
BMD_val_scale = scale.transform(BMD_validation)
BMD_test_scale = scale.transform(BMD_test)

In [20]:
BMD_train_pca = pca.fit_transform(BMD_train_scale)
BMD_val_pca = pca.transform(BMD_val_scale)
BMD_test_pca = pca.transform(BMD_test_scale)

In [21]:
BMD_train_pca = pd.DataFrame(BMD_train_pca,columns=["BMD_pca"])
BMD_val_pca = pd.DataFrame(BMD_val_pca,columns=["BMD_pca"])
BMD_test_pca = pd.DataFrame(BMD_test_pca,columns=["BMD_pca"])

In [22]:
X_train = X_train.drop(BMD_train.columns,axis=1)
X_train = pd.concat([X_train,BMD_train_pca],axis=1)

X_validation = X_validation.drop(BMD_validation.columns, axis=1)
X_validation = pd.concat([X_validation,BMD_val_pca],axis=1)

X_test = X_test.drop(BMD_test.columns,axis=1)
X_test = pd.concat([X_test,BMD_test_pca],axis=1)

#### Correlation Feature Reduction

In [23]:
# Calculate correlation matrix
correlation_matrix = X_train.corr().abs()

# It's really challening to visualize this since there are so many features
#plt.figure(figsize=(10, 8))
#sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
#plt.show()

# Instead, we'll list the specific feature without the visual
threshold = 0.85
high_corr_features = np.where(correlation_matrix > threshold)
high_corr_pairs = [(X_train.columns[x], X_train.columns[y], correlation_matrix.iloc[x, y]) for x, y in zip(*high_corr_features) if x != y]

for pair in high_corr_pairs:
    print(pair)

('NPHIPREP', 'NPHIPSD', 0.9546800826233092)
('NPHIPSD', 'NPHIPREP', 0.9546800826233092)
('NPSEAT', 'NPLEFTNO', 0.8714209361446218)
('NPSEAT', 'NPRGTNO', 0.8677695007218178)
('NPSEAT', 'NPREASL', 0.8765328503561449)
('NPSEAT', 'NPREASR', 0.8684483226588594)
('NPSEAT', 'NPREASB', 0.9011567849901043)
('NPLFST', 'NPSEATLT', 0.9973617385534578)
('NPSEATLT', 'NPLFST', 0.9973617385534578)
('NPLEFTNO', 'NPSEAT', 0.8714209361446218)
('NPLEFTNO', 'NPRGTNO', 0.9222626903140254)
('NPLEFTNO', 'NPREASL', 0.9310786215431177)
('NPLEFTNO', 'NPREASR', 0.8681931324061465)
('NPLEFTNO', 'NPREASB', 0.90041819398559)
('NPLEFT1', 'NPLEFT6', 0.8754366883726402)
('NPLEFT1', 'NPLEFT4', 0.9017721767695176)
('NPLEFT1', 'NPLEFT5', 0.8874409579639588)
('NPLEFT1', 'NPLEFT3', 0.9195858029881465)
('NPLEFT1', 'NPLEFT8', 0.8623028074181164)
('NPLEFT1', 'NPLEFT2', 0.9396963348492134)
('NPLEFT1', 'NPLEFT7', 0.8706300256346826)
('NPLEFT1', 'NPLEFT9', 0.8587684966579133)
('NPLEFT1', 'NPRIGHT1', 0.8639303696208263)
('NPLEFT1'

In [24]:
dropped_columns = []

for pair in high_corr_pairs:
    feature1 = pair[0]
    feature2 = pair[1]
    if (
        feature1 in X_train.columns
        and feature2 not in dropped_columns
    ):
        X_train.drop(feature2, axis=1, inplace=True)
        dropped_columns.append(feature2)

print(dropped_columns)

['NPHIPSD', 'NPLEFTNO', 'NPRGTNO', 'NPREASL', 'NPREASR', 'NPREASB', 'NPSEATLT', 'NPLEFT6', 'NPLEFT4', 'NPLEFT5', 'NPLEFT3', 'NPLEFT8', 'NPLEFT2', 'NPLEFT7', 'NPLEFT9', 'NPRIGHT1', 'NPRIGHT2', 'NPLMAX', 'NPOMAX', 'NPRIGHT4', 'NPRIGHT5', 'NPRIGHT3', 'NPRIGHT8', 'NPRIGHT7', 'NPRIGHT9', 'NPRMAX', 'MHDIABT', 'MHLTHYT', 'MHPARKT', 'MHBPT', 'MHMIT', 'MHGLAUT', 'MHSTMAGE', 'MHCCAGE', 'MHSCAGE', 'MHLCAGE', 'MHPCAGE', 'MHOCAGE', 'MHDIZTIM', 'MHFALLTM', 'MHWGTKG', 'STANDINGHEIGHT_MEAS1', 'STANDINGHEIGHT_MEAS2', 'AVGHEIGHT_CM', 'WEIGHT_KG', 'ARMSTOSTANDUPCALCVAR', 'TOCOMPLETE5STANDSARMUSEU', 'NOOFSTEPSFORTRIAL1USUALPACE', 'TIMETOCOMPLETETRIAL2USUALPACE', 'TIMETOWALK6MCALCVAR', 'TRIAL2AIDUSEDUSUALPACE', 'TOCOMPLETE6MTRIALSCALC', 'SPEEDMSBOTHTIMESAIDSUCALC', 'SPEEDINMSUSINGBESTTIMECALC', 'SPEEDMSBESTTIMEAIDSUCALC', 'TOCOMPLETE1NWTRIALCALC', 'NWTRIALSCOMPLETEDWOAIDSCALC', 'TOCOMPLETE1NWWOAIDSCALC', 'HOWOLDWHENFIRSTSTARTEDSMOKING', 'SMOKINGSTATUS', 'HOWOLDWHENSTARTEDCHEWINGSNUFF', 'TOBACCOSTATUS', 'DD

In [25]:
for column in dropped_columns:
    if column in X_validation.columns:
        X_validation.drop(column, axis=1, inplace=True)

    if column in X_test.columns:
        X_test.drop(column, axis=1, inplace=True)


In [26]:
print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)
print("X_validation shape: ", X_validation.shape)
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)
print("y_validation shape: ", y_validation.shape)

X_train shape:  (4795, 292)
X_test shape:  (599, 292)
X_validation shape:  (600, 292)
y_train shape:  (4795, 1)
y_test shape:  (599, 1)
y_validation shape:  (600, 1)


#### SMOTE Resampling

In [27]:
# Extract column names from X_train
X_train_columns = X_train.columns.tolist()

In [28]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [29]:
X_train = pd.DataFrame(X_res, columns=X_train_columns)
y_train = pd.DataFrame(y_res, columns=['FAFXNT_BIN'])

### Now we have cleaned datasets, ready to be used for analysis for all team members

In [30]:
# exporting merged datasets as CSV for team work

file_path = "/dsa/groups/casestudy2023su/team03/Data4/"

merged_df.to_csv((file_path+"mros_merged.csv"), index=False)
#os.chmod(file_path+"mros_merged.csv", 0o777)

X_train.to_csv((file_path+"X_train.csv"), index=False)
#os.chmod(file_path+"X_train.csv", 0o777)

X_test.to_csv((file_path+"X_test.csv"), index=False)
#os.chmod(file_path+"X_test.csv", 0o777)

X_validation.to_csv((file_path+"X_validation.csv"), index=False)
#os.chmod((file_path+"X_validation.csv"),0o777)

y_train.to_csv((file_path+"y_train.csv"), index=False)
#os.chmod(file_path+"y_train.csv", 0o777)

y_test.to_csv((file_path+"y_test.csv"), index=False)
#os.chmod(file_path+"y_test.csv", 0o777)

y_validation.to_csv((file_path+"y_validation.csv"), index=False)
#os.chmod((file_path+"y_validation.csv"),0o777)