# Predicting Osteoporosis Pipeline

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
import matplotlib.pyplot as plt
import pickle

### Loading, exploration, cleaning, and preparation

In [2]:
# loading the dataset
df = pd.read_csv('osteoporosis_data.csv')

In [3]:
# looking at the size of the dataset
df.shape

(1566, 16)

In [4]:
# looking at first few records
df.head()

Unnamed: 0,Id,Age,Gender,Hormonal Changes,Family History,Race/Ethnicity,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,Medical Conditions,Medications,Prior Fractures,Osteoporosis
0,1734616,69,Female,Normal,Yes,Asian,Underweight,Low,Sufficient,Sedentary,Yes,Moderate,Rheumatoid Arthritis,Corticosteroids,Yes,1
1,1419098,32,Female,Normal,Yes,Asian,Underweight,Low,Sufficient,Sedentary,No,,,,Yes,1
2,1797916,89,Female,Postmenopausal,No,Caucasian,Normal,Adequate,Sufficient,Active,No,Moderate,Hyperthyroidism,Corticosteroids,No,1
3,1805337,78,Female,Normal,No,Caucasian,Underweight,Adequate,Insufficient,Sedentary,Yes,,Rheumatoid Arthritis,Corticosteroids,No,1
4,1799320,41,Male,Normal,Yes,Caucasian,Normal,Low,Sufficient,Active,Yes,Moderate,Rheumatoid Arthritis,Corticosteroids,Yes,1


In [5]:
# looking at random sample of 10 records
df.sample(10)

Unnamed: 0,Id,Age,Gender,Hormonal Changes,Family History,Race/Ethnicity,Body Weight,Calcium Intake,Vitamin D Intake,Physical Activity,Smoking,Alcohol Consumption,Medical Conditions,Medications,Prior Fractures,Osteoporosis
505,1104327,68,Male,Normal,No,Caucasian,Normal,Adequate,Sufficient,Active,No,,Hyperthyroidism,,Yes,1
815,1571488,18,Male,Normal,No,Caucasian,Underweight,Low,Sufficient,Sedentary,Yes,Moderate,Rheumatoid Arthritis,,No,0
585,1482002,76,Male,Normal,Yes,Asian,Normal,Low,Insufficient,Sedentary,No,,Rheumatoid Arthritis,,No,1
278,1376473,57,Male,Normal,Yes,African American,Normal,Adequate,Sufficient,Active,No,,Rheumatoid Arthritis,,No,1
49,1986851,35,Male,Postmenopausal,No,Caucasian,Underweight,Low,Insufficient,Sedentary,No,,Rheumatoid Arthritis,,No,1
80,1234987,40,Female,Postmenopausal,Yes,Caucasian,Underweight,Low,Insufficient,Active,No,Moderate,Hyperthyroidism,,Yes,1
866,1496517,20,Female,Normal,Yes,Asian,Normal,Low,Insufficient,Active,No,,,Corticosteroids,Yes,0
314,1719175,86,Female,Normal,No,Asian,Normal,Adequate,Sufficient,Active,Yes,,Hyperthyroidism,Corticosteroids,No,1
995,1650070,20,Female,Postmenopausal,Yes,African American,Normal,Low,Insufficient,Active,Yes,Moderate,,Corticosteroids,No,0
190,1904899,50,Male,Postmenopausal,Yes,African American,Normal,Adequate,Sufficient,Sedentary,No,Moderate,,Corticosteroids,No,1


In [6]:
# removing id variable
df.drop(columns=['Id'], inplace=True)

In [7]:
# looking at the numeric variables
df.describe()

Unnamed: 0,Age,Osteoporosis
count,1566.0,1566.0
mean,39.134738,0.503831
std,21.269856,0.500145
min,18.0,0.0
25%,21.0,0.0
50%,32.0,1.0
75%,53.75,1.0
max,90.0,1.0


In [8]:
# looking at data types of the variables
df.dtypes

Age                     int64
Gender                 object
Hormonal Changes       object
Family History         object
Race/Ethnicity         object
Body Weight            object
Calcium Intake         object
Vitamin D Intake       object
Physical Activity      object
Smoking                object
Alcohol Consumption    object
Medical Conditions     object
Medications            object
Prior Fractures        object
Osteoporosis            int64
dtype: object

In [9]:
# checking for any NA values
df.isna().sum()

Age                      0
Gender                   0
Hormonal Changes         0
Family History           0
Race/Ethnicity           0
Body Weight              0
Calcium Intake           0
Vitamin D Intake         0
Physical Activity        0
Smoking                  0
Alcohol Consumption    781
Medical Conditions     518
Medications            772
Prior Fractures          0
Osteoporosis             0
dtype: int64

In [10]:
# replacing NA values with "None", as this is the actual meaning in the context of the variables
df['Alcohol Consumption'] = df['Alcohol Consumption'].where(pd.notna(df['Alcohol Consumption']), "None")
df['Medical Conditions'] = df['Medical Conditions'].where(pd.notna(df['Medical Conditions']), "None")
df['Medications'] = df['Medications'].where(pd.notna(df['Medications']), "None")

In [11]:
# checking NA values again
df.isna().sum()

Age                    0
Gender                 0
Hormonal Changes       0
Family History         0
Race/Ethnicity         0
Body Weight            0
Calcium Intake         0
Vitamin D Intake       0
Physical Activity      0
Smoking                0
Alcohol Consumption    0
Medical Conditions     0
Medications            0
Prior Fractures        0
Osteoporosis           0
dtype: int64

In [12]:
# looking at unique values for each variable
df.nunique()

Age                    73
Gender                  2
Hormonal Changes        2
Family History          2
Race/Ethnicity          3
Body Weight             2
Calcium Intake          2
Vitamin D Intake        2
Physical Activity       2
Smoking                 2
Alcohol Consumption     2
Medical Conditions      3
Medications             2
Prior Fractures         2
Osteoporosis            2
dtype: int64

In [13]:
# looking at the unique values of categorical variables
df['Hormonal Changes'].unique()

array(['Normal', 'Postmenopausal'], dtype=object)

In [14]:
df['Family History'].unique()

array(['Yes', 'No'], dtype=object)

In [15]:
df['Race/Ethnicity'].unique()

array(['Asian', 'Caucasian', 'African American'], dtype=object)

In [16]:
df['Body Weight'].unique()

array(['Underweight', 'Normal'], dtype=object)

In [17]:
df['Calcium Intake'].unique()

array(['Low', 'Adequate'], dtype=object)

In [18]:
df['Vitamin D Intake'].unique()

array(['Sufficient', 'Insufficient'], dtype=object)

In [19]:
df['Physical Activity'].unique()

array(['Sedentary', 'Active'], dtype=object)

In [20]:
df['Smoking'].unique()

array(['Yes', 'No'], dtype=object)

In [21]:
df['Alcohol Consumption'].unique()

array(['Moderate', 'None'], dtype=object)

In [22]:
df['Medical Conditions'].unique()

array(['Rheumatoid Arthritis', 'None', 'Hyperthyroidism'], dtype=object)

In [23]:
df['Medications'].unique()

array(['Corticosteroids', 'None'], dtype=object)

In [24]:
df['Prior Fractures'].unique()

array(['Yes', 'No'], dtype=object)

In [25]:
# creating feature set by dropping the outcome variable
X = df.drop('Osteoporosis', axis='columns')
# copying the outcome variable to y
y = df.Osteoporosis

In [26]:
# splitting the training data into 70%-30% splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify=y)

### Building the ML pipeline

In [27]:
# encoding all categoric variables
column_transformer = ColumnTransformer([
    ('cat', OneHotEncoder(), ['Gender',
                              'Hormonal Changes',
                              'Family History',
                              'Race/Ethnicity',
                              'Body Weight',
                              'Calcium Intake',
                              'Vitamin D Intake',
                              'Physical Activity',
                              'Smoking',
                              'Alcohol Consumption',
                              'Medical Conditions',
                              'Medications',
                              'Prior Fractures'])
])

In [28]:
# creating a pipeline using Logistic Regression
pipeline_LR = Pipeline([
    ('ct', column_transformer),
    ('clf', LogisticRegression(max_iter=1000))
])

In [29]:
# creating a pipeline using Random Forest Regressor
pipeline_RF = Pipeline([
    ('ct', column_transformer),
    ('clf', RandomForestClassifier())
])

In [30]:
# creating a pipeline using Decision Tree Regressor
pipeline_DT = Pipeline([
    ('ct', column_transformer),
    ('clf', DecisionTreeClassifier())
])

### Fitting the ML model pipeline and checking cross validation accuracy

In [31]:
# fitting the pipeline
pipeline_LR.fit(X_train, y_train)
pipeline_RF.fit(X_train, y_train)
pipeline_DT.fit(X_train, y_train)

In [32]:
scores_LR = cross_val_score(pipeline_LR, X_train, y_train, scoring='accuracy', cv=5)
scores_RF = cross_val_score(pipeline_RF, X_train, y_train, scoring='accuracy', cv=5)
scores_DT = cross_val_score(pipeline_DT, X_train, y_train, scoring='accuracy', cv=5)

In [33]:
# model training accuracy
mean_f1_LR = scores_LR.mean()
mean_f1_RF = scores_RF.mean()
mean_f1_DT = scores_DT.mean()

print("Accuracy (LR):", mean_f1_LR)
print("Accuracy (RF):", mean_f1_RF)
print("Accuracy (DT):", mean_f1_DT)

Accuracy (LR): 0.5073059360730593
Accuracy (RF): 0.4999916977999169
Accuracy (DT): 0.5026857617268575


## Attempting to improve model performance by tuning hyperparameters

In [34]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [35]:
df_encoded = df.copy(deep=True)
encoder = OneHotEncoder()
df_encoded = pd.get_dummies(df_encoded, columns=['Gender', 'Hormonal Changes', 'Family History', 'Race/Ethnicity', 'Body Weight', 'Calcium Intake', 'Vitamin D Intake', 'Physical Activity', 'Smoking', 'Alcohol Consumption', 'Medical Conditions', 'Medications', 'Prior Fractures'])

In [36]:
df_encoded.head()

Unnamed: 0,Age,Osteoporosis,Gender_Female,Gender_Male,Hormonal Changes_Normal,Hormonal Changes_Postmenopausal,Family History_No,Family History_Yes,Race/Ethnicity_African American,Race/Ethnicity_Asian,...,Smoking_Yes,Alcohol Consumption_Moderate,Alcohol Consumption_None,Medical Conditions_Hyperthyroidism,Medical Conditions_None,Medical Conditions_Rheumatoid Arthritis,Medications_Corticosteroids,Medications_None,Prior Fractures_No,Prior Fractures_Yes
0,69,1,True,False,True,False,False,True,False,True,...,True,True,False,False,False,True,True,False,False,True
1,32,1,True,False,True,False,False,True,False,True,...,False,False,True,False,True,False,False,True,False,True
2,89,1,True,False,False,True,True,False,False,False,...,False,True,False,True,False,False,True,False,True,False
3,78,1,True,False,True,False,True,False,False,False,...,True,False,True,False,False,True,True,False,True,False
4,41,1,False,True,True,False,False,True,False,False,...,True,True,False,False,False,True,True,False,False,True


In [37]:
# creating feature set by dropping the outcome variable
X_encoded = df_encoded.drop('Osteoporosis', axis='columns')
# copying the outcome variable to y
y_encoded = df_encoded.Osteoporosis

In [38]:
# splitting the training data into 70%-30% splits
X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded = train_test_split(X_encoded, y_encoded, test_size = 0.3, stratify=y)

In [39]:
clf = DecisionTreeClassifier()

param_grid = {
    'criterion' : ["gini", "entropy"],
    'max_depth': np.arange(2,21,1)
}    

grid = GridSearchCV(estimator = clf, param_grid = param_grid, cv=3)
grid.fit(X_train_encoded, y_train_encoded)
print(f"Best parameters are: {grid.best_params_}")
print(f"The cross-validation accuracy is: {round(grid.best_score_,4)}")

# evaluation
y_predict_encoded = grid.best_estimator_.predict(X_test_encoded)
print(f"The testing accuracy is: {accuracy_score(y_test_encoded, y_predict_encoded).round(4)}")
print("The confusion matrix is:")
cm = confusion_matrix(y_test_encoded, y_predict_encoded)
print(cm)

Best parameters are: {'criterion': 'entropy', 'max_depth': 6}
The cross-validation accuracy is: 0.8978
The testing accuracy is: 0.8851
The confusion matrix is:
[[225   8]
 [ 46 191]]


In [41]:
clf = RandomForestClassifier()

param_grid = {
    'criterion' : ["gini", "entropy"],
    'max_depth': np.arange(2,21,1)
}    

grid = GridSearchCV(estimator = clf, param_grid = param_grid, cv=3)
grid.fit(X_train_encoded, y_train_encoded)
print(f"Best parameters are: {grid.best_params_}")
print(f"The cross-validation accuracy is: {round(grid.best_score_,4)}")

# evaluation
y_predict_encoded = grid.best_estimator_.predict(X_test_encoded)
print(f"The testing accuracy is: {accuracy_score(y_test_encoded, y_predict_encoded).round(4)}")
print("The confusion matrix is:")
cm = confusion_matrix(y_test_encoded, y_predict_encoded)
print(cm)

Best parameters are: {'criterion': 'entropy', 'max_depth': 4}
The cross-validation accuracy is: 0.8631
The testing accuracy is: 0.8468
The confusion matrix is:
[[232   1]
 [ 71 166]]


## Building new pipelines with new hyperparameters

In [42]:
# creating a pipeline using Logistic Regression
pipeline_LR_2 = Pipeline([
    ('clf', LogisticRegression(max_iter=1000))
])

In [43]:
# creating a pipeline using Random Forest Classifier
pipeline_RF_2 = Pipeline([
    ('clf', RandomForestClassifier(criterion='entropy', max_depth=4))
])

In [44]:
# creating a pipeline using Decision Tree Classifier
pipeline_DT_2 = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy', max_depth=6))
])

In [45]:
# fitting the pipeline
pipeline_LR_2.fit(X_train_encoded, y_train_encoded)
pipeline_RF_2.fit(X_train_encoded, y_train_encoded)
pipeline_DT_2.fit(X_train_encoded, y_train_encoded)

In [46]:
scores_LR_2 = cross_val_score(pipeline_LR_2, X_train_encoded, y_train_encoded, scoring='accuracy', cv=5)
scores_RF_2 = cross_val_score(pipeline_RF_2, X_train_encoded, y_train_encoded, scoring='accuracy', cv=5)
scores_DT_2 = cross_val_score(pipeline_DT_2, X_train_encoded, y_train_encoded, scoring='accuracy', cv=5)

In [47]:
# model training accuracy
mean_LR_2 = scores_LR_2.mean()
mean_RF_2 = scores_RF_2.mean()
mean_DT_2 = scores_DT_2.mean()

print("Accuracy (LR):", mean_LR_2)
print("Accuracy (RF):", mean_RF_2)
print("Accuracy (DT):", mean_DT_2)

Accuracy (LR): 0.8174968866749689
Accuracy (RF): 0.8567413864674138
Accuracy (DT): 0.8886550435865503


In [48]:
f1_scores_LR_2 = cross_val_score(pipeline_LR_2, X_train_encoded, y_train_encoded, scoring='f1', cv=5)
f1_scores_RF_2 = cross_val_score(pipeline_RF_2, X_train_encoded, y_train_encoded, scoring='f1', cv=5)
f1_scores_DT_2 = cross_val_score(pipeline_DT_2, X_train_encoded, y_train_encoded, scoring='f1', cv=5)


# model training accuracy
mean_f1_LR_2 = f1_scores_LR_2.mean()
mean_f1_RF_2 = f1_scores_RF_2.mean()
mean_f1_DT_2 = f1_scores_DT_2.mean()

print("F-1 Score (LR):", mean_f1_LR_2)
print("F-1 Score (RF):", mean_f1_RF_2)
print("F-1 Score (DT):", mean_f1_DT_2)

Accuracy (LR): 0.8100596226844583
Accuracy (RF): 0.8378918844953693
Accuracy (DT): 0.877713768115942


### Validating the model

In [49]:
# predicting the outcome for validation set
y_test_predicted_encoded = pipeline_DT_2.predict(X_test_encoded)

In [50]:
# model validation accuracy
accuracy_score(y_test_encoded, y_test_predicted_encoded)

0.8851063829787233

In [55]:
# saving the model pipeline to a file
pickle.dump(pipeline_DT_2, open('pipeline.pkl', 'wb'))

### Making sure testing data matches training/validation data

In [51]:
df_test = pd.read_csv('osteoporosis_testing.csv')

In [52]:
df_test.drop(columns=['Id'], inplace=True)
df_test['Alcohol Consumption'] = df_test['Alcohol Consumption'].where(pd.notna(df_test['Alcohol Consumption']), "None")
df_test['Medical Conditions'] = df_test['Medical Conditions'].where(pd.notna(df_test['Medical Conditions']), "None")
df_test['Medications'] = df_test['Medications'].where(pd.notna(df_test['Medications']), "None")
df_test = pd.get_dummies(df_test, columns=['Gender', 'Hormonal Changes', 'Family History', 'Race/Ethnicity', 'Body Weight', 'Calcium Intake', 'Vitamin D Intake', 'Physical Activity', 'Smoking', 'Alcohol Consumption', 'Medical Conditions', 'Medications', 'Prior Fractures'])

In [53]:
df_test.head()

Unnamed: 0,Age,Osteoporosis,Gender_Female,Gender_Male,Hormonal Changes_Normal,Hormonal Changes_Postmenopausal,Family History_No,Family History_Yes,Race/Ethnicity_African American,Race/Ethnicity_Asian,...,Smoking_Yes,Alcohol Consumption_Moderate,Alcohol Consumption_None,Medical Conditions_Hyperthyroidism,Medical Conditions_None,Medical Conditions_Rheumatoid Arthritis,Medications_Corticosteroids,Medications_None,Prior Fractures_No,Prior Fractures_Yes
0,62,1,False,True,True,False,True,False,False,True,...,True,False,True,False,False,True,True,False,False,True
1,21,1,False,True,True,False,False,True,False,True,...,True,False,True,True,False,False,True,False,True,False
2,18,0,True,False,False,True,True,False,True,False,...,False,True,False,False,False,True,True,False,False,True
3,34,1,True,False,False,True,True,False,True,False,...,False,False,True,True,False,False,False,True,False,True
4,28,1,True,False,True,False,False,True,True,False,...,True,True,False,False,True,False,True,False,True,False


In [56]:
df_test.to_csv('osteoporosis_testing.csv', index=False)

### Saving encoded training/validation data for visualizations

In [58]:
df_encoded.to_csv('osteoporosis_processed_train_val.csv', index=False)