# Machine Learning Models

The following specifications are used in this notebook:

- Use full dataset.
- Drop less important rows.
- Optimize hyperparameters.
- Run Balanced Random Forest Classifier and Boosted Gradient.
- Run models with demographically overrepresented subjects (ages 40-45) both retained and removed.

## Retain Demographically Overrepresented Subjects

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
from pathlib import Path

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [2]:
# Load the data
file_path = "https://raw.githubusercontent.com/kwinterling/FinalProject/main/smoker_data/Resources/train_dataset.csv"
train_df = pd.read_csv(file_path)

# Drop the null columns where all values are null
train_df = train_df.dropna(axis='columns', how='all')

# Drop the null rows
train_df = train_df.dropna()


train_df.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,35,170,85,97.0,0.9,0.9,1,1,118,78,...,70,142,19.8,1,1.0,61,115,125,1,1
1,20,175,110,110.0,0.7,0.9,1,1,119,79,...,71,114,15.9,1,1.1,19,25,30,1,0
2,45,155,65,86.0,0.9,0.9,1,1,110,80,...,57,112,13.7,3,0.6,1090,1400,276,0,0
3,45,165,80,94.0,0.8,0.7,1,1,158,88,...,46,91,16.9,1,0.9,32,36,36,0,0
4,20,165,60,81.0,1.5,0.1,1,1,109,64,...,47,92,14.9,1,1.2,26,28,15,0,0


In [3]:
test_file_path = "https://raw.githubusercontent.com/kwinterling/FinalProject/main/smoker_data/Resources/test_dataset.csv"

test_df = pd.read_csv(file_path)

test_df.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,35,170,85,97.0,0.9,0.9,1,1,118,78,...,70,142,19.8,1,1.0,61,115,125,1,1
1,20,175,110,110.0,0.7,0.9,1,1,119,79,...,71,114,15.9,1,1.1,19,25,30,1,0
2,45,155,65,86.0,0.9,0.9,1,1,110,80,...,57,112,13.7,3,0.6,1090,1400,276,0,0
3,45,165,80,94.0,0.8,0.7,1,1,158,88,...,46,91,16.9,1,0.9,32,36,36,0,0
4,20,165,60,81.0,1.5,0.1,1,1,109,64,...,47,92,14.9,1,1.2,26,28,15,0,0


In [4]:
df = pd.concat([train_df, test_df], axis=0, join="outer")

non_smokers = df[df.smoking==0]
smokers = df[df.smoking==1]
new_df = pd.concat([non_smokers,smokers])
new_df

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
1,20,175,110,110.0,0.7,0.9,1,1,119,79,...,71,114,15.9,1,1.1,19,25,30,1,0
2,45,155,65,86.0,0.9,0.9,1,1,110,80,...,57,112,13.7,3,0.6,1090,1400,276,0,0
3,45,165,80,94.0,0.8,0.7,1,1,158,88,...,46,91,16.9,1,0.9,32,36,36,0,0
4,20,165,60,81.0,1.5,0.1,1,1,109,64,...,47,92,14.9,1,1.2,26,28,15,0,0
6,40,175,90,95.0,0.9,1.0,1,1,130,88,...,39,102,16.5,1,1.0,19,22,19,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38975,30,180,75,85.0,1.5,1.2,1,1,123,71,...,67,107,16.2,1,0.8,23,24,33,0,1
38978,40,170,65,77.0,1.5,1.5,1,1,110,62,...,79,91,16.1,1,0.9,28,43,36,1,1
38981,40,170,105,124.0,0.6,0.5,1,1,141,85,...,48,138,17.1,1,0.8,24,23,35,1,1
38982,40,160,55,75.0,1.5,1.5,1,1,95,69,...,79,116,12.0,1,0.6,24,20,17,0,1


In [5]:
df.dtypes

age                      int64
height(cm)               int64
weight(kg)               int64
waist(cm)              float64
eyesight(left)         float64
eyesight(right)        float64
hearing(left)            int64
hearing(right)           int64
systolic                 int64
relaxation               int64
fasting blood sugar      int64
Cholesterol              int64
triglyceride             int64
HDL                      int64
LDL                      int64
hemoglobin             float64
Urine protein            int64
serum creatinine       float64
AST                      int64
ALT                      int64
Gtp                      int64
dental caries            int64
smoking                  int64
dtype: object

In [6]:
# Drop the non-beneficial ID columns.
# new_df = df.drop(['waist(cm)', 'height(cm)', 'weight(kg)', 'eyesight(left)', 'eyesight(right)', 'hearing(left)', 'hearing(right)', 'Urine protein', 'dental caries', 'relaxation', 'AST', 'age', 'systolic'], axis=1)
# df = df[(df["age"] < 40) | (df["age"] > 45)]
new_df = df.drop(['eyesight(left)', 'height(cm)', 'eyesight(right)', 'hearing(left)', 'hearing(right)', 'Urine protein', 'dental caries', 'relaxation', 'AST', 'age', 'systolic'], axis=1)
# new_df = df
new_df.head()

Unnamed: 0,weight(kg),waist(cm),fasting blood sugar,Cholesterol,triglyceride,HDL,LDL,hemoglobin,serum creatinine,ALT,Gtp,smoking
0,85,97.0,97,239,153,70,142,19.8,1.0,115,125,1
1,110,110.0,88,211,128,71,114,15.9,1.1,25,30,0
2,65,86.0,80,193,120,57,112,13.7,0.6,1400,276,0
3,80,94.0,249,210,366,46,91,16.9,0.9,36,36,0
4,60,81.0,100,179,200,47,92,14.9,1.2,28,15,0


In [7]:
X = new_df.drop(columns="smoking")
y = pd.DataFrame(new_df["smoking"])

In [8]:
X.describe()

Unnamed: 0,weight(kg),waist(cm),fasting blood sugar,Cholesterol,triglyceride,HDL,LDL,hemoglobin,serum creatinine,ALT,Gtp
count,77968.0,77968.0,77968.0,77968.0,77968.0,77968.0,77968.0,77968.0,77968.0,77968.0,77968.0
mean,65.938718,82.062115,99.342269,196.883491,126.749461,57.293146,115.081495,14.624264,0.88603,27.145188,39.905038
std,12.896498,9.326738,20.642609,36.353711,71.802682,14.617729,42.882888,1.566518,0.22062,31.309744,49.693524
min,30.0,51.0,46.0,55.0,8.0,4.0,1.0,4.9,0.1,1.0,2.0
25%,55.0,76.0,89.0,172.0,74.0,47.0,91.0,13.6,0.8,15.0,17.0
50%,65.0,82.0,96.0,195.0,108.0,55.0,113.0,14.8,0.9,21.0,26.0
75%,75.0,88.0,104.0,219.0,160.0,66.0,136.0,15.8,1.0,31.0,44.0
max,135.0,129.0,423.0,445.0,999.0,359.0,1860.0,21.1,11.6,2914.0,999.0


In [9]:
y['smoking'].value_counts()

0    49332
1    28636
Name: smoking, dtype: int64

In [10]:
from collections import Counter
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
X_train.shape

(58476, 11)

In [11]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(58476, 11)
(19492, 11)
(58476, 1)
(19492, 1)


In [12]:
Counter(y_train)

Counter({'smoking': 1})

In [13]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification

In [14]:
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=200, random_state=1)
brf

BalancedRandomForestClassifier(n_estimators=200, random_state=1)

In [15]:
brf.fit(X_train_scaled, y_train.values.ravel())
y_pred = brf.predict(X_test_scaled)

In [16]:
# rf = BalancedRandomForestClassifier()
#parameters = {
#    "n_estimators": [5, 20, 40, 80, 100, 200],
#    "max_depth": [2, 4, 8, 16, 32, None]
# }

# cv = GridSearchCV(rf, parameters, cv=5)
# cv.fit(X_train.values, y_train.values.ravel())

GridSearchCV(cv=5, estimator=BalancedRandomForestClassifier(),
             param_grid={'max_depth': [2, 4, 8, 16, 32, None],
                         'n_estimators': [5, 20, 40, 80, 100, 200]})

In [17]:
# print(cv.best_params_)

{'max_depth': None, 'n_estimators': 200}


In [16]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = brf.predict(X_test_scaled)
y_pred

array([0, 1, 0, ..., 0, 0, 1])

In [17]:
balanced_accuracy_score(y_test, y_pred)

0.9444427050151829

In [18]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[11510,   850],
       [  302,  6830]])

In [19]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.93      0.96      0.95      0.94      0.89     12360
          1       0.89      0.96      0.93      0.92      0.94      0.89      7132

avg / total       0.94      0.94      0.95      0.94      0.94      0.89     19492



In [20]:
# List the features sorted in descending order by feature importance
importances = brf.feature_importances_
importances
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

[(0.18652654544757502, 'hemoglobin'),
 (0.13038609471553217, 'Gtp'),
 (0.09488082489734834, 'triglyceride'),
 (0.07889326549937489, 'waist(cm)'),
 (0.0777369090245928, 'Cholesterol'),
 (0.0777030952192227, 'LDL'),
 (0.07477504968794989, 'ALT'),
 (0.07359169060601527, 'HDL'),
 (0.07287396197601541, 'fasting blood sugar'),
 (0.06925633191009649, 'weight(kg)'),
 (0.06337623101627705, 'serum creatinine')]

In [21]:
from sklearn.ensemble import GradientBoostingClassifier

xgb = GradientBoostingClassifier()

xgb.fit(X_train_scaled, y_train.values.ravel())
y_pd = xgb.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pd)


0.7099789997948986

In [22]:
confusion_matrix(y_test, y_pd)

array([[9908, 2452],
       [2722, 4410]])

In [23]:
print(classification_report_imbalanced(y_test, y_pd))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.78      0.80      0.62      0.79      0.70      0.50     12360
          1       0.64      0.62      0.80      0.63      0.70      0.49      7132

avg / total       0.73      0.73      0.69      0.73      0.70      0.50     19492



## Drop Demographically Overrepresented Subjects

In [24]:
# Load the data
file_path = "https://raw.githubusercontent.com/kwinterling/FinalProject/main/smoker_data/Resources/train_dataset.csv"
train_df = pd.read_csv(file_path)

# Drop the null columns where all values are null
train_df = train_df.dropna(axis='columns', how='all')

# Drop the null rows
train_df = train_df.dropna()


train_df.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,35,170,85,97.0,0.9,0.9,1,1,118,78,...,70,142,19.8,1,1.0,61,115,125,1,1
1,20,175,110,110.0,0.7,0.9,1,1,119,79,...,71,114,15.9,1,1.1,19,25,30,1,0
2,45,155,65,86.0,0.9,0.9,1,1,110,80,...,57,112,13.7,3,0.6,1090,1400,276,0,0
3,45,165,80,94.0,0.8,0.7,1,1,158,88,...,46,91,16.9,1,0.9,32,36,36,0,0
4,20,165,60,81.0,1.5,0.1,1,1,109,64,...,47,92,14.9,1,1.2,26,28,15,0,0


In [25]:
df = df[(df["age"] < 40) | (df["age"] > 45)]

new_df = df.drop(['eyesight(left)', 'height(cm)', 'eyesight(right)', 'hearing(left)', 'hearing(right)', 'Urine protein', 'dental caries', 'relaxation', 'AST', 'age', 'systolic'], axis=1)
# new_df = df
new_df.head()

Unnamed: 0,weight(kg),waist(cm),fasting blood sugar,Cholesterol,triglyceride,HDL,LDL,hemoglobin,serum creatinine,ALT,Gtp,smoking
0,85,97.0,97,239,153,70,142,19.8,1.0,115,125,1
1,110,110.0,88,211,128,71,114,15.9,1.1,25,30,0
4,60,81.0,100,179,200,47,92,14.9,1.2,28,15,0
5,50,78.0,114,177,74,98,64,13.9,1.0,23,70,1
11,50,72.0,83,135,35,59,69,12.5,0.9,12,11,0


In [26]:
X = new_df.drop(columns="smoking")
y = pd.DataFrame(new_df["smoking"])

In [27]:
X.describe()

Unnamed: 0,weight(kg),waist(cm),fasting blood sugar,Cholesterol,triglyceride,HDL,LDL,hemoglobin,serum creatinine,ALT,Gtp
count,46742.0,46742.0,46742.0,46742.0,46742.0,46742.0,46742.0,46742.0,46742.0,46742.0,46742.0
mean,66.797955,83.108964,100.080014,195.551538,127.303924,56.354499,114.542467,14.854033,0.906769,28.280818,41.023405
std,12.784073,8.842484,20.622048,37.318825,69.79068,14.292623,43.554985,1.371719,0.224226,33.485914,51.611985
min,30.0,51.0,46.0,77.0,8.0,4.0,1.0,6.3,0.1,1.0,2.0
25%,60.0,77.0,89.0,170.0,76.0,46.0,90.0,13.9,0.8,16.0,18.0
50%,65.0,83.0,96.0,193.0,110.0,55.0,112.0,15.0,0.9,22.0,27.0
75%,75.0,88.9,105.0,219.0,160.0,64.0,136.0,15.8,1.0,32.0,44.0
max,135.0,129.0,398.0,445.0,466.0,359.0,1860.0,20.9,10.0,2914.0,999.0


In [28]:
y['smoking'].value_counts()

0    29338
1    17404
Name: smoking, dtype: int64

In [29]:
from collections import Counter
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
X_train.shape

(35056, 11)

In [30]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(35056, 11)
(11686, 11)
(35056, 1)
(11686, 1)


In [31]:
brf = BalancedRandomForestClassifier(n_estimators=200, random_state=1)
brf

BalancedRandomForestClassifier(n_estimators=200, random_state=1)

In [32]:
brf.fit(X_train_scaled, y_train.values.ravel())

BalancedRandomForestClassifier(n_estimators=200, random_state=1)

In [33]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = brf.predict(X_test_scaled)
y_pred

array([0, 1, 1, ..., 1, 1, 1])

In [34]:
balanced_accuracy_score(y_test, y_pred)

0.942499820781177

In [35]:
confusion_matrix(y_test, y_pred)

array([[6925,  478],
       [ 216, 4067]])

In [36]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.94      0.95      0.95      0.94      0.89      7403
          1       0.89      0.95      0.94      0.92      0.94      0.89      4283

avg / total       0.94      0.94      0.94      0.94      0.94      0.89     11686



In [37]:
# List the features sorted in descending order by feature importance
importances = brf.feature_importances_
importances
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

[(0.15711860848802883, 'hemoglobin'),
 (0.12505496656687518, 'Gtp'),
 (0.09674584324015802, 'triglyceride'),
 (0.0860303906956415, 'LDL'),
 (0.08510748338788936, 'Cholesterol'),
 (0.08266455717255511, 'ALT'),
 (0.08259440320834166, 'waist(cm)'),
 (0.08054060386250184, 'fasting blood sugar'),
 (0.07887391567655984, 'HDL'),
 (0.06447696001518943, 'weight(kg)'),
 (0.06079226768625938, 'serum creatinine')]

In [38]:
from sklearn.ensemble import GradientBoostingClassifier

xgb = GradientBoostingClassifier()

xgb.fit(X_train_scaled, y_train.values.ravel())
y_pd = xgb.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pd)

0.6898596428825653

In [39]:
confusion_matrix(y_test, y_pd)

array([[6026, 1377],
       [1860, 2423]])

In [40]:
print(classification_report_imbalanced(y_test, y_pd))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.76      0.81      0.57      0.79      0.68      0.47      7403
          1       0.64      0.57      0.81      0.60      0.68      0.45      4283

avg / total       0.72      0.72      0.66      0.72      0.68      0.46     11686

