In [39]:
# Import all libraries and dependencies
import pandas as pd
import os
import numpy as np
import scipy as scipy
import matplotlib.pyplot as plt
import matplotlib.figure as figure
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


### This section is to set defualt options for pandas and other libraries

In [40]:
# Set default options for pandas and other libraries
pd.set_option('display.max_colwidth', None)

In [41]:
# Define the column names
# column_names = [
#     'Age', 'Sex', 'HighChol', 'CholCheck', 'BMI', 
#     'Smoker', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits',
#     'Veggies', 'HvyAlcoholConsump', 'GenHlth', 'MentHlth',
#     'PhysHlth', 'DiffWalk', 'Stroke', 'HighBP','Diabetes'
# ]
# Import Excel files
behavioural_raw_df = pd.read_csv("Resources/diabetes_data.csv").dropna()

In [42]:
behavioural_raw_df.head()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
behavioural_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70692 entries, 0 to 70691
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   70692 non-null  float64
 1   Sex                   70692 non-null  float64
 2   HighChol              70692 non-null  float64
 3   CholCheck             70692 non-null  float64
 4   BMI                   70692 non-null  float64
 5   Smoker                70692 non-null  float64
 6   HeartDiseaseorAttack  70692 non-null  float64
 7   PhysActivity          70692 non-null  float64
 8   Fruits                70692 non-null  float64
 9   Veggies               70692 non-null  float64
 10  HvyAlcoholConsump     70692 non-null  float64
 11  GenHlth               70692 non-null  float64
 12  MentHlth              70692 non-null  float64
 13  PhysHlth              70692 non-null  float64
 14  DiffWalk              70692 non-null  float64
 15  Stroke             

In [44]:
behavioural_raw_df.isna().sum()

Age                     0
Sex                     0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Stroke                  0
HighBP                  0
Diabetes                0
dtype: int64

In [45]:
behavioural_raw_df.dropna()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70687,6.0,0.0,1.0,1.0,37.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0
70688,10.0,1.0,1.0,1.0,29.0,1.0,1.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0
70689,13.0,0.0,1.0,1.0,25.0,0.0,1.0,0.0,1.0,0.0,0.0,5.0,15.0,0.0,1.0,0.0,1.0,1.0
70690,11.0,0.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,1.0


In [46]:
behavioural_raw_df = behavioural_raw_df.query("Sex == 1")

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

X = sm.add_constant(behavioural_raw_df)
X
vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['features'] = X.columns
vif.sort_values('VIF Factor', ascending=False)


Unnamed: 0,VIF Factor,features
1,84.259328,Sex
11,1.786781,GenHlth
13,1.677714,PhysHlth
14,1.507763,DiffWalk
17,1.388025,Diabetes
16,1.333183,HighBP
0,1.32227,Age
12,1.209212,MentHlth
6,1.20147,HeartDiseaseorAttack
2,1.173283,HighChol


In [47]:
# Create the labels set `y` and features DataFrame `X`
# generate code for feature selection Model
# Get the features (everything except the "Diabetes" column)
X = behavioural_raw_df.copy().drop(columns=["Diabetes"])
# filter X to only include Sex = 1

X.head()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0
6,13.0,1.0,1.0,1.0,26.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# Get the target column
y = behavioural_raw_df["Diabetes"].values.ravel()
y[0:5]

array([0., 0., 0., 0., 0.])

In [60]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
n_components = 5  # Number of principal components to keep
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA
n_components = 5  # Number of principal components to keep
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train_pca, y_train)

# Make predictions
y_pred = rf_model.predict(X_test_pca)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# K-Fold Cross-Validation
k = 5  # Number of folds
kf = KFold(n_splits=k, random_state=42, shuffle=True)

# Perform K-Fold Cross-Validation
cv_scores = cross_val_score(rf_model, X_train_pca, y_train, cv=kf, scoring='accuracy')

print(f"{k}-Fold Cross-Validation Accuracy Scores:", cv_scores)
print(f"Mean {k}-Fold Cross-Validation Accuracy:", cv_scores.mean())

Accuracy: 0.71355617455896
Classification Report:
               precision    recall  f1-score   support

         0.0       0.72      0.65      0.68      3075
         1.0       0.71      0.77      0.74      3387

    accuracy                           0.71      6462
   macro avg       0.71      0.71      0.71      6462
weighted avg       0.71      0.71      0.71      6462

5-Fold Cross-Validation Accuracy Scores: [0.71096924 0.70535887 0.71638615 0.72451151 0.69891641]
Mean 5-Fold Cross-Validation Accuracy: 0.711228435535255


In [61]:
from sklearn.linear_model import LogisticRegression
# Create a `LogisticRegression` function and assign it 
# to a variable named `logistic_regression_model`.
logistic_regression_model = LogisticRegression(random_state=42)

# Fit the model
logistic_regression_model.fit(X_train_scaled, y_train)

# Score the model
print(f"Training Data Score: {logistic_regression_model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {logistic_regression_model.score(X_test_scaled, y_test)}")

# Generate predictions from the model we just fit
training_predictions = logistic_regression_model.predict(X_train_scaled)

# Convert those predictions (and actual values) to a DataFrame
training_results_df = pd.DataFrame({"Prediction": training_predictions, "Actual": y_train})

# Apply the fitted model to the `test` dataset
testing_predictions = logistic_regression_model.predict(X_test_scaled)

# Save both the test predictions and actual test values to a DataFrame
testing_results = pd.DataFrame({
    "Testing Data Predictions": testing_predictions, 
    "Testing Data Actual Targets": y_test})

Training Data Score: 0.7344838260331218
Testing Data Score: 0.7338285360569483


In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Assuming X_train_scaled, X_test_scaled, y_train, and y_test are already defined

# Logistic Regression Model
logistic_regression_model = LogisticRegression(random_state=42)

# Fit the Logistic Regression model
logistic_regression_model.fit(X_train_scaled, y_train)

# Score the Logistic Regression model
print(f"Logistic Regression Training Data Score: {logistic_regression_model.score(X_train_scaled, y_train)}")
print(f"Logistic Regression Testing Data Score: {logistic_regression_model.score(X_test_scaled, y_test)}")

# Generate predictions from the Logistic Regression model
logistic_training_predictions = logistic_regression_model.predict(X_train_scaled)
logistic_testing_predictions = logistic_regression_model.predict(X_test_scaled)

# Convert Logistic Regression predictions to DataFrames
logistic_training_results_df = pd.DataFrame({"Prediction": logistic_training_predictions, "Actual": y_train})
logistic_testing_results_df = pd.DataFrame({"Prediction": logistic_testing_predictions, "Actual": y_test})

# Random Forest Model
random_forest_model = RandomForestClassifier(random_state=42)

# Fit the Random Forest model
random_forest_model.fit(X_train_scaled, y_train)

# Score the Random Forest model
print(f"Random Forest Training Data Score: {random_forest_model.score(X_train_scaled, y_train)}")
print(f"Random Forest Testing Data Score: {random_forest_model.score(X_test_scaled, y_test)}")

# Generate predictions from the Random Forest model
rf_training_predictions = random_forest_model.predict(X_train_scaled)
rf_testing_predictions = random_forest_model.predict(X_test_scaled)

# Convert Random Forest predictions to DataFrames
rf_training_results_df = pd.DataFrame({"Prediction": rf_training_predictions, "Actual": y_train})
rf_testing_results_df = pd.DataFrame({"Prediction": rf_testing_predictions, "Actual": y_test})

# Display the results
# print("Logistic Regression Training Results:")
# print(logistic_training_results_df.head())
# print("Logistic Regression Testing Results:")
# print(logistic_testing_results_df.head())

# print("Random Forest Training Results:")
# print(rf_training_results_df.head())
# print("Random Forest Testing Results:")
# print(rf_testing_results_df.head())

# Calculate accuracy Scores and confusion matrix and display them
from sklearn.metrics import accuracy_score, confusion_matrix

# Logistic Regression
logistic_training_accuracy = accuracy_score(logistic_training_predictions, y_train)
logistic_testing_accuracy = accuracy_score(logistic_testing_predictions, y_test)
logistic_confusion_matrix = confusion_matrix(y_test, logistic_testing_predictions)

# Random Forest
rf_training_accuracy = accuracy_score(rf_training_predictions, y_train)
rf_testing_accuracy = accuracy_score(rf_testing_predictions, y_test)
rf_confusion_matrix = confusion_matrix(y_test, rf_testing_predictions)

# Display the results
print("Logistic Regression Training Accuracy:", logistic_training_accuracy)
print("Logistic Regression Testing Accuracy:", logistic_testing_accuracy)
print("Logistic Regression Confusion Matrix:")
print(logistic_confusion_matrix)
print("\n")
print("Random Forest Training Accuracy:", rf_training_accuracy)
print("Random Forest Testing Accuracy:", rf_testing_accuracy)
print("Random Forest Confusion Matrix:")
print(rf_confusion_matrix)


Logistic Regression Training Data Score: 0.7344838260331218
Logistic Regression Testing Data Score: 0.7338285360569483
Random Forest Training Data Score: 0.96761337254295
Random Forest Testing Data Score: 0.7104611575363664
Logistic Regression Training Accuracy: 0.7344838260331218
Logistic Regression Testing Accuracy: 0.7338285360569483
Logistic Regression Confusion Matrix:
[[2081  994]
 [ 726 2661]]


Random Forest Training Accuracy: 0.96761337254295
Random Forest Testing Accuracy: 0.7104611575363664
Random Forest Confusion Matrix:
[[1966 1109]
 [ 762 2625]]
