In [1]:
import data_outcomes as data

# Dependencies
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, roc_auc_score
import statsmodels.api as sm

import pandas as pd
import datetime
import numpy as np
from scipy.stats import zscore
import scipy.stats as stats

In [2]:
df = data.getDiabetesDataDataframe()
display(df.head())

----> Retrieving information for Resources/diabetes.csv
----> Renaming DiabetesPedigreeFunction column to FamilyHistory
----> Mean will be substituted for 0 values
----> 121 substituted for 0 values in Glucose
----> 72 substituted for 0 values in BloodPressure
----> 29 substituted for 0 values in SkinThickness
----> 155 substituted for 0 values in Insulin
----> 32 substituted for 0 values in BMI


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,FamilyHistory,Age,Outcome
0,6,148,72,35,155,33,0.627,50,1
1,1,85,66,29,155,26,0.351,31,0
2,8,183,64,29,155,23,0.672,32,1
3,1,89,66,23,94,28,0.167,21,0
4,0,137,40,35,168,43,2.288,33,1


In [3]:
dfb = data.getDiabetesBehaviorDataframe()
display(dfb.head())

----> Retrieving information for Resources/diabetes_data.csv


Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,1.0,18.0,0.0,0.0,1.0,1.0,1.0,0.0,2.0,7.0,0.0,0.0,0.0,0.0,0.0
8,3.0,0.0,0.0,1.0,32.0,0.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
10,12.0,0.0,1.0,1.0,24.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,4.0,0.0,0.0,1.0,0.0
14,10.0,0.0,1.0,1.0,29.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [4]:
# Split training and testing sets
# Create the features DataFrame, X
X = dfb.copy()
X = X.drop(columns=['Diabetes'])
y = dfb['Diabetes']

In [5]:
display(X.head())

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,1.0,18.0,0.0,0.0,1.0,1.0,1.0,0.0,2.0,7.0,0.0,0.0,0.0,0.0
8,3.0,0.0,0.0,1.0,32.0,0.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
10,12.0,0.0,1.0,1.0,24.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,4.0,0.0,0.0,1.0
14,10.0,0.0,1.0,1.0,29.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [6]:


# Use train_test_split to separate the data
X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a `LogisticRegression` function and assign it 
# to a variable named `logistic_regression_model`.
logistic_regression_model = LogisticRegression()

# Fit the model
logistic_regression_model.fit(X_train_scaled, y_train)

# Score the model
print(f"Training Data Score: {logistic_regression_model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {logistic_regression_model.score(X_test_scaled, y_test)}")

# Generate predictions from the model we just fit
training_predictions = logistic_regression_model.predict(X_train_scaled)

# Convert those predictions (and actual values) to a DataFrame
training_results_df = pd.DataFrame({"Prediction": training_predictions, "Actual": y_train})

# Apply the fitted model to the `test` dataset
testing_predictions = logistic_regression_model.predict(X_test_scaled)

# Save both the test predictions and actual test values to a DataFrame
testing_results = pd.DataFrame({
    "Testing Data Predictions": testing_predictions, 
    "Testing Data Actual Targets": y_test})


Training Data Score: 0.7594567369481399
Testing Data Score: 0.7633635511097218


In [7]:
#lr = sm.Logit(y_train, X_train_scaled).fit(method='bfgs')
#lr.pvalues.sort_values()
#lr.pvalues

In [8]:
accuracy = accuracy_score(y_test, testing_predictions)
confusion = confusion_matrix(y_test, testing_predictions, labels=[1,0])
classification = classification_report(y_test, testing_predictions, labels = [1, 0])
balanced_accuracy = balanced_accuracy_score(y_test, testing_predictions)
r2 = r2_score(y_test, testing_predictions)
cross_val = cross_val_score(LogisticRegression(), X_train_scaled, y_train, scoring = "r2")


In [11]:
# Provided code to create the adjusted r-squared function
def r2_adj(x, y, model):
    r2 = model.score(x,y)
    n_cols = x.shape[1]
    return 1 - (1 - r2) * (len(y) - 1) / (len(y) - n_cols - 1)
adj_score = r2_adj(X_test_scaled, y_test, logistic_regression_model)

In [12]:
print(f'accuracy: {accuracy}')
print(f'confusion: {confusion}')
print(f'classification: {classification}')
print(f'balanced_accuracy: {balanced_accuracy}')
print(f'r2: {r2}')
print(f'r2_adj: {adj_score}')
print(f'cross_val: {cross_val}')

accuracy: 0.7633635511097218
confusion: [[3508 1101]
 [1170 3818]]
classification:               precision    recall  f1-score   support

           1       0.75      0.76      0.76      4609
           0       0.78      0.77      0.77      4988

    accuracy                           0.76      9597
   macro avg       0.76      0.76      0.76      9597
weighted avg       0.76      0.76      0.76      9597

balanced_accuracy: 0.7632782988132246
r2: 0.05197568545067932
r2_adj: 0.7629435887304405
cross_val: [ 0.04807089 -0.0027265   0.07587861  0.03760551  0.00613222]
