# Importing Dataset and Libraries

In [None]:
import math
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [None]:
# diabetes_data.csv feeds both the all-round frame and the diabetes-only
# frame, so parse it once and take two column subsets instead of reading the
# same file from disk twice.
_diabetes_raw = pd.read_csv('diabetes_data.csv')

# All-round frame: shared lifestyle/health columns plus the three targets
# (Stroke, HighBP, Diabetes) that the all-round models predict.
allRound_df = _diabetes_raw[['Age', 'Sex', 'Smoker', 'HeartDiseaseorAttack', 'PhysActivity', 'GenHlth', 'MentHlth', 'PhysHlth', 'Stroke', 'HighBP', 'Diabetes']]

stroke_df = pd.read_csv('stroke_data.csv')

# Diabetes-only frame: a different column subset of the same file, with
# Diabetes as the single target.
diabetes_df = _diabetes_raw[['Age', 'Sex', 'HighChol', 'CholCheck', 'BMI', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'Diabetes']]

hypertension_df = pd.read_csv('hypertension_data.csv')

# All-Round Prediction

In [None]:
# Plain-text preview of the first five rows of the all-round frame.
print(allRound_df.head(n=5))

    Age  Sex  Smoker  HeartDiseaseorAttack  PhysActivity  GenHlth  MentHlth  \
0   4.0  1.0     0.0                   0.0           1.0      3.0       5.0   
1  12.0  1.0     1.0                   0.0           0.0      3.0       0.0   
2  13.0  1.0     0.0                   0.0           1.0      1.0       0.0   
3  11.0  1.0     1.0                   0.0           1.0      3.0       0.0   
4   8.0  0.0     1.0                   0.0           1.0      2.0       0.0   

   PhysHlth  Stroke  HighBP  Diabetes  
0      30.0     0.0     1.0       0.0  
1       0.0     1.0     1.0       0.0  
2      10.0     0.0     0.0       0.0  
3       3.0     0.0     1.0       0.0  
4       0.0     0.0     0.0       0.0  


In [None]:
# Correlation of every column with each of the three prediction targets.
allRound_df.corr().loc[:, ['Stroke', 'HighBP', 'Diabetes']]

Unnamed: 0,Stroke,HighBP,Diabetes
Age,0.123879,0.338132,0.278738
Sex,0.003822,0.040819,0.044413
Smoker,0.064658,0.087438,0.085999
HeartDiseaseorAttack,0.223394,0.21075,0.211523
PhysActivity,-0.079985,-0.136102,-0.158666
GenHlth,0.189447,0.32054,0.407612
MentHlth,0.087303,0.064294,0.087029
PhysHlth,0.164488,0.173922,0.213081
Stroke,1.0,0.12906,0.125427
HighBP,0.12906,1.0,0.381516


In [None]:
# Count the missing values per column and report them.
null_counts = allRound_df.isna().sum()
print("\nNull counts in each column:", null_counts, sep="\n")

# Keep only the rows that have a value in every column.
allRound_df = allRound_df.dropna(axis=0, how="any")


Null counts in each column:
Age                     0
Sex                     0
Smoker                  0
HeartDiseaseorAttack    0
PhysActivity            0
GenHlth                 0
MentHlth                0
PhysHlth                0
Stroke                  0
HighBP                  0
Diabetes                0
dtype: int64


**Min-max Normalization:**

In [None]:
# Min-max scaling: shift each column by its minimum and divide by its range.
# The target columns are rescaled together with the features.
# NOTE(review): min/max are taken over all rows before the train/test split
# that follows, so test rows influence the scaling - confirm this is acceptable.
allRound_df = (
    allRound_df
    .sub(allRound_df.min())
    .div(allRound_df.max() - allRound_df.min())
)

**Train-test split:**

In [None]:
# 80/20 split with a fixed seed. Features are every column except the three
# targets; the three targets are kept together in one frame so that each model
# can pick its own label column later.
common_X_train, common_X_test, common_y_train, common_y_test = train_test_split(
    allRound_df.drop(columns=['Stroke', 'Diabetes', 'HighBP']),
    allRound_df[['Diabetes', 'Stroke', 'HighBP']],
    test_size=0.2,
    random_state=1,
)

# Model Selection based on Accuracies:

In [None]:
# Compare five candidate classifiers on the Diabetes target using accuracy on
# the held-out test split. Every estimator that uses randomness is given a
# fixed random_state so the scores - and therefore the model choice made from
# them - are reproducible across kernel restarts.
commonDiabetesCLF1 = LogisticRegression(random_state=0).fit(common_X_train, common_y_train['Diabetes'])
print("Logistic Regression Prediction Score: ", commonDiabetesCLF1.score(common_X_test, common_y_test['Diabetes']))

# NOTE(review): max_iter=5 is far below scikit-learn's default and may stop
# before convergence; it is kept as-is so the comparison is unchanged, but
# consider raising it before drawing conclusions from this score.
commonDiabetesCLF2 = SGDClassifier(loss="hinge", penalty="l2", max_iter=5, random_state=0).fit(common_X_train, common_y_train['Diabetes'])
print("SGDClassifier Prediction Score: ", commonDiabetesCLF2.score(common_X_test, common_y_test['Diabetes']))

# GaussianNB has no random component, so it needs no seed.
commonDiabetesCLF3 = GaussianNB().fit(common_X_train, common_y_train['Diabetes'])
print("Gaussian Naive Bayes Prediction Score: ", commonDiabetesCLF3.score(common_X_test, common_y_test['Diabetes']))

commonDiabetesCLF4 = tree.DecisionTreeClassifier(random_state=0).fit(common_X_train, common_y_train['Diabetes'])
print("Decision Tree Classifier Prediction Score: ", commonDiabetesCLF4.score(common_X_test, common_y_test['Diabetes']))

commonDiabetesCLF5 = RandomForestClassifier(n_estimators=10, random_state=0).fit(common_X_train, common_y_train['Diabetes'])
print("Random Forest Classifier Prediction Score: ", commonDiabetesCLF5.score(common_X_test, common_y_test['Diabetes']))

Logistic Regression Prediction Score:  0.7115779050852252
SGDClassifier Prediction Score:  0.7152556757903671
Gaussian Naive Bayes Prediction Score:  0.6648277813140958




Decision Tree Classifier Prediction Score:  0.6762147252280926
Random Forest Classifier Prediction Score:  0.6897941862932315


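**As observed above, Logistic Regression and the SGD classifier achieve the highest accuracy scores; in the run shown, the SGD classifier is marginally ahead (0.715 vs. 0.712), so the two are effectively tied. Since the difference is negligible, we use Logistic Regression as the final classifier for all of the classification tasks ahead.**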

**Final Classification Models:**

In [None]:
# Fit one logistic-regression model per target on the shared feature matrix.
# The models are fitted in the order Diabetes, Stroke, HighBP.
commonDiabetesCLF, commonStrokeCLF, commonHighBPCLF = (
    LogisticRegression(random_state=0).fit(common_X_train, common_y_train[target_col])
    for target_col in ('Diabetes', 'Stroke', 'HighBP')
)

In [None]:
# Report held-out accuracy for each all-round model against its own target.
for _label, _model, _target_col in (
    ("All-Round Diabetes Prediction Score: ", commonDiabetesCLF, 'Diabetes'),
    ("All-Round Stroke Prediction Score: ", commonStrokeCLF, 'Stroke'),
    ("All-Round Hypertension Prediction Score: ", commonHighBPCLF, 'HighBP'),
):
    print(_label, _model.score(common_X_test, common_y_test[_target_col]))

All-Round Diabetes Prediction Score:  0.7115779050852252
All-Round Stroke Prediction Score:  0.9383266143291604
All-Round Hypertension Prediction Score:  0.704575995473513


# Stroke Prediction

In [None]:
# Plain-text preview of the first five rows of the stroke dataset.
print(stroke_df.head(n=5))

   sex   age  hypertension  heart_disease  ever_married  work_type  \
0  1.0  63.0             0              1             1          4   
1  1.0  42.0             0              1             1          4   
2  0.0  61.0             0              0             1          4   
3  1.0  41.0             1              0             1          3   
4  1.0  85.0             0              0             1          4   

   Residence_type  avg_glucose_level   bmi  smoking_status  stroke  
0               1             228.69  36.6               1       1  
1               0             105.92  32.5               0       1  
2               1             171.23  34.4               1       1  
3               0             174.12  24.0               0       1  
4               1             186.21  29.0               1       1  


In [None]:
# Correlation of every column with the stroke target.
stroke_df.corr().loc[:, 'stroke']

sex                 -0.111036
age                  0.058534
hypertension         0.257182
heart_disease        0.223930
ever_married         0.181656
work_type            0.026494
Residence_type       0.011682
avg_glucose_level    0.265452
bmi                  0.018326
smoking_status       0.068368
stroke               1.000000
Name: stroke, dtype: float64

In [None]:
# Count the missing values per column and report them.
null_counts = stroke_df.isna().sum()
print("\nNull counts in each column:", null_counts, sep="\n")

# Keep only the rows that have a value in every column.
stroke_df = stroke_df.dropna(axis=0, how="any")


Null counts in each column:
sex                  3
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64


**Min-max Normalization:**

In [None]:
# Min-max scaling: shift each column by its minimum and divide by its range.
# The target column is rescaled together with the features.
# NOTE(review): min/max are taken over all rows before the train/test split
# that follows, so test rows influence the scaling - confirm this is acceptable.
stroke_df = (
    stroke_df
    .sub(stroke_df.min())
    .div(stroke_df.max() - stroke_df.min())
)

**Train-test split:**

In [None]:
# 70/30 split with a fixed seed; 'stroke' is the label and every other column
# is a feature.
stroke_X_train, stroke_X_test, stroke_y_train, stroke_y_test = train_test_split(
    stroke_df.drop(columns=['stroke']),
    stroke_df['stroke'],
    test_size=0.3,
    random_state=42,
)

**Classification Model:**

In [None]:
# Fit the stroke-specific logistic-regression model on the training split.
strokeCLF = LogisticRegression(random_state=0)
strokeCLF.fit(stroke_X_train, stroke_y_train)

# Report mean accuracy on the held-out split.
print("Stroke Prediction Score: ", strokeCLF.score(stroke_X_test, stroke_y_test))

Stroke Prediction Score:  0.6821478041228713


# Diabetes Prediction

In [None]:
# Plain-text preview of the first five rows of the diabetes-only frame.
print(diabetes_df.head(n=5))

    Age  Sex  HighChol  CholCheck   BMI  Fruits  Veggies  HvyAlcoholConsump  \
0   4.0  1.0       0.0        1.0  26.0     0.0      1.0                0.0   
1  12.0  1.0       1.0        1.0  26.0     1.0      0.0                0.0   
2  13.0  1.0       0.0        1.0  26.0     1.0      1.0                0.0   
3  11.0  1.0       1.0        1.0  28.0     1.0      1.0                0.0   
4   8.0  0.0       0.0        1.0  29.0     1.0      1.0                0.0   

   Diabetes  
0       0.0  
1       0.0  
2       0.0  
3       0.0  
4       0.0  


In [None]:
# Correlation of every column with the Diabetes target.
diabetes_df.corr().loc[:, 'Diabetes']

Age                  0.278738
Sex                  0.044413
HighChol             0.289213
CholCheck            0.115382
BMI                  0.293373
Fruits              -0.054077
Veggies             -0.079293
HvyAlcoholConsump   -0.094853
Diabetes             1.000000
Name: Diabetes, dtype: float64

In [None]:
# Count the missing values per column and report them.
null_counts = diabetes_df.isna().sum()
print("\nNull counts in each column:", null_counts, sep="\n")

# Keep only the rows that have a value in every column.
diabetes_df = diabetes_df.dropna(axis=0, how="any")


Null counts in each column:
Age                  0
Sex                  0
HighChol             0
CholCheck            0
BMI                  0
Fruits               0
Veggies              0
HvyAlcoholConsump    0
Diabetes             0
dtype: int64


**Min-max Normalization:**

In [None]:
# Min-max scaling: shift each column by its minimum and divide by its range.
# The target column is rescaled together with the features.
# NOTE(review): min/max are taken over all rows before the train/test split
# that follows, so test rows influence the scaling - confirm this is acceptable.
diabetes_df = (
    diabetes_df
    .sub(diabetes_df.min())
    .div(diabetes_df.max() - diabetes_df.min())
)

**Train-test split:**

In [None]:
# 70/30 split with a fixed seed; 'Diabetes' is the label and every other
# column is a feature.
diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(
    diabetes_df.drop(columns=['Diabetes']),
    diabetes_df['Diabetes'],
    test_size=0.3,
    random_state=42,
)

**Classification Model:**

In [None]:
# Fit the diabetes-specific logistic-regression model on the training split.
diabetesCLF = LogisticRegression(random_state=0)
diabetesCLF.fit(diabetes_X_train, diabetes_y_train)

# Report mean accuracy on the held-out split.
print("Diabetes Prediction Score: ", diabetesCLF.score(diabetes_X_test, diabetes_y_test))

Diabetes Prediction Score:  0.7032723500565824


# Hypertension Prediction

In [None]:
# Plain-text preview of the first five rows of the hypertension dataset.
print(hypertension_df.head(n=5))

    age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0  57.0  1.0   3       145   233    1        0      150      0      2.3   
1  64.0  0.0   2       130   250    0        1      187      0      3.5   
2  52.0  1.0   1       130   204    0        0      172      0      1.4   
3  56.0  0.0   1       120   236    0        1      178      0      0.8   
4  66.0  0.0   0       120   354    0        1      163      1      0.6   

   slope  ca  thal  target  
0      0   0     1       1  
1      0   0     2       1  
2      2   0     2       1  
3      2   0     2       1  
4      2   0     2       1  


In [None]:
# Correlation of every column with the 'target' label column.
hypertension_df.corr().loc[:, 'target']

age        -2.354715e-02
sex         4.951044e-16
cp          4.338210e-01
trestbps   -1.485082e-01
chol       -8.211748e-02
fbs        -3.358049e-02
restecg     1.360681e-01
thalach     4.190405e-01
exang      -4.381167e-01
oldpeak    -4.381349e-01
slope       3.498362e-01
ca         -4.059577e-01
thal       -3.579344e-01
target      1.000000e+00
Name: target, dtype: float64

In [None]:
# Count the missing values per column and report them.
null_counts = hypertension_df.isna().sum()
print("\nNull counts in each column:", null_counts, sep="\n")

# Keep only the rows that have a value in every column.
hypertension_df = hypertension_df.dropna(axis=0, how="any")


Null counts in each column:
age          0
sex         25
cp           0
trestbps     0
chol         0
fbs          0
restecg      0
thalach      0
exang        0
oldpeak      0
slope        0
ca           0
thal         0
target       0
dtype: int64


**Min-max Normalization:**

In [None]:
# Min-max scaling: shift each column by its minimum and divide by its range.
# The target column is rescaled together with the features.
# NOTE(review): min/max are taken over all rows before the train/test split
# that follows, so test rows influence the scaling - confirm this is acceptable.
hypertension_df = (
    hypertension_df
    .sub(hypertension_df.min())
    .div(hypertension_df.max() - hypertension_df.min())
)

**Train-test split:**

In [None]:
# 70/30 split with a fixed seed; 'target' is the label and every other column
# is a feature.
hypertension_X_train, hypertension_X_test, hypertension_y_train, hypertension_y_test = train_test_split(
    hypertension_df.drop(columns=['target']),
    hypertension_df['target'],
    test_size=0.3,
    random_state=42,
)

**Classification Model:**

In [None]:
# Fit the hypertension-specific logistic-regression model on the training
# split. It is named like its siblings (strokeCLF, diabetesCLF) so it is not
# accidentally overwritten by a later generic `clf`.
hypertensionCLF = LogisticRegression(random_state=0).fit(hypertension_X_train, hypertension_y_train)
# Backward-compatible alias for any code that still refers to `clf`.
clf = hypertensionCLF

# Report mean accuracy on the held-out split.
print("Hypertension Prediction Score: ", hypertensionCLF.score(hypertension_X_test, hypertension_y_test))

Hypertension Prediction Score:  0.85597339473011
