In [4]:
#!pip install azure-storage-blob pandas

Collecting azure-storage-blob
  Downloading azure_storage_blob-12.25.1-py3-none-any.whl.metadata (26 kB)
Collecting azure-core>=1.30.0 (from azure-storage-blob)
  Downloading azure_core-1.33.0-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting isodate>=0.6.1 (from azure-storage-blob)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading azure_storage_blob-12.25.1-py3-none-any.whl (406 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m407.0/407.0 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading azure_core-1.33.0-py3-none-any.whl (207 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.1/207.1 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate, azure-core, azure-storage-blob
Successfully installed azure-core-1.33.0 az

In [75]:
!pip install -U kaleido



In [66]:
from azure.storage.blob import BlobServiceClient
import pandas as pd
import io
import plotly.express as px
import numpy as np
import os

In [67]:
STORAGE_ACCOUNT_NAME = os.getenv('AZURE_STORAGE_ACCOUNT')
STORAGE_ACCOUNT_KEY = os.getenv('AZURE_STORAGE_KEY')
CONTAINER_NAME = os.getenv('AZURE_CONTAINER_NAME')

connection_string = f"DefaultEndpointsProtocol=https;AccountName={STORAGE_ACCOUNT_NAME};AccountKey={STORAGE_ACCOUNT_KEY};EndpointSuffix=core.windows.net"

blob_service_client = BlobServiceClient.from_connection_string(connection_string)

container_client = blob_service_client.get_container_client(CONTAINER_NAME)
blob_list = container_client.list_blobs()

print("Files in Blob Storage:")
for blob in blob_list:
    print(blob.name)

In [68]:
container_client = blob_service_client.get_container_client(CONTAINER_NAME)
blob_list = container_client.list_blobs()

print("Files in Blob Storage:")
for blob in blob_list:
    print(blob.name)

Files in Blob Storage:
breast_cancer.csv
chronic_kidney_disease.csv
early_diabetes.csv
heart_disease.csv
liver.csv
maternal_health_risk.csv
obesity.csv


## Breast Cancer

In [69]:
blob_client = container_client.get_blob_client("breast_cancer.csv")

downloaded_blob = blob_client.download_blob()
csv_data = downloaded_blob.readall()

df_breastCancer = pd.read_csv(io.StringIO(csv_data.decode('utf-8')))
print(df_breastCancer.head(10))

   Age        BMI  Glucose  Insulin      HOMA   Leptin  Adiponectin  Resistin  \
0   48  23.500000       70    2.707  0.467409   8.8071     9.702400   7.99585   
1   83  20.690495       92    3.115  0.706897   8.8438     5.429285   4.06405   
2   82  23.124670       91    4.498  1.009651  17.9393    22.432040   9.27715   
3   68  21.367521       77    3.226  0.612725   9.8827     7.169560  12.76600   
4   86  21.111111       92    3.549  0.805386   6.6994     4.819240  10.57635   
5   49  22.854458       92    3.226  0.732087   6.8317    13.679750  10.31760   
6   89  22.700000       77    4.690  0.890787   6.9640     5.589865  12.93610   
7   76  23.800000      118    6.470  1.883201   4.3110    13.251320   5.10420   
8   73  22.000000       97    3.350  0.801543   4.4700    10.358725   6.28445   
9   75  23.000000       83    4.952  1.013839  17.1270    11.578990   7.09130   

      MCP.1  Classification  
0   417.114               1  
1   468.786               1  
2   554.697       

In [70]:
print(df_breastCancer.isna().sum())

Age               0
BMI               0
Glucose           0
Insulin           0
HOMA              0
Leptin            0
Adiponectin       0
Resistin          0
MCP.1             0
Classification    0
dtype: int64


In [71]:
df_breastCancer.drop_duplicates(inplace=True)

## Checking the skewness and normalizing the data


In [78]:
df_check_skewness = df_breastCancer.melt(var_name="Feature", value_name="Value")
fig = px.box(df_check_skewness,
             x="Feature",
             y="Value",
             color="Feature",
             width=1200,
             height=600)

fig.update_layout(xaxis=dict(tickangle=45))
fig.show()

In [12]:
df_breastCancer['MCP.1'] = np.log1p(df_breastCancer['MCP.1'])
df_breastCancer['Insulin'] = np.log1p(df_breastCancer['Insulin'])
print(df_breastCancer['MCP.1'])

0      6.035754
1      6.152277
2      6.320223
3      6.834346
4      6.652760
         ...   
111    5.595566
112    5.802602
113    5.752731
114    5.974979
115    4.511848
Name: MCP.1, Length: 116, dtype: float64


In [13]:
from scipy.stats import skew
print(df_breastCancer['MCP.1'].skew())

-0.5129520962079838


In [14]:
from scipy.stats import boxcox
df_breastCancer['MCP.1'], _ = boxcox(df_breastCancer['MCP.1'] + 1)

In [15]:
print(df_breastCancer['MCP.1'].skew())

-0.006831314822481881


In [16]:
print(df_breastCancer['Insulin'].skew())

0.9704792850895773


In [17]:
df_breastCancer['Insulin'], _ = boxcox(df_breastCancer['Insulin'] + 1)
print(df_breastCancer['Insulin'].skew())

0.08985001616072288


In [18]:
skew_values = df_breastCancer.skew().sort_values(ascending=False)
print(skew_values)

HOMA              3.812087
Glucose           2.593305
Resistin          2.577429
Adiponectin       1.817765
Leptin            1.309536
BMI               0.170152
Insulin           0.089850
Age               0.017832
MCP.1            -0.006831
Classification   -0.210748
dtype: float64


In [19]:
skewed_cols = ['Glucose', 'Leptin', 'HOMA', 'Adiponectin' , 'Resistin']

for col in skewed_cols:
    if (df_breastCancer[col] > 0).all():
        df_breastCancer[col], _ = boxcox(df_breastCancer[col])
    else:
        from sklearn.preprocessing import PowerTransformer
        pt = PowerTransformer(method='yeo-johnson')
        df_breastCancer[col] = pt.fit_transform(df_breastCancer[[col]])

print(df_breastCancer[skewed_cols].skew())

Glucose       -0.131235
Leptin         0.004261
HOMA           0.117326
Adiponectin   -0.000177
Resistin       0.023984
dtype: float64


## After removing the outliers

In [20]:
df_check_skewness = df_breastCancer.melt(var_name="Feature", value_name="Value")
fig = px.box(df_check_skewness,
             x="Feature",
             y="Value",
             color="Feature",
             width=1200,
             height=600)

fig.update_layout(xaxis=dict(tickangle=45))
fig.show()

In [21]:
from sklearn.model_selection import train_test_split

X = df_breastCancer.drop(columns=['Classification'])
y = df_breastCancer['Classification']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [22]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

LR = LogisticRegression()
LR.fit(X_train, y_train)

y_pred = LR.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
print(f'Logistic Regression Accuracy: {lr_accuracy:.4f}')

Logistic Regression Accuracy: 0.7586


In [24]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='gini', max_depth=2, random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f'Decision Tree Accuracy: {dt_accuracy:.4f}')

Decision Tree Accuracy: 0.6552


In [25]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(X_train, y_train)
y_pred = RF.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Accuracy: {rf_accuracy:.4f}')

Random Forest Accuracy: 0.7586


In [26]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=4, random_state=42)
gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
print(f'Gradient Boosting Accuracy: {gb_accuracy:.4f}')


Gradient Boosting Accuracy: 0.7586


In [27]:
from sklearn.svm import SVC
model = SVC()
model.fit(X_train, y_train)
y_pred_svm = model.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f'SVM Accuracy: {svm_accuracy:.4f}')
print(y_pred_svm)

SVM Accuracy: 0.8621
[2 1 1 1 1 1 2 1 2 1 1 1 2 2 1 1 2 2 2 1 2 2 2 1 2 2 2 2 2]


In [28]:
models = ['Logistic Regression', 'Decision Trees', 'Random Forest', 'Gradient Boosting', 'SVM']
accuracy_scores = [lr_accuracy, dt_accuracy, rf_accuracy, gb_accuracy, svm_accuracy]

df = pd.DataFrame({"Model": models, "Accuracy": accuracy_scores})

colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A']

fig = px.bar(df,
             x="Model",
             y="Accuracy",
             title="Comparison of Model Accuracy Scores",
             color="Model",
             color_discrete_map={
                 'Logistic Regression': '#636EFA',
                 'Decision Trees': '#EF553B',
                 'Random Forest': '#00CC96',
                 'Gradient Boosting': '#AB63FA',
                 'SVM': '#FFA15A'
             })
fig.update_layout(
    width=800,
    height=500,
    yaxis=dict(range=[0.4, 0.9]),
    xaxis=dict(tickangle=20),
    xaxis_title="Models",
    yaxis_title="Accuracy Score",
    template="plotly_white",
    font=dict(family="Arial", size=14)
)

fig.show()

##Kidney diseases

In [29]:
blob_client = container_client.get_blob_client("chronic_kidney_disease.csv")

downloaded_blob_kidney = blob_client.download_blob()
csv_data_kidney = downloaded_blob_kidney.readall()

df_kidneydiseases = pd.read_csv(io.StringIO(csv_data_kidney.decode('utf-8')))
print(df_kidneydiseases.head(10))

    age     bp     sg   al   su       rbc        pc         pcc          ba  \
0  48.0   80.0  1.020  1.0  0.0       NaN    normal  notpresent  notpresent   
1   7.0   50.0  1.020  4.0  0.0       NaN    normal  notpresent  notpresent   
2  62.0   80.0  1.010  2.0  3.0    normal    normal  notpresent  notpresent   
3  48.0   70.0  1.005  4.0  0.0    normal  abnormal     present  notpresent   
4  51.0   80.0  1.010  2.0  0.0    normal    normal  notpresent  notpresent   
5  60.0   90.0  1.015  3.0  0.0       NaN       NaN  notpresent  notpresent   
6  68.0   70.0  1.010  0.0  0.0       NaN    normal  notpresent  notpresent   
7  24.0    NaN  1.015  2.0  4.0    normal  abnormal  notpresent  notpresent   
8  52.0  100.0  1.015  3.0  0.0    normal  abnormal     present  notpresent   
9  53.0   90.0  1.020  2.0  0.0  abnormal  abnormal     present  notpresent   

     bgr  ...   pcv     wbcc  rbcc  htn   dm  cad  appet   pe  ane class  
0  121.0  ...  44.0   7800.0   5.2  yes  yes   no   goo

In [30]:
print(df_kidneydiseases.isna().sum())

age        9
bp        12
sg        47
al        46
su        49
rbc      152
pc        65
pcc        4
ba         4
bgr       44
bu        19
sc        17
sod       87
pot       88
hemo      52
pcv       71
wbcc     106
rbcc     131
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
class      0
dtype: int64


In [31]:
num_cols = ['age', 'bp', 'sg' , 'al' , 'su' , 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc']
df_kidneydiseases[num_cols] = df_kidneydiseases[num_cols].fillna(df_kidneydiseases[num_cols].mean())

cat_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
df_kidneydiseases[cat_cols] = df_kidneydiseases[cat_cols].fillna(df_kidneydiseases[cat_cols].mode().iloc[0])

In [32]:
print(df_kidneydiseases.isna().sum())

age      0
bp       0
sg       0
al       0
su       0
rbc      0
pc       0
pcc      0
ba       0
bgr      0
bu       0
sc       0
sod      0
pot      0
hemo     0
pcv      0
wbcc     0
rbcc     0
htn      0
dm       0
cad      0
appet    0
pe       0
ane      0
class    0
dtype: int64


In [33]:
df_skew = df_kidneydiseases.select_dtypes(include=['number'])
skew_values_kidney = df_skew.skew().sort_values(ascending=False)
print(skew_values_kidney)

pot     13.101157
sc       7.673105
bu       2.698754
su       2.629266
bgr      2.130422
wbcc     1.888887
bp       1.629876
al       1.060509
sg      -0.183473
rbcc    -0.223145
hemo    -0.359057
pcv     -0.477800
age     -0.675848
sod     -7.901094
dtype: float64


In [34]:
df_check_skewness = df_kidneydiseases.melt(var_name="Feature", value_name="Value")
fig = px.box(df_check_skewness,
             x="Feature",
             y="Value",
             color="Feature",
             width=1200,
             height=600)

fig.update_layout(xaxis=dict(tickangle=45))
fig.show()

In [35]:
kidney_skewed_cols = ['bp', 'sg' , 'al' , 'su' , 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc']

for col in kidney_skewed_cols:
    if (df_kidneydiseases[col] > 0).all():
        df_kidneydiseases[col], _ = boxcox(df_kidneydiseases[col])
    else:
        from sklearn.preprocessing import PowerTransformer
        pt = PowerTransformer(method='yeo-johnson')
        df_kidneydiseases[col] = pt.fit_transform(df_kidneydiseases[[col]])

print(df_kidneydiseases[kidney_skewed_cols].skew())

bp     -0.018854
sg     -0.032458
al      0.248978
su      1.105646
bgr    -0.134214
bu     -0.025444
sc      0.100012
sod     0.091238
pot    -0.395817
hemo   -0.058426
pcv    -0.050693
wbcc    0.036590
rbcc    0.070667
dtype: float64


In [36]:
df_kidneydiseases['su'] = np.log1p(df_kidneydiseases['su'])
print(df_kidneydiseases['su'].skew())

1.0321124188823934


In [37]:
df_check_skewness = df_kidneydiseases.melt(var_name="Feature", value_name="Value")
fig = px.box(df_check_skewness,
             x="Feature",
             y="Value",
             color="Feature",
             width=1200,
             height=600)

fig.update_layout(xaxis=dict(tickangle=45))
fig.show()

In [38]:
df_kidneydiseases = pd.get_dummies(df_kidneydiseases, columns=['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane'], drop_first=True)

In [39]:
print(df_kidneydiseases.head())

    age        bp        sg        al        su       bgr        bu        sc  \
0  48.0  1.462256  0.024063  0.508398 -0.933877  2.197165  3.181531  0.175504   
1   7.0  1.429473  0.024063  1.571189 -0.933877  2.228176  2.624877 -0.233955   
2  62.0  1.462256  0.010958  1.070153  1.077438  2.356087  3.480962  0.520740   
3  48.0  1.453927  0.005233  1.571189 -0.933877  2.191759  3.522953  1.021409   
4  51.0  1.462256  0.010958  1.070153 -0.933877  2.175474  2.923445  0.313739   

            sod       pot  ...  pc_normal  pcc_present  ba_present  htn_yes  \
0  3.025783e+09  0.759604  ...       True        False       False     True   
1  3.025783e+09  0.759604  ...       True        False       False    False   
2  3.025783e+09  0.759604  ...       True        False       False    False   
3  1.093315e+09  0.587370  ...      False         True       False     True   
4  3.025783e+09  0.759604  ...       True        False       False    False   

   dm_no  dm_yes  cad_yes  appet_poor 

In [40]:
X = df_kidneydiseases.drop(columns=['class'])
y = df_kidneydiseases['class']

#Standardizing the data

In [41]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [42]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)

print("Logistic Regression Accuracy:", lr_accuracy)

Logistic Regression Accuracy: 0.62


In [44]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42
)
dt.fit(X_train, y_train)
y_pred_dt_train = dt.predict(X_train)
y_pred_dt = dt.predict(X_test)

dt_train_acc = accuracy_score(y_pred_dt_train, y_train)
dt_test_acc = accuracy_score(y_test, y_pred_dt)

print(f"Train Accuracy: {dt_train_acc:.4f}")
print(f"Test Accuracy : {dt_test_acc:.4f}")

Train Accuracy: 0.9800
Test Accuracy : 0.9500


In [45]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
print(f'Gradient Boosting Accuracy: {gb_accuracy:.4f}')

Gradient Boosting Accuracy: 0.9300


In [46]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100,
                                min_samples_split=10, min_samples_leaf=7, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf_train = rf.predict(X_train)
y_pred_rf = rf.predict(X_test)

rf_train_acc = accuracy_score(y_pred_rf_train, y_train)
rf_test_acc = accuracy_score(y_test, y_pred_rf)

print(f"Train Accuracy: {rf_train_acc:.4f}")
print(f"Test Accuracy : {rf_test_acc:.4f}")
print(y_pred_rf)

Train Accuracy: 0.9900
Test Accuracy : 0.9500
['ckd' 'ckd' 'ckd' 'ckd' 'notckd' 'ckd' 'notckd' 'ckd' 'notckd' 'ckd'
 'ckd' 'ckd' 'ckd' 'notckd' 'notckd' 'notckd' 'ckd' 'notckd' 'ckd' 'ckd'
 'notckd' 'ckd' 'notckd' 'ckd' 'notckd' 'ckd' 'notckd' 'ckd' 'ckd' 'ckd'
 'notckd' 'ckd' 'ckd' 'notckd' 'notckd' 'ckd' 'ckd' 'notckd' 'ckd'
 'notckd' 'ckd' 'ckd' 'ckd' 'ckd' 'ckd' 'ckd' 'ckd' 'notckd' 'ckd'
 'notckd' 'ckd' 'ckd' 'notckd' 'notckd' 'notckd' 'ckd' 'notckd' 'ckd'
 'ckd' 'ckd' 'notckd' 'notckd' 'notckd' 'ckd' 'ckd' 'ckd' 'ckd' 'notckd'
 'notckd' 'ckd' 'ckd' 'ckd' 'notckd' 'notckd' 'ckd' 'ckd' 'ckd' 'ckd'
 'ckd' 'ckd' 'ckd' 'notckd' 'ckd' 'ckd' 'ckd' 'ckd' 'ckd' 'notckd' 'ckd'
 'ckd' 'notckd' 'notckd' 'ckd' 'ckd' 'ckd' 'ckd' 'ckd' 'notckd' 'ckd'
 'notckd']


In [47]:
from sklearn.svm import SVC
model = SVC()
model.fit(X_train, y_train)
y_pred_svm = model.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f'SVM Accuracy: {svm_accuracy:.4f}')

SVM Accuracy: 0.7800


In [48]:
models = ['Logistic Regression', 'Decision Trees', 'Random Forest', 'Gradient Boosting', 'SVM']
accuracy_scores = [lr_accuracy, dt_test_acc, rf_test_acc, gb_accuracy, svm_accuracy]

df = pd.DataFrame({"Model": models, "Accuracy": accuracy_scores})

colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A']

fig = px.bar(df,
             x="Model",
             y="Accuracy",
             title="Comparison of Model Accuracy Scores",
             color="Model",
             color_discrete_map={
                 'Logistic Regression': '#636EFA',
                 'Decision Trees': '#EF553B',
                 'Random Forest': '#00CC96',
                 'Gradient Boosting': '#AB63FA',
                 'SVM': '#FFA15A'
             })

fig.update_layout(
    width=800,
    height=500,
    yaxis=dict(range=[0.4, 1.0]),
    xaxis=dict(tickangle=20),
    xaxis_title="Models",
    yaxis_title="Accuracy Score",
    template="plotly_white",
    font=dict(family="Arial", size=14)
)

fig.show()

## Diabetes

In [49]:
blob_client = container_client.get_blob_client("early_diabetes.csv")

downloaded_blob_diabetes_early = blob_client.download_blob()
csv_data_diabetes_early = downloaded_blob_diabetes_early.readall()

df_diabetes_early = pd.read_csv(io.StringIO(csv_data_diabetes_early.decode('utf-8')))
print(df_diabetes_early.head(10))

   age gender polyuria polydipsia sudden_weight_loss weakness polyphagia  \
0   40   Male       No        Yes                 No      Yes         No   
1   58   Male       No         No                 No      Yes         No   
2   41   Male      Yes         No                 No      Yes        Yes   
3   45   Male       No         No                Yes      Yes        Yes   
4   60   Male      Yes        Yes                Yes      Yes        Yes   
5   55   Male      Yes        Yes                 No      Yes        Yes   
6   57   Male      Yes        Yes                 No      Yes        Yes   
7   66   Male      Yes        Yes                Yes      Yes         No   
8   67   Male      Yes        Yes                 No      Yes        Yes   
9   70   Male       No        Yes                Yes      Yes        Yes   

  genital_thrush visual_blurring itching irritability delayed_healing  \
0             No              No     Yes           No             Yes   
1             No 

In [50]:
print(df_diabetes_early.isna().sum())

age                   0
gender                0
polyuria              0
polydipsia            0
sudden_weight_loss    0
weakness              0
polyphagia            0
genital_thrush        0
visual_blurring       0
itching               0
irritability          0
delayed_healing       0
partial_paresis       0
muscle_stiffness      0
alopecia              0
obesity               0
class                 0
dtype: int64


In [51]:
df_diabetes_early = pd.get_dummies(df_diabetes_early, drop_first=True).astype(int)

In [52]:
df_diabetes_early.columns = df_diabetes_early.columns.str.replace('_Yes', '')
df_diabetes_early.columns = df_diabetes_early.columns.str.replace('_Male', '')
df_diabetes_early.columns = df_diabetes_early.columns.str.replace('_Positive', '')

In [53]:
df_diabetes_early.head()

Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity,class
0,40,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


In [54]:
df_skew = df_diabetes_early.select_dtypes(include=['number'])
skew_values_diabetes = df_skew.skew().sort_values(ascending=False)
print(skew_values_diabetes)

obesity               1.769420
genital_thrush        1.334223
irritability          1.206305
alopecia              0.657608
muscle_stiffness      0.517893
sudden_weight_loss    0.336359
age                   0.329359
partial_paresis       0.280426
visual_blurring       0.209426
polydipsia            0.209426
polyphagia            0.178134
delayed_healing       0.162537
itching               0.054022
polyuria              0.015430
weakness             -0.352476
class                -0.475715
gender               -0.543509
dtype: float64


In [55]:
df_check_skewness = df_diabetes_early.melt(var_name="Feature", value_name="Value")
fig = px.box(df_check_skewness,
             x="Feature",
             y="Value",
             color="Feature",
             width=1200,
             height=600)

fig.update_layout(xaxis=dict(tickangle=45))
fig.show()

In [56]:
X = df_diabetes_early.drop(columns=['class'])
y = df_diabetes_early['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train['age'] = scaler.fit_transform(X_train[['age']])
X_test['age'] = scaler.transform(X_test[['age']])

In [58]:
LR = LogisticRegression()
LR.fit(X_train, y_train)

y_pred = LR.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
print(f'Logistic Regression Accuracy: {lr_accuracy:.4f}')

Logistic Regression Accuracy: 0.9231


In [59]:
DT = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=42)
DT.fit(X_train, y_train)
y_pred_dt = DT.predict(X_test)

dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f'Decision Tree Accuracy: {dt_accuracy:.4f}')

Decision Tree Accuracy: 0.9519


In [60]:
RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(X_train, y_train)
y_pred_rf = RF.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Accuracy: {rf_accuracy:.4f}')
print(y_pred_rf.shape)

Random Forest Accuracy: 0.9904
(104,)


In [61]:
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
print(f'Gradient Boosting Accuracy: {gb_accuracy:.4f}')

Gradient Boosting Accuracy: 0.9712


In [62]:
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred)
print(f'SVM Accuracy: {svm_accuracy:.4f}')

SVM Accuracy: 0.9712


In [63]:
models = ['Logistic Regression', 'Decision Trees', 'Random Forest', 'Gradient Boosting', 'SVM']
accuracy_scores = [lr_accuracy, dt_accuracy, rf_accuracy, gb_accuracy, svm_accuracy]

df = pd.DataFrame({"Model": models, "Accuracy": accuracy_scores})

colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A']

fig = px.bar(df,
             x="Model",
             y="Accuracy",
             title="Comparison of Model Accuracy Scores",
             color="Model",
             color_discrete_map={
                 'Logistic Regression': '#636EFA',
                 'Decision Trees': '#EF553B',
                 'Random Forest': '#00CC96',
                 'Gradient Boosting': '#AB63FA',
                 'SVM': '#FFA15A'
             })

fig.update_layout(
    width=800,
    height=500,
    yaxis=dict(range=[0.4, 1.0]),
    xaxis=dict(tickangle=20),
    xaxis_title="Models",
    yaxis_title="Accuracy Score",
    template="plotly_white",
    font=dict(family="Arial", size=14)
)

fig.show()

## Heart Disease

In [64]:
blob_client = container_client.get_blob_client("heart_disease.csv")

# Download blob data
downloaded_blob_heart = blob_client.download_blob()
csv_data_heart = downloaded_blob_heart.readall()

# Convert to pandas DataFrame
df_heart = pd.read_csv(io.StringIO(csv_data_heart.decode('utf-8')))
print(df_heart.head(10))

    age  sex  chest-pain  rest-bp  serum-chol  fasting-blood-sugar  \
0  70.0  1.0         4.0    130.0       322.0                  0.0   
1  67.0  0.0         3.0    115.0       564.0                  0.0   
2  57.0  1.0         2.0    124.0       261.0                  0.0   
3  64.0  1.0         4.0    128.0       263.0                  0.0   
4  74.0  0.0         2.0    120.0       269.0                  0.0   
5  65.0  1.0         4.0    120.0       177.0                  0.0   
6  56.0  1.0         3.0    130.0       256.0                  1.0   
7  59.0  1.0         4.0    110.0       239.0                  0.0   
8  60.0  1.0         4.0    140.0       293.0                  0.0   
9  63.0  0.0         4.0    150.0       407.0                  0.0   

   electrocardiographic  max-heart-rate  angina  oldpeak  slope  \
0                   2.0           109.0     0.0      2.4    2.0   
1                   2.0           160.0     0.0      1.6    2.0   
2                   0.0     

In [65]:
print(df_heart['chest-pain'].unique())

[4. 3. 2. 1.]


In [626]:
print(df_heart.isna().sum())

age                     0
sex                     0
chest-pain              0
rest-bp                 0
serum-chol              0
fasting-blood-sugar     0
electrocardiographic    0
max-heart-rate          0
angina                  0
oldpeak                 0
slope                   0
major-vessels           0
thal                    0
heart-disease           0
dtype: int64


In [627]:
df_skew = df_heart.select_dtypes(include=['number'])
skew_values_heart = df_skew.skew().sort_values(ascending=False)
print(skew_values_heart)

fasting-blood-sugar     1.991971
oldpeak                 1.262893
major-vessels           1.209890
serum-chol              1.183721
angina                  0.728915
rest-bp                 0.722618
slope                   0.543151
thal                    0.287268
heart-disease           0.224858
electrocardiographic   -0.044703
age                    -0.163615
max-heart-rate         -0.527737
sex                    -0.765084
chest-pain             -0.878767
dtype: float64


In [628]:
df_check_skewness = df_heart.melt(var_name="Feature", value_name="Value")
fig = px.box(df_check_skewness,
             x="Feature",
             y="Value",
             color="Feature",
             width=1200,
             height=600)

fig.update_layout(xaxis=dict(tickangle=45))
fig.show()

In [629]:
df_heart['fasting-blood-sugar'] = np.log1p(df_heart['fasting-blood-sugar'])
df_heart['oldpeak'] = np.log1p(df_heart['oldpeak'])
df_heart['major-vessels'] = np.log1p(df_heart['major-vessels'])
df_heart['serum-chol'] = np.log1p(df_heart['serum-chol'])

df_heart['angina'] = np.sqrt(df_heart['angina'])
df_heart['rest-bp'] = np.sqrt(df_heart['rest-bp'])
df_heart['slope'] = np.sqrt(df_heart['slope'])

df_heart['sex'] = np.power(df_heart['sex'], 2)
df_heart['chest-pain'] = np.power(df_heart['chest-pain'], 2)
df_heart['max-heart-rate'] = np.power(df_heart['max-heart-rate'], 2)

# After removing the outliers

In [630]:
df_check_skewness = df_heart.melt(var_name="Feature", value_name="Value")
fig = px.box(df_check_skewness,
             x="Feature",
             y="Value",
             color="Feature",
             width=1200,
             height=600)

fig.update_layout(xaxis=dict(tickangle=45))
fig.show()

In [631]:
X = df_heart.drop(columns=['heart-disease'])
y = df_heart['heart-disease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [632]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [633]:
LR = LogisticRegression()
LR.fit(X_train, y_train)

y_pred = LR.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
print(f'Logistic Regression Accuracy: {lr_accuracy:.4f}')
print(y_pred)

Logistic Regression Accuracy: 0.8519
[1 1 1 2 1 1 1 2 2 2 1 2 1 2 2 2 1 2 1 2 1 2 1 1 1 2 1 1 1 2 1 2 2 2 1 1 2
 1 1 1 2 1 2 2 2 1 2 2 2 2 2 2 1 2]


In [634]:
DT = DecisionTreeClassifier(criterion='gini', max_depth=8, random_state=42)
DT.fit(X_train, y_train)
y_pred_dt = DT.predict(X_test)

dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f'Decision Tree Accuracy: {dt_accuracy:.4f}')

Decision Tree Accuracy: 0.7963


In [635]:
RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(X_train, y_train)
y_pred = RF.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Accuracy: {rf_accuracy:.4f}')

Random Forest Accuracy: 0.8148


In [636]:
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=6, random_state=42)
gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
print(f'Gradient Boosting Accuracy: {gb_accuracy:.4f}')

Gradient Boosting Accuracy: 0.8333


In [637]:
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred)
print(f'SVM Accuracy: {svm_accuracy:.4f}')

SVM Accuracy: 0.8333


In [638]:
import plotly.express as px
import pandas as pd


models = ['Logistic Regression', 'Decision Trees', 'Random Forest', 'Gradient Boosting', 'SVM']
accuracy_scores = [lr_accuracy, dt_accuracy, rf_accuracy, gb_accuracy, svm_accuracy]
df = pd.DataFrame({"Model": models, "Accuracy": accuracy_scores})

colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A']

fig = px.bar(df,
             x="Model",
             y="Accuracy",
             title="Comparison of Model Accuracy Scores",
             color="Model",
             color_discrete_map={
                 'Logistic Regression': '#636EFA',
                 'Decision Trees': '#EF553B',
                 'Random Forest': '#00CC96',
                 'Gradient Boosting': '#AB63FA',
                 'SVM': '#FFA15A'
             })

fig.update_layout(
    width=800,
    height=500,
    yaxis=dict(range=[0.4, 0.9]),
    xaxis=dict(tickangle=20),
    xaxis_title="Models",
    yaxis_title="Accuracy Score",
    template="plotly_white",
    font=dict(family="Arial", size=14)
)

fig.show()

##Liver

In [111]:
blob_client = container_client.get_blob_client("liver.csv")

downloaded_blob_liver = blob_client.download_blob()
csv_data_liver = downloaded_blob_liver.readall()

df_liver = pd.read_csv(io.StringIO(csv_data_liver.decode('utf-8')))
print(df_heart.head(10))

    age  sex  chest-pain    rest-bp  serum-chol  fasting-blood-sugar  \
0  70.0  1.0        16.0  11.401754    5.777652             0.000000   
1  67.0  0.0         9.0  10.723805    6.336826             0.000000   
2  57.0  1.0         4.0  11.135529    5.568345             0.000000   
3  64.0  1.0        16.0  11.313708    5.575949             0.000000   
4  74.0  0.0         4.0  10.954451    5.598422             0.000000   
5  65.0  1.0        16.0  10.954451    5.181784             0.000000   
6  56.0  1.0         9.0  11.401754    5.549076             0.693147   
7  59.0  1.0        16.0  10.488088    5.480639             0.000000   
8  60.0  1.0        16.0  11.832160    5.683580             0.000000   
9  63.0  0.0        16.0  12.247449    6.011267             0.000000   

   electrocardiographic  max-heart-rate  angina   oldpeak     slope  \
0                   2.0         11881.0     0.0  1.223775  1.414214   
1                   2.0         25600.0     0.0  0.955511  1.4142

In [112]:
print(df_liver.isna().sum())

Class               0
Age                 0
Sex                 0
Steroid             1
Antivirals          0
Fatigue             1
Malaise             1
Anorexia            1
Liver Big          10
Liver Firm         11
Spleen Palpable     5
Spiders             5
Ascites             5
Varices             5
Bilirubin           6
Alk Phosphate      29
Sgot                4
Albumin            16
Protime            67
Histology           0
dtype: int64


In [113]:
categorical_cols = ['Steroid', 'Fatigue', 'Malaise', 'Anorexia', 'Liver Big', 'Liver Firm', 'Spleen Palpable', 'Spiders', 'Ascites', 'Varices']
for col in categorical_cols:
    df_liver[col].fillna(df_liver[col].mode()[0], inplace=True)

numerical_cols = ['Bilirubin', 'Alk Phosphate', 'Sgot', 'Albumin', 'Protime']
for col in numerical_cols:
    df_liver[col].fillna(df_liver[col].median(), inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [114]:
print(df_liver.isna().sum())

Class              0
Age                0
Sex                0
Steroid            0
Antivirals         0
Fatigue            0
Malaise            0
Anorexia           0
Liver Big          0
Liver Firm         0
Spleen Palpable    0
Spiders            0
Ascites            0
Varices            0
Bilirubin          0
Alk Phosphate      0
Sgot               0
Albumin            0
Protime            0
Histology          0
dtype: int64


In [115]:
df_skew = df_liver.select_dtypes(include=['number'])
skew_values_heart = df_skew.skew().sort_values(ascending=False)
print(skew_values_heart)

Sgot               3.231320
Bilirubin          2.967489
Sex                2.633738
Alk Phosphate      1.666749
Fatigue            0.642651
Age                0.365294
Histology          0.196367
Protime            0.094513
Steroid           -0.039096
Albumin           -0.217016
Malaise           -0.440068
Liver Firm        -0.468129
Spiders           -0.734867
Class             -1.464700
Anorexia          -1.464700
Spleen Palpable   -1.566545
Liver Big         -1.859869
Antivirals        -1.926980
Ascites           -2.234862
Varices           -2.419832
dtype: float64


In [116]:
df_check_skewness = df_liver.melt(var_name="Feature", value_name="Value")
fig = px.box(df_check_skewness,
             x="Feature",
             y="Value",
             color="Feature",
             width=1200,
             height=600)

fig.update_layout(xaxis=dict(tickangle=45))
fig.show()

### Splitting the features

In [118]:
continuous_cols = ['Bilirubin', 'Sgot', 'Alk Phosphate', 'Protime', 'Albumin', 'Age']

categorical_cols = [
    'Sex', 'Fatigue', 'Malaise', 'Steroid', 'Spiders', 'Histology', 'Antivirals',
    'Ascites', 'Varices', 'Anorexia', 'Spleen Palpable', 'Liver Firm', 'Liver Big'
]

target_col = 'Class'

In [119]:
from sklearn.preprocessing import PowerTransformer, MinMaxScaler

pt = PowerTransformer(method='yeo-johnson')
df_liver[continuous_cols] = pt.fit_transform(df_liver[continuous_cols])

scaler = MinMaxScaler()
df_liver[continuous_cols] = scaler.fit_transform(df_liver[continuous_cols])

In [120]:
df_skew = df_liver.select_dtypes(include=['number'])
skew_values_heart = df_skew.skew().sort_values(ascending=False)
print(skew_values_heart)

Sex                2.633738
Fatigue            0.642651
Histology          0.196367
Bilirubin          0.158820
Protime            0.131101
Albumin            0.105183
Sgot               0.032223
Age                0.015046
Alk Phosphate     -0.016143
Steroid           -0.039096
Malaise           -0.440068
Liver Firm        -0.468129
Spiders           -0.734867
Class             -1.464700
Anorexia          -1.464700
Spleen Palpable   -1.566545
Liver Big         -1.859869
Antivirals        -1.926980
Ascites           -2.234862
Varices           -2.419832
dtype: float64


In [121]:
df_check_skewness = df_liver.melt(var_name="Feature", value_name="Value")
fig = px.box(df_check_skewness,
             x="Feature",
             y="Value",
             color="Feature",
             width=1200,
             height=600)

fig.update_layout(xaxis=dict(tickangle=45))
fig.show()

In [122]:
X = df_liver.drop(columns=['Class'])
y = df_liver['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [123]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [124]:
df_liver = pd.get_dummies(df_liver, columns=categorical_cols, drop_first=True)

In [125]:
LR = LogisticRegression()
LR.fit(X_train, y_train)

y_pred = LR.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
print(f'Logistic Regression Accuracy: {lr_accuracy:.4f}')

Logistic Regression Accuracy: 0.8718


In [126]:
DT = DecisionTreeClassifier(criterion='gini', max_depth=6, random_state=42)
DT.fit(X_train, y_train)
y_pred_dt = DT.predict(X_test)

dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f'Decision Tree Accuracy: {dt_accuracy:.4f}')

Decision Tree Accuracy: 0.8462


In [127]:
RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(X_train, y_train)
y_pred = RF.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Accuracy: {rf_accuracy:.4f}')

Random Forest Accuracy: 0.8205


In [128]:
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.25, max_depth=3, random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
print(f'Gradient Boosting Accuracy: {gb_accuracy:.4f}')

Gradient Boosting Accuracy: 0.9231


In [129]:
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred)
print(f'SVM Accuracy: {svm_accuracy:.4f}')

SVM Accuracy: 0.8462


In [130]:
models = ['Logistic Regression', 'Decision Trees', 'Random Forest', 'Gradient Boosting', 'SVM']
accuracy_scores = [lr_accuracy, dt_accuracy, rf_accuracy, gb_accuracy, svm_accuracy]
df = pd.DataFrame({"Model": models, "Accuracy": accuracy_scores})

colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A']

fig = px.bar(df,
             x="Model",
             y="Accuracy",
             title="Comparison of Model Accuracy Scores",
             color="Model",
             color_discrete_map={
                 'Logistic Regression': '#636EFA',
                 'Decision Trees': '#EF553B',
                 'Random Forest': '#00CC96',
                 'Gradient Boosting': '#AB63FA',
                 'SVM': '#FFA15A'
             })

fig.update_layout(
    width=800,
    height=500,
    yaxis=dict(range=[0.4, 0.9]),
    xaxis=dict(tickangle=20),
    xaxis_title="Models",
    yaxis_title="Accuracy Score",
    template="plotly_white",
    font=dict(family="Arial", size=14)
)

fig.show()

## Maternal Health

In [184]:
blob_client = container_client.get_blob_client("maternal_health_risk.csv")

downloaded_blob_mh = blob_client.download_blob()
csv_data_mh = downloaded_blob_mh.readall()

df_mh = pd.read_csv(io.StringIO(csv_data_mh.decode('utf-8')))
print(df_mh.head(10))

   Age  SystolicBP  DiastolicBP     BS  BodyTemp  HeartRate  RiskLevel
0   25         130           80  15.00      98.0         86  high risk
1   35         140           90  13.00      98.0         70  high risk
2   29          90           70   8.00     100.0         80  high risk
3   30         140           85   7.00      98.0         70  high risk
4   35         120           60   6.10      98.0         76   low risk
5   23         140           80   7.01      98.0         70  high risk
6   23         130           70   7.01      98.0         78   mid risk
7   35          85           60  11.00     102.0         86  high risk
8   32         120           90   6.90      98.0         70   mid risk
9   42         130           80  18.00      98.0         70  high risk


In [185]:
print(df_mh.isna().sum())

Age            0
SystolicBP     0
DiastolicBP    0
BS             0
BodyTemp       0
HeartRate      0
RiskLevel      0
dtype: int64


In [186]:
df_skew = df_mh.select_dtypes(include=['number'])
skew_values_mh = df_skew.skew().sort_values(ascending=False)
print(skew_values_mh)

BS             1.868203
BodyTemp       1.750988
Age            0.783063
DiastolicBP   -0.048441
SystolicBP    -0.251189
HeartRate     -1.043525
dtype: float64


In [187]:
df_check_skewness = df_mh.melt(var_name="Feature", value_name="Value")
fig = px.box(df_check_skewness,
             x="Feature",
             y="Value",
             color="Feature",
             width=1200,
             height=600)

fig.update_layout(xaxis=dict(tickangle=45))
fig.show()

In [188]:
pt = PowerTransformer(method='yeo-johnson')
df_mh[['BodyTemp', 'HeartRate']] = pt.fit_transform(df_mh[['BodyTemp', 'HeartRate']])

In [189]:
scaler = PowerTransformer()
scaled_features = scaler.fit_transform(df_mh[['Age', 'HeartRate']])
df_mh[['Age', 'HeartRate']] = scaled_features

In [190]:
df_check_skewness = df_mh.melt(var_name="Feature", value_name="Value")
fig = px.box(df_check_skewness,
             x="Feature",
             y="Value",
             color="Feature",
             width=1200,
             height=600)

fig.update_layout(xaxis=dict(tickangle=45))
fig.show()

In [191]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df_mh['RiskLevel'] = label_encoder.fit_transform(df_mh['RiskLevel'])
print(df_mh.head(10))

        Age  SystolicBP  DiastolicBP     BS      BodyTemp  HeartRate  \
0 -0.153700         130           80  15.00 -1.415534e-15   1.538299   
1  0.601410         140           90  13.00 -1.415534e-15  -0.582224   
2  0.183614          90           70   8.00  3.080869e-15   0.719434   
3  0.259732         140           85   7.00 -1.415534e-15  -0.582224   
4  0.601410         120           60   6.10 -1.415534e-15   0.190816   
5 -0.346063         140           80   7.01 -1.415534e-15  -0.582224   
6 -0.346063         130           70   7.01 -1.415534e-15   0.453641   
7  0.601410          85           60  11.00  6.994405e-15   1.538299   
8  0.403670         120           90   6.90 -1.415534e-15  -0.582224   
9  0.996041         130           80  18.00 -1.415534e-15  -0.582224   

   RiskLevel  
0          0  
1          0  
2          0  
3          0  
4          1  
5          0  
6          2  
7          0  
8          2  
9          0  


In [192]:
X = df_mh.drop(columns=['RiskLevel'])
y = df_mh['RiskLevel']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [193]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [194]:
LR = LogisticRegression()
LR.fit(X_train, y_train)

y_pred = LR.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
print(f'Logistic Regression Accuracy: {lr_accuracy:.4f}')

Logistic Regression Accuracy: 0.6496


In [195]:
DT = DecisionTreeClassifier(criterion='gini', max_depth=7, random_state=42)
DT.fit(X_train, y_train)
y_pred_dt = DT.predict(X_test)

dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f'Decision Tree Accuracy: {dt_accuracy:.4f}')

Decision Tree Accuracy: 0.7323


In [196]:
RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(X_train, y_train)
y_pred = RF.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Accuracy: {rf_accuracy:.4f}')

Random Forest Accuracy: 0.8740


In [197]:
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.65, max_depth=7, random_state=42)
gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
print(f'Gradient Boosting Accuracy: {gb_accuracy:.4f}')

Gradient Boosting Accuracy: 0.8622


In [198]:
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred)
print(f'SVM Accuracy: {svm_accuracy:.4f}')

SVM Accuracy: 0.7087


In [199]:
models = ['Logistic Regression', 'Decision Trees', 'Random Forest', 'Gradient Boosting', 'SVM']
accuracy_scores = [lr_accuracy, dt_accuracy, rf_accuracy, gb_accuracy, svm_accuracy]
df = pd.DataFrame({"Model": models, "Accuracy": accuracy_scores})

colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A']

fig = px.bar(df,
             x="Model",
             y="Accuracy",
             title="Comparison of Model Accuracy Scores",
             color="Model",
             color_discrete_map={
                 'Logistic Regression': '#636EFA',
                 'Decision Trees': '#EF553B',
                 'Random Forest': '#00CC96',
                 'Gradient Boosting': '#AB63FA',
                 'SVM': '#FFA15A'
             })

fig.update_layout(
    width=800,
    height=500,
    yaxis=dict(range=[0.4, 0.9]),
    xaxis=dict(tickangle=20),
    xaxis_title="Models",
    yaxis_title="Accuracy Score",
    template="plotly_white",
    font=dict(family="Arial", size=14)
)

fig.show()

## Obesity

In [174]:
blob_client = container_client.get_blob_client("obesity.csv")

downloaded_blob_ob = blob_client.download_blob()
csv_data_ob = downloaded_blob_ob.readall()

df_ob = pd.read_csv(io.StringIO(csv_data_ob.decode('utf-8')))
print(df_ob.head(10))

   Gender   Age  Height  Weight family_history_with_overweight FAVC  FCVC  \
0  Female  21.0    1.62    64.0                            yes   no   2.0   
1  Female  21.0    1.52    56.0                            yes   no   3.0   
2    Male  23.0    1.80    77.0                            yes   no   2.0   
3    Male  27.0    1.80    87.0                             no   no   3.0   
4    Male  22.0    1.78    89.8                             no   no   2.0   
5    Male  29.0    1.62    53.0                             no  yes   2.0   
6  Female  23.0    1.50    55.0                            yes  yes   3.0   
7    Male  22.0    1.64    53.0                             no   no   2.0   
8    Male  24.0    1.78    64.0                            yes  yes   3.0   
9    Male  22.0    1.72    68.0                            yes  yes   2.0   

   NCP       CAEC SMOKE  CH2O  SCC  FAF  TUE        CALC  \
0  3.0  Sometimes    no   2.0   no  0.0  1.0          no   
1  3.0  Sometimes   yes   3.0  y

In [175]:
print(df_ob.isna().sum())

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64


In [176]:
categorical_cols = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
df_ob = pd.get_dummies(df_ob, columns=categorical_cols, drop_first=True)

label_encoder = LabelEncoder()
df_ob['NObeyesdad'] = label_encoder.fit_transform(df_ob['NObeyesdad'])

numerical_cols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
scaler = StandardScaler()
df_ob[numerical_cols] = scaler.fit_transform(df_ob[numerical_cols])

In [177]:
X = df_ob.drop(columns=['NObeyesdad'])
y = df_ob['NObeyesdad']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [178]:
LR = LogisticRegression()
LR.fit(X_train, y_train)

y_pred = LR.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred)
print(f'Logistic Regression Accuracy: {lr_accuracy:.4f}')

Logistic Regression Accuracy: 0.8807



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [179]:
DT = DecisionTreeClassifier(criterion='gini', max_depth=6, random_state=42)
DT.fit(X_train, y_train)
y_pred_dt = DT.predict(X_test)

dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f'Decision Tree Accuracy: {dt_accuracy:.4f}')

Decision Tree Accuracy: 0.8598


In [180]:
RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(X_train, y_train)
y_pred = RF.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest Accuracy: {rf_accuracy:.4f}')

Random Forest Accuracy: 0.9413


In [181]:
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=4, random_state=42)
gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
print(f'Gradient Boosting Accuracy: {gb_accuracy:.4f}')

Gradient Boosting Accuracy: 0.9508


In [182]:
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred)
print(f'SVM Accuracy: {svm_accuracy:.4f}')

SVM Accuracy: 0.9261


In [183]:
models = ['Logistic Regression', 'Decision Trees', 'Random Forest', 'Gradient Boosting', 'SVM']
accuracy_scores = [lr_accuracy, dt_accuracy, rf_accuracy, gb_accuracy, svm_accuracy]
df = pd.DataFrame({"Model": models, "Accuracy": accuracy_scores})

colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A']

fig = px.bar(df,
             x="Model",
             y="Accuracy",
             title="Comparison of Model Accuracy Scores",
             color="Model",
             color_discrete_map={
                 'Logistic Regression': '#636EFA',
                 'Decision Trees': '#EF553B',
                 'Random Forest': '#00CC96',
                 'Gradient Boosting': '#AB63FA',
                 'SVM': '#FFA15A'
             })

fig.update_layout(
    width=800,
    height=500,
    yaxis=dict(range=[0.4, 0.9]),
    xaxis=dict(tickangle=20),
    xaxis_title="Models",
    yaxis_title="Accuracy Score",
    template="plotly_white",
    font=dict(family="Arial", size=14)
)

fig.show()

## Predictions - Dowloading the predicitons for Heart, Kidney and early diabetes for dashboard

### Heart Disease

In [652]:
X = df_heart.drop(columns=['heart-disease'])
y = df_heart['heart-disease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

X_test = X_test.copy()
X_test.index = df_heart.loc[X_test.index].index

In [653]:
df_heart.loc[X_test.index, 'heart_disease_prediction'] = y_pred

df_heart['disease_status'] = df_heart['heart_disease_prediction'].map({
    1: 'Absence of Heart Disease',
    2: 'Presence of Heart Disease'
})


df_heart.head(10)
df_heart.to_csv('heart_disease_predictions.csv', index=False)

Unnamed: 0,age,sex,chest-pain,rest-bp,serum-chol,fasting-blood-sugar,electrocardiographic,max-heart-rate,angina,oldpeak,slope,major-vessels,thal,heart-disease,heart_disease_prediction,disease_status
0,70.0,1.0,16.0,11.401754,5.777652,0.0,2.0,11881.0,0.0,1.223775,1.414214,1.386294,3.0,2,,
1,67.0,0.0,9.0,10.723805,6.336826,0.0,2.0,25600.0,0.0,0.955511,1.414214,0.0,7.0,1,,
2,57.0,1.0,4.0,11.135529,5.568345,0.0,0.0,19881.0,0.0,0.262364,1.0,0.0,7.0,2,,
3,64.0,1.0,16.0,11.313708,5.575949,0.0,0.0,11025.0,1.0,0.182322,1.414214,0.693147,7.0,1,2.0,Presence of Heart Disease
4,74.0,0.0,4.0,10.954451,5.598422,0.0,2.0,14641.0,1.0,0.182322,1.0,0.693147,3.0,1,,
5,65.0,1.0,16.0,10.954451,5.181784,0.0,0.0,19600.0,0.0,0.336472,1.0,0.0,7.0,1,,
6,56.0,1.0,9.0,11.401754,5.549076,0.693147,2.0,20164.0,1.0,0.470004,1.414214,0.693147,6.0,2,,
7,59.0,1.0,16.0,10.488088,5.480639,0.0,2.0,20164.0,1.0,0.788457,1.414214,0.693147,7.0,2,,
8,60.0,1.0,16.0,11.83216,5.68358,0.0,2.0,28900.0,0.0,0.788457,1.414214,1.098612,7.0,2,,
9,63.0,0.0,16.0,12.247449,6.011267,0.0,2.0,23716.0,0.0,1.609438,1.414214,1.386294,7.0,2,2.0,Presence of Heart Disease


### Chronic Kidney Diseases



In [559]:
df_kidneydiseases.loc[X_test.index, 'kidney_disease_prediction'] = y_pred_rf

print(df_kidneydiseases.head(20))
df_kidneydiseases.to_csv('updated_kidney_predictions.csv', index=False)

     age        bp        sg        al        su       bgr        bu  \
0   48.0  1.462256  0.024063  0.508398 -0.933877  2.197165  3.181531   
1    7.0  1.429473  0.024063  1.571189 -0.933877  2.228176  2.624877   
2   62.0  1.462256  0.010958  1.070153  1.077438  2.356087  3.480962   
3   48.0  1.453927  0.005233  1.571189 -0.933877  2.191759  3.522953   
4   51.0  1.462256  0.010958  1.070153 -0.933877  2.175474  2.923445   
5   60.0  1.469033  0.017219  1.375947 -0.933877  2.110725  2.891954   
6   68.0  1.453927  0.010958 -0.959685 -0.933877  2.165570  3.495235   
7   24.0  1.459520  0.017219  1.070153  1.081360  2.352978  3.063647   
8   52.0  1.474675  0.017219  1.375947 -0.933877  2.217651  3.575349   
9   53.0  1.469033  0.024063  1.070153 -0.933877  2.099895  4.005171   
10  50.0  1.443377  0.010958  1.070153  1.081360  2.370243  3.509228   
11  63.0  1.453927  0.010958  1.375947 -0.933877  2.345253  3.575349   
12  68.0  1.453927  0.017219  1.375947  1.002014  2.275347  3.71

### Breast Cancer

In [582]:
y_pred_svm.shape

(29,)

In [583]:
df_breastCancer.shape

(116, 10)

In [622]:
X_test = X_test.copy()
X_test = pd.DataFrame(X_test, index=df_breastCancer.index[:len(X_test)])

df_breastCancer.loc[X_test.index, 'breast_cancer_prediction'] = y_pred_svm

print(df_breastCancer.head(20))
df_breastCancer.to_csv('updated_breast_cancer_predictions.csv', index=False)

    Age        BMI   Glucose   Insulin      HOMA    Leptin  Adiponectin  \
0    48  23.500000  0.602905  0.479060 -0.912991  2.124182     2.191156   
1    83  20.690495  0.603097  0.491389 -0.376519  2.128146     1.646510   
2    82  23.124670  0.603091  0.519534  0.009583  2.796991     2.959658   
3    68  21.367521  0.602982  0.494324 -0.550339  2.233871     1.908616   
4    86  21.111111  0.603097  0.502043 -0.227739  1.862671     1.533427   
5    49  22.854458  0.603097  0.494324 -0.335688  1.881419     2.508677   
6    89  22.700000  0.602982  0.522384 -0.118827  1.899799     1.674098   
7    76  23.800000  0.603211  0.542163  0.548060  1.437874     2.479415   
8    73  22.000000  0.603126  0.497423 -0.233037  1.472929     2.251925   
9    75  23.000000  0.603035  0.525988  0.013701  2.753490     2.355020   
10   34  21.470000  0.602992  0.500235 -0.444966  2.601347     2.469548   
11   29  23.010000  0.603027  0.534417  0.131573  3.434963     3.117449   
12   25  22.860000  0.603