In [26]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import sklearn.linear_model
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
from pickle import dump

In [27]:
# Baseline: ordinary least-squares regression of the diabetes flag on age alone.
diabetes_df = pd.read_csv("../datasets/diabetes.csv", index_col=False)
simple_regress = sklearn.linear_model.LinearRegression().fit(
    diabetes_df[["age"]],       # single-feature design matrix
    diabetes_df[["diabetes"]],  # target passed as a one-column frame
)
# In-sample predictions on the same rows the model was fitted on.
y_pred = simple_regress.predict(diabetes_df[["age"]])

# Diabetes classifier: preprocess the frame, export the features, fit a
# logistic regression, and pickle the fitted model.
diabetes_df = diabetes_df.drop(
    columns=["HbA1c_level", "blood_glucose_level", "smoking_history"]
)

# Numeric gender encoding: "Male" -> 1, "Female" -> 0, anything else -> 0.5.
diabetes_df["gender"] = diabetes_df["gender"].map(
    lambda label: {"Male": 1, "Female": 0}.get(label, 0.5)
)

# Standardise the continuous columns and keep only their normalised versions.
cts_cols = ["age", "bmi"]
diabetes_scaler = preprocessing.StandardScaler()
diabetes_scaler.fit(diabetes_df[cts_cols])
diabetes_df[[f"normalized_{name}" for name in cts_cols]] = diabetes_scaler.transform(
    diabetes_df[cts_cols]
)
diabetes_df = diabetes_df.drop(columns=cts_cols)

# Split target from features; the feature matrix is also written to disk.
diabetes_y = diabetes_df["diabetes"]
diabetes_x = diabetes_df.drop(columns=["diabetes"])
diabetes_x.to_csv("../datasets/post-processing/diabetes.csv", index=False)

diabetes_model = sklearn.linear_model.LogisticRegression().fit(diabetes_x, diabetes_y)
with open("diabetes_model.pkl", "wb") as f:
    dump(diabetes_model, f, protocol=5)

In [28]:
# Heart-disease classifier: preprocess the frame, export the features, fit a
# logistic regression, and pickle the fitted model.
heart_df = pd.read_csv("../datasets/heart_disease.csv", index_col=False)
heart_df = heart_df.drop(
    columns=[
        "SleepTime", "GenHealth", "PhysicalHealth", "MentalHealth",
        "PhysicalActivity", "Race", "SkinCancer", "Diabetic",
        "DiffWalking", "KidneyDisease",
    ]
)

# Columns whose first value is a float64 are queued for standardisation;
# columns holding exactly {"Yes", "No"} are recoded to 1 / 0 in place.
cts_cols = []
binary_cols = []  # declared but never filled in this cell
for col in heart_df.columns:
    first_value = heart_df[col].iloc[0]
    if type(first_value) == np.float64:
        cts_cols.append(col)
        continue
    if set(heart_df[col].unique()) == {"Yes", "No"}:
        heart_df[col] = heart_df[col].map({"Yes": 1, "No": 0})

# "Male" -> 1, every other value -> 0.
heart_df["Sex"] = heart_df["Sex"].eq("Male").astype("int64")

# Replace each age bucket by a single representative age; a bucket that is
# missing from the table raises KeyError.
age_map = {
    '18-24': 21, '25-29': 27, '30-34': 32, '35-39': 37,
    '40-44': 42, '45-49': 47, '50-54': 52, '55-59': 57,
    '60-64': 62, '65-69': 67, '70-74': 72, '75-79': 77,
    '80 or older': 84,
}
heart_df["Age"] = [age_map[bucket] for bucket in heart_df["AgeCategory"]]
cts_cols.append("Age")
heart_df = heart_df.drop(columns=["AgeCategory"])

# Standardise the continuous columns and keep only their normalised versions.
heart_scaler = preprocessing.StandardScaler()
heart_scaler.fit(heart_df[cts_cols])
heart_df[[f"normalized_{name}" for name in cts_cols]] = heart_scaler.transform(
    heart_df[cts_cols]
)
heart_df = heart_df.drop(columns=cts_cols)

# Everything except the target is a feature; the feature matrix is also saved.
heart_x = heart_df.loc[:, heart_df.columns != "HeartDisease"]
heart_x.to_csv("../datasets/post-processing/heart.csv", index=False)
heart_y = heart_df["HeartDisease"]

heart_model = sklearn.linear_model.LogisticRegression()
heart_model.fit(heart_x, heart_y)
with open("heart_model.pkl", "wb") as f:
    dump(heart_model, f, protocol=5)

In [29]:
# Stroke classifier: preprocess the frame, export the features, fit a
# logistic regression, and pickle the fitted model.
stroke_model = sklearn.linear_model.LogisticRegression()
stroke_df = pd.read_csv("../datasets/stroke.csv",index_col=False)

# Drop the identifier and the categorical columns that are not used as features.
stroke_df.drop(columns=["id", "work_type","ever_married","Residence_type"],inplace=True)
# Numeric gender encoding: "Female" -> 0, "Male" -> 1, anything else -> 0.5.
stroke_df["gender"]=stroke_df["gender"].map(lambda x: 0 if x=="Female" else 1 if x=="Male" else 0.5)
cts_cols=["avg_glucose_level","bmi","age"]

# Standardise the continuous columns and keep only their normalised versions.
stroke_scaler=preprocessing.StandardScaler().fit(stroke_df[cts_cols])
stroke_df[["normalized_"+var for var in cts_cols]]=stroke_scaler.transform(stroke_df[cts_cols])
stroke_df.drop(columns=cts_cols,inplace=True)

# Derive two smoking indicators; "Unknown" becomes NaN so it is imputed below.
# Fix: the former-smoker category was matched as "formerly smokes", so former
# smokers were encoded as never having smoked. The dataset spelling
# "formerly smoked" is now accepted; the old spelling is kept for compatibility.
# NOTE(review): confirm the exact category labels against ../datasets/stroke.csv.
stroke_df["ever_smoked"]=stroke_df["smoking_status"].map(
    lambda x: np.nan if x=='Unknown'
    else 1 if x in ("smokes","formerly smoked","formerly smokes")
    else 0
)
stroke_df["current_smoker"]=stroke_df["smoking_status"].map(lambda x: np.nan if x=='Unknown' else 1 if x=="smokes" else 0)
stroke_df.drop(columns=["smoking_status"], inplace=True)

# Mean-impute every remaining NaN. The imputer is run over the whole frame,
# which includes the "stroke" target column as well as the features.
stroke_df[:]=SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(stroke_df)

# Everything except the target is a feature; the feature matrix is also saved.
stroke_x=stroke_df[[col for col in stroke_df.columns if col!="stroke"]]
stroke_x.to_csv("../datasets/post-processing/stroke.csv",index=False)
stroke_y=stroke_df["stroke"]
stroke_model.fit(stroke_x,stroke_y)
with open("stroke_model.pkl","wb") as f:
    dump(stroke_model,f,protocol=5)


In [30]:
# Alzheimer's classifier: preprocess the frame, export the features, fit a
# logistic regression, and pickle the fitted model.
cts_cols = ["Age", "SystolicBP", "DiastolicBP", "CholesterolTotal"]
alzheimers_df = pd.read_csv("../datasets/alzheimers.csv", index_col=False)
# Keep only the target and the four continuous predictors, target first.
alzheimers_df = alzheimers_df[["Diagnosis", *cts_cols]]

# Standardise the continuous columns and keep only their normalised versions.
alzheimers_scaler = preprocessing.StandardScaler()
alzheimers_scaler.fit(alzheimers_df[cts_cols])
alzheimers_df[[f"normalized_{name}" for name in cts_cols]] = alzheimers_scaler.transform(
    alzheimers_df[cts_cols]
)
alzheimers_df.drop(columns=cts_cols, inplace=True)

# Everything except the target is a feature; the feature matrix is also saved.
alzheimers_x = alzheimers_df.loc[:, alzheimers_df.columns != "Diagnosis"]
alzheimers_x.to_csv("../datasets/post-processing/alzheimers.csv", index=False)
alzheimers_y = alzheimers_df["Diagnosis"]

alzheimers_model = sklearn.linear_model.LogisticRegression()
alzheimers_model.fit(alzheimers_x, alzheimers_y)
with open("alzheimers_model.pkl", "wb") as f:
    dump(alzheimers_model, f, protocol=5)