In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [2]:
# Load the CSV file into a DataFrame.
cardio_df = pd.read_csv(filepath_or_buffer="Cardio_Data_Excel_CleanUp_1.csv")

# Preview the first five rows to check the load.
cardio_df.head(n=5)


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,42515,23232,1,140,60.0,130,80,1,1,0,0,1,0
1,85626,14472,1,140,60.0,110,70,1,1,0,0,1,0
2,49793,17535,1,140,62.0,120,80,1,1,0,0,1,1
3,60954,15941,1,140,64.0,120,80,1,1,0,0,1,0
4,1456,14722,1,140,68.0,100,70,1,1,0,0,0,0


In [5]:
# Sample a quarter of the data to run mock ML algorithms.
# random_state pins the draw: without it every run picks different rows, so the
# split, the fitted models and all reported metrics would change from run to run.
cardio_df_sample = cardio_df.sample(frac=0.25, random_state=1)

# Preview the sampled rows (the index keeps the original row labels).
cardio_df_sample.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
7792,81974,21615,1,156,94.0,170,80,1,1,0,0,1,1
57977,20166,18890,1,179,76.0,140,90,2,2,0,0,1,1
17142,57907,22452,1,160,88.0,130,90,1,1,0,0,1,0
24864,40716,22040,2,164,67.0,140,90,1,1,0,0,1,1
12731,89934,21215,2,159,71.0,120,80,1,1,0,0,0,0


In [7]:
# The id column is redundant for this analysis, so remove it from the sample.
# NOTE: the result is assigned back to the same name, so re-running this cell
# raises a KeyError because "id" is already gone.
cardio_df_sample = cardio_df_sample.drop(columns="id")
cardio_df_sample.head(n=5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
7792,21615,1,156,94.0,170,80,1,1,0,0,1,1
57977,18890,1,179,76.0,140,90,2,2,0,0,1,1
17142,22452,1,160,88.0,130,90,1,1,0,0,1,0
24864,22040,2,164,67.0,140,90,1,1,0,0,1,1
12731,21215,2,159,71.0,120,80,1,1,0,0,0,0


In [8]:
# Age is recorded in days; convert it to rounded years using 365 days per year.
# NOTE: the column is overwritten, so running this cell twice divides by 365 twice.
cardio_df_sample["age"] = (cardio_df_sample["age"] / 365).round()

In [9]:
# Check the age column after the conversion to years.
cardio_df_sample.head(n=5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
7792,59.0,1,156,94.0,170,80,1,1,0,0,1,1
57977,52.0,1,179,76.0,140,90,2,2,0,0,1,1
17142,62.0,1,160,88.0,130,90,1,1,0,0,1,0
24864,60.0,2,164,67.0,140,90,1,1,0,0,1,1
12731,58.0,2,159,71.0,120,80,1,1,0,0,0,0


In [11]:
# Separate the target (output) from the features (input).

# Feature variables: every column except the "cardio" label.
X = cardio_df_sample.drop(columns="cardio")

# Target variable: the "cardio" label column.
y = cardio_df_sample["cardio"]

In [12]:
# Preview the feature matrix (the "cardio" column should no longer be present).
X.head(n=5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
7792,59.0,1,156,94.0,170,80,1,1,0,0,1
57977,52.0,1,179,76.0,140,90,2,2,0,0,1
17142,62.0,1,160,88.0,130,90,1,1,0,0,1
24864,60.0,2,164,67.0,140,90,1,1,0,0,1
12731,58.0,2,159,71.0,120,80,1,1,0,0,0


In [13]:
# Count how many sampled rows fall into each target class to check class balance.
from collections import Counter
Counter(y)

Counter({1: 7864, 0: 7263})

In [18]:
# Split the data into train and test sets. The target classes are already fairly
# evenly split; stratify=y keeps that class ratio the same in both sets.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)
X_train.shape

(11345, 11)

In [20]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only.
X_scaler = StandardScaler()
X_scaler.fit(X_train)

# Apply the same fitted scaler to both the training and the test features.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [21]:
# Build the logistic regression classifier (it is trained in a later cell).
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver="lbfgs", max_iter=200, random_state=1)

In [22]:
# Fit the logistic regression model on the scaled training data.
model.fit(X=X_train_scaled, y=y_train)

LogisticRegression(max_iter=200, random_state=1)

In [24]:
# Predict the class of each scaled test sample with the logistic regression model.
y_pred = model.predict(X_test_scaled)

# Put predictions next to the actual labels and renumber the rows from 0.
predictions = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
predictions = predictions.reset_index(drop=True)
predictions.head(10)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,0
3,0,0
4,0,1
5,0,0
6,0,1
7,1,1
8,0,1
9,0,0


In [25]:
from sklearn.metrics import accuracy_score

# Share of test samples whose predicted label equals the actual label.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.7165520888418826


In [26]:
# classification_report is imported here as well; the report cell below uses it.
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of actual vs. predicted labels for the current y_pred.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

[[1375  441]
 [ 631 1335]]


In [27]:
# Per-class precision, recall and F1 for the current y_pred.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.69      0.76      0.72      1816
           1       0.75      0.68      0.71      1966

    accuracy                           0.72      3782
   macro avg       0.72      0.72      0.72      3782
weighted avg       0.72      0.72      0.72      3782



In [28]:
# Build a random forest classifier with 128 trees and a fixed random_state.
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=1, n_estimators=128)

In [29]:
# Fit the random forest on the scaled training data and keep the returned estimator.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [39]:
# Predict the class of each scaled test sample with the random forest.
# NOTE: this reuses the name y_pred, replacing the logistic regression predictions.
y_pred = rf_model.predict(X=X_test_scaled)

In [40]:
# Collect the current predictions next to the actual labels, rows renumbered from 0.
predictions = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
predictions = predictions.reset_index(drop=True)
predictions.head(10)

Unnamed: 0,Prediction,Actual
0,1,1
1,0,1
2,1,0
3,1,0
4,0,1
5,0,0
6,0,1
7,1,1
8,0,1
9,0,0


In [41]:
# Share of test samples whose current prediction equals the actual label.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.7017451084082496


In [42]:
# Confusion matrix of actual vs. predicted labels for the current y_pred.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

[[1255  561]
 [ 567 1399]]


In [43]:
# Per-class precision, recall and F1 for the current y_pred.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.69      0.69      0.69      1816
           1       0.71      0.71      0.71      1966

    accuracy                           0.70      3782
   macro avg       0.70      0.70      0.70      3782
weighted avg       0.70      0.70      0.70      3782

