# EE0005 Group Project

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import math
sb.set()


    Age | Objective Feature | age | int (days)
    Height | Objective Feature | height | int (cm) |
    Weight | Objective Feature | weight | float (kg) |
    Gender | Objective Feature | gender | categorical code |
    Systolic blood pressure | Examination Feature | ap_hi | int |
    Diastolic blood pressure | Examination Feature | ap_lo | int |
    Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |
    Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |
    Smoking | Subjective Feature | smoke | binary |
    Alcohol intake | Subjective Feature | alco | binary |
    Physical activity | Subjective Feature | active | binary |
    Presence or absence of cardiovascular disease | Target Variable | cardio | binary |


In [2]:
heartdata = pd.read_csv('cardio_train.csv')
heartdata.head(n = 10000)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,14249,20427,1,166,65.0,130,80,1,1,0,0,0,0
9996,14251,18214,1,160,93.0,140,80,2,1,0,0,1,1
9997,14252,14665,1,158,66.0,130,90,2,2,0,0,1,0
9998,14253,18328,1,168,70.0,160,80,3,1,0,0,1,1


In [3]:
heartdata.shape

(70000, 13)

In [4]:
heartdata.dtypes

id               int64
age              int64
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
dtype: object

In [5]:
heartdata['years'] = (heartdata['age']/365).round().astype('int')
years = pd.DataFrame(heartdata['years'])
years.head()

Unnamed: 0,years
0,50
1,55
2,52
3,48
4,48


In [6]:
height = pd.DataFrame(heartdata['height'])
height.head()

Unnamed: 0,height
0,168
1,156
2,165
3,169
4,156


In [7]:
weight = pd.DataFrame(heartdata['weight'])
weight.head()

Unnamed: 0,weight
0,62.0
1,85.0
2,64.0
3,82.0
4,56.0


In [8]:
heartdata['bmi'] = heartdata['weight'] / (heartdata['height']/100)**2
bmi = pd.DataFrame(heartdata['bmi'])
bmi.head()

Unnamed: 0,bmi
0,21.96712
1,34.927679
2,23.507805
3,28.710479
4,23.011177


MAP is Mean Arterial Pressure

In [9]:
heartdata['MAP'] = ((2 * heartdata['ap_lo'] + heartdata['ap_hi'])/3)
MAP = pd.DataFrame(heartdata['MAP'])
MAP.head()

Unnamed: 0,MAP
0,90.0
1,106.666667
2,90.0
3,116.666667
4,73.333333


In [109]:
newheartdata = heartdata.drop(columns = ['id', 'age'])

In [110]:
newheartdata.head()

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,years,bmi,MAP
0,2,168,62.0,110,80,1,1,0,0,1,0,50,21.96712,90.0
1,1,156,85.0,140,90,3,1,0,0,1,1,55,34.927679,106.666667
2,1,165,64.0,130,70,3,1,0,0,0,1,52,23.507805,90.0
3,2,169,82.0,150,100,1,1,0,0,1,1,48,28.710479,116.666667
4,1,156,56.0,100,60,1,1,0,0,0,0,48,23.011177,73.333333


In [132]:
newheartdata.describe()

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,years,bmi,MAP
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997,53.338686,27.556513,107.359371
std,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003,6.765294,6.091511,136.49369
min,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0,30.0,3.471784,-41.666667
25%,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,48.0,23.875115,93.333333
50%,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,54.0,26.374068,93.333333
75%,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0,58.0,30.222222,103.333333
max,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0,65.0,298.666667,7400.0


In [148]:
male_df = newheartdata[newheartdata['gender'] == 1]
male_df

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,years,bmi,MAP
1,1,156,85.0,140,90,3,1,0,0,1,1,55,34.927679,106.666667
2,1,165,64.0,130,70,3,1,0,0,0,1,52,23.507805,90.000000
4,1,156,56.0,100,60,1,1,0,0,0,0,48,23.011177,73.333333
5,1,151,67.0,120,80,2,2,0,0,0,0,60,29.384676,93.333333
6,1,157,93.0,130,80,3,1,0,0,1,0,61,37.729725,96.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69993,1,172,70.0,130,90,1,1,0,0,1,1,54,23.661439,103.333333
69994,1,165,80.0,150,80,1,1,0,0,1,1,58,29.384757,103.333333
69996,1,158,126.0,140,90,2,2,0,0,1,1,62,50.472681,106.666667
69998,1,163,72.0,135,80,1,2,0,0,0,1,61,27.099251,98.333333


In [150]:
male_df.drop(male_df[(male_df['ap_hi'] > male_df['ap_hi'].quantile(0.975)) | (male_df['ap_hi'] < male_df['ap_hi'].quantile(0.025))].index,inplace=True)
male_df.drop(male_df[(male_df['ap_lo'] > male_df['ap_lo'].quantile(0.975)) | (male_df['ap_lo'] < male_df['ap_lo'].quantile(0.025))].index,inplace=True)
male_df.drop(male_df[(male_df['bmi'] > male_df['bmi'].quantile(0.975)) | (male_df['bmi'] < male_df['bmi'].quantile(0.025))].index,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [151]:
male_df.describe()

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,years,bmi,MAP
count,40554.0,40554.0,40554.0,40554.0,40554.0,40554.0,40554.0,40554.0,40554.0,40554.0,40554.0,40554.0,40554.0,40554.0
mean,1.0,161.60364,71.996589,125.357425,80.75344,1.373971,1.230088,0.016916,0.024708,0.802042,0.490802,53.523721,27.615459,95.621435
std,0.0,6.591708,11.87645,13.948638,8.342884,0.687334,0.579575,0.128957,0.155235,0.398465,0.499922,6.628574,4.603173,9.425665
min,1.0,120.0,30.0,100.0,60.0,1.0,1.0,0.0,0.0,0.0,0.0,30.0,20.028842,73.333333
25%,1.0,157.0,64.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,50.0,23.951227,93.333333
50%,1.0,162.0,70.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,54.0,26.672763,93.333333
75%,1.0,166.0,80.0,130.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0,58.0,30.487805,103.333333
max,1.0,198.0,134.0,163.0,100.0,3.0,3.0,1.0,1.0,1.0,1.0,65.0,41.091387,121.0


New male_df count = 40554

In [160]:
female_df = newheartdata[newheartdata['gender'] == 2]
female_df

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,years,bmi,MAP
0,2,168,62.0,110,80,1,1,0,0,1,0,50,21.967120,90.000000
3,2,169,82.0,150,100,1,1,0,0,1,1,48,28.710479,116.666667
7,2,178,95.0,130,90,3,3,0,0,1,1,62,29.983588,103.333333
11,2,173,60.0,120,80,1,1,0,0,1,0,52,20.047446,93.333333
12,2,165,60.0,120,80,1,1,0,0,0,0,41,22.038567,93.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69981,2,182,110.0,130,90,2,2,0,0,1,1,48,33.208550,103.333333
69984,2,168,80.0,120,80,1,1,0,0,1,1,49,28.344671,93.333333
69986,2,180,78.0,120,80,1,1,0,0,1,0,50,24.074074,93.333333
69995,2,168,76.0,120,80,1,1,1,0,1,0,53,26.927438,93.333333


In [162]:
female_df.drop(female_df[(female_df['ap_hi'] > female_df['ap_hi'].quantile(0.975)) | (female_df['ap_hi'] < female_df['ap_hi'].quantile(0.025))].index,inplace=True)
female_df.drop(female_df[(female_df['ap_lo'] > female_df['ap_lo'].quantile(0.975)) | (female_df['ap_lo'] < female_df['ap_lo'].quantile(0.025))].index,inplace=True)
female_df.drop(female_df[(female_df['bmi'] > female_df['bmi'].quantile(0.975)) | (female_df['bmi'] < female_df['bmi'].quantile(0.025))].index,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [163]:
female_df.describe()

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,years,bmi,MAP
count,21500.0,21500.0,21500.0,21500.0,21500.0,21500.0,21500.0,21500.0,21500.0,21500.0,21500.0,21500.0,21500.0,21500.0
mean,2.0,170.025116,76.808688,127.48786,82.30507,1.323488,1.203256,0.214233,0.103023,0.806744,0.497116,53.119488,26.522601,97.366
std,0.0,6.651682,12.074933,13.970136,7.571438,0.64734,0.54466,0.410298,0.303996,0.394861,0.500003,6.93484,3.562157,8.928679
min,2.0,125.0,40.0,100.0,69.0,1.0,1.0,0.0,0.0,0.0,0.0,39.0,19.921875,79.333333
25%,2.0,166.0,68.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,48.0,23.875115,93.333333
50%,2.0,170.0,75.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,54.0,25.909457,93.333333
75%,2.0,174.0,85.0,140.0,90.0,1.0,1.0,0.0,0.0,1.0,1.0,58.0,28.731747,103.333333
max,2.0,198.0,128.0,170.0,100.0,3.0,3.0,1.0,1.0,1.0,1.0,65.0,36.93416,123.333333


New female_df count = 21500

In [None]:
femaleMAP = pd.DataFrame(female_df['MAP'])  # Response
femaleBMI = pd.DataFrame(female_df['bmi'])        # Predictor

In [None]:
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Split the Dataset into Train and Test
X_train, X_test, y_train, y_test = train_test_split(femaleBMI, femaleMAP, test_size = 0.25)  #input is X value which is hp, y-value is total
#test_size fix the number you want to test, which is 1/4 of the data set which is the same 200

# Linear Regression using Train Data
linreg = LinearRegression()         # create the linear regression object
linreg.fit(X_train, y_train)        # train the linear regression model

# Coefficients of the Linear Regression line
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()

# Predict Total values corresponding to HP
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_train, y_train_pred))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, y_test_pred))
print()

# Plot the Predictions vs the True values
f, axes = plt.subplots(1, 2, figsize=(24, 12))
axes[0].scatter(y_train, y_train_pred, color = "blue")
axes[0].plot(y_train, y_train, 'w-', linewidth = 1)
axes[0].set_xlabel("True values of the Response Variable (Train)")
axes[0].set_ylabel("Predicted values of the Response Variable (Train)")
axes[1].scatter(y_test, y_test_pred, color = "green")
axes[1].plot(y_test, y_test, 'w-', linewidth = 1)
axes[1].set_xlabel("True values of the Response Variable (Test)")
axes[1].set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()