In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [22]:
df = pd.read_csv('/content/Heart Diseases.csv')
df_chd = df[df['outcome'] != 0]

### Remove Duplicates

In [23]:
df.duplicated().sum()

7

In [None]:
df.drop_duplicates(inplace = True)
df_chd.drop_duplicates(inplace = True)

In [25]:
df.duplicated().sum()

0

### Fixing Data Entry Errors

Thalack

Edited previous values as maximum heart rate range is between 60-200 

1.42 into 142

1.71 into 171

In [26]:
df.loc[65,'thalack'] = 142
df_chd.loc[65,'thalack'] = 142
df.loc[19,'thalack'] = 171
df_chd.loc[65,'thalack'] = 142



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### Imputing Missing Values

#### CA (No. of Major Vessls (0-3) colored by flourosopy)

As there's correlation between the number of major vessels coloured by flouroscopy and the number of vessels affected, we decided to impute the missing values with the mode of the outcome.

In [None]:
c = 0
for i,x in df.iterrows():
  if(x['ca'] == 1 and x['outcome'] == 0):
    c = c+1
print(c)

0


In [None]:
df['ca'] = df['ca'].replace({'?':  df['outcome'].mode()[0]})
# Covert ca into numerical column
#df['ca'] = df['ca'].astype(str).astype(int)

#### Thalack (Maximum Heart Rate)

Imputed the missing thalack value with the mode of this feature.

In [None]:
df[df['thal'] == '?']
df['thal'] = df['thal'].replace({'?':df['thal'].mode()[0]})

Coverting thalack feature from object to numerical.

In [None]:
print(df.dtypes)

ID          object
age        float64
sex          int64
cp           int64
restbps      int64
chol         int64
fbs          int64
restecg      int64
thalack     object
exang        int64
oldpeak    float64
slope        int64
ca          object
thal        object
outcome      int64
dtype: object


In [None]:
df['thal'] = df['thal'].astype(str).astype(int)

In [None]:
print(df.dtypes)

ID          object
age        float64
sex          int64
cp           int64
restbps      int64
chol         int64
fbs          int64
restecg      int64
thalack     object
exang        int64
oldpeak    float64
slope        int64
ca          object
thal         int64
outcome      int64
dtype: object


### Outlier Detection

**General detection using boxplots**

In [None]:
def add_trace(fig, df, var, row, col):

  return fig.add_trace(
      go.Box(y=df[var],
      name=var),
      row=row, col=col
  )

def add_trace_cat(fig, df, var, row, col):

  return fig.add_trace(
      go.Box(x = df[var], y = df['thalack'],
      name=var),
      row=row, col=col
  )  
  
vars = ['age', 'restbps', 'chol',
       'thalack', 'oldpeak']

vars_categorical = ['sex', 'cp', 'restecg', 'exang', 'slope', 'ca', 'thal', 'fbs', 
                    'outcome']
                    
fig = make_subplots(rows=3, cols=2, 
                     subplot_titles=(vars))

add_trace(fig, df, vars[0], 1, 1)
add_trace(fig, df, vars[1], 1, 2)
add_trace(fig, df, vars[2], 2, 1)
add_trace(fig, df, vars[3], 2, 2)
add_trace(fig, df, vars[4], 3, 1)


fig.update_layout(height=700, width=700, title = "Spread of Numerical Data in the Heart Disease Dataset")

**age**
* i   Mean.i     SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier 
*   0 54.22484 9.40275   5.7     251 5.160707   3.732797    TRUE

In [None]:
display(df[(df['age'] == 57) & (df['outcome'] == 0)])
display(df[df['age'] == 5.7])

Unnamed: 0,ID,age,sex,cp,restbps,chol,fbs,restecg,thalack,exang,oldpeak,slope,ca,thal,outcome
7,hdoamedr51tflpsut,57.0,0,4,120,354,0,0,163,1,0.6,1,0,3,0
10,hdoamedhy94tgpsut,57.0,1,4,140,192,0,0,148,0,0.4,2,0,6,0
15,hdoamed9o5zx5psut,57.0,1,3,150,168,0,0,174,0,1.6,1,0,3,0
102,hdoamed4kg12ppsut,57.0,0,4,128,303,0,2,159,0,0.0,1,1,3,0
165,hdoamed89novbpsut,57.0,1,4,132,207,0,0,168,1,0.0,1,0,7,0
202,hdoamed0mk80epsut,57.0,1,3,150,126,1,0,173,0,0.2,1,1,7,0


Unnamed: 0,ID,age,sex,cp,restbps,chol,fbs,restecg,thalack,exang,oldpeak,slope,ca,thal,outcome
250,hdoamedow89nopsut,5.7,1,4,110,201,0,0,126,1,1.5,2,0,6,0


Since there isn't any other age with a decimal value, we can possibly consider this as a typo error, when we compared it with the other patients with age 57, we found that other values lie within the normal range.

In [None]:
df.loc['age', 250] = 57

**chol**
* i   Mean.i     SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
*   1 0 257.0548 197.21856  3600      76 16.950460   3.732797    TRUE
*   2 1 246.2362  51.19214   564     153  6.207276   3.731888    TRUE
*   3 2 245.2045  47.94996   417      49  3.582807   3.730976   FALSE
*   4 3 244.6450  47.01009   409     182  3.496165   3.730060   FALSE







In [None]:
display(df[(df['chol'] > 320) & (df['chol'] < 400) & (df['outcome'] == 0)])
display(df[df['chol'] == 3600])

Unnamed: 0,ID,age,sex,cp,restbps,chol,fbs,restecg,thalack,exang,oldpeak,slope,ca,thal,outcome
7,hdoamedr51tflpsut,57.0,0,4,120,354,0,0,163,1,0.6,1,0,3,0
26,hdoamed7b8gs2psut,58.0,0,3,120,340,0,0,172,0,0.0,1,0,3,0
82,hdoamedzrdaa9psut,39.0,1,3,140,321,0,2,182,0,0.0,1,0,3,0
84,hdoamedw7ac6npsut,52.0,1,2,120,325,0,0,172,0,0.2,1,0,3,0
173,hdoamed3f58qhpsut,62.0,0,4,140,394,0,2,157,0,1.2,2,0,3,0
201,hdoamedza35zgpsut,64.0,0,4,180,325,0,0,154,1,0.0,1,0,3,0
291,hdoameds3682jpsut,55.0,0,2,132,342,0,0,166,0,1.2,1,0,3,0


Unnamed: 0,ID,age,sex,cp,restbps,chol,fbs,restecg,thalack,exang,oldpeak,slope,ca,thal,outcome
75,hdoamedv93w6dpsut,65.0,0,3,160,3600,0,2,151,0,0.8,1,0,3,0


In [None]:
df.loc['chol', 75] = 360

**restbps**
* i   Mean.i     SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
* 1 0 131.6000 17.55255   200     127 3.896870   3.732797    TRUE
* 1 131.3786 17.14214   192     189 3.536394   3.731888   FALSE
* 2 131.1818 16.81668   180      84 2.902962   3.730976   FALSE
* 3 131.0228 16.61058   180     202 2.948555   3.730060   FALSE
* 4 130.8627 16.39894   180     232 2.996367   3.729141   FALSE
* 5 130.7016 16.18152   178     184 2.922986   3.728219   FALSE

The normal range for blood pressure is 60-200, so we can consider all values as normal.



**thalack**
* i   Mean.i     SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
* 1 0 149.3516 22.88374    71     246 3.423899   3.732797   FALSE

The normal range for thalack is 60-200, so we can consider the value as normal.

**oldpeak**
* i   Mean.i     SD.i Value Obs.Num    R.i+1 lambda.i+1 Outlier
* 0 1.160000 2.210349  34.0     300 14.857380   3.732797    TRUE
* 1 1.053722 1.178415   6.2      92  4.367117   3.731888    TRUE
* 2 1.037013 1.143083   5.6     124  3.991825   3.730976    TRUE
* 3 1.022150 1.114740   5.6     309  4.106653   3.730060    TRUE
* 4 1.007190 1.085258   4.4     286  3.126272   3.729141   FALSE

The normal range for oldpeak according to https://gndec.ac.in/~jagdeepmalhi/ihdps/ is between 0-6

According to the normal range we can assume that the value 6.2 is an outlier,
while the value 34 is a data entry outlier and it's actually 3.4, as it's similar to other rows within it's range.

In [None]:
display(df.loc[(df['oldpeak'] > 3) & (df['oldpeak'] < 4) & df['outcome']])
display(df.loc[df['oldpeak'] == 34])

Unnamed: 0,ID,age,sex,cp,restbps,chol,fbs,restecg,thalack,exang,oldpeak,slope,ca,thal,outcome
6,hdoamed295p0dpsut,62.0,0,4,140,268,0,2,160,0,3.6,3,2,3,3
9,hdoamedz87ilnpsut,53.0,1,4,140,203,1,2,155,1,3.1,3,0,7,1
23,hdoamedhh06eypsut,58.0,1,3,132,224,0,2,173,0,3.2,1,2,7,3
69,hdoamed8gs278psut,46.0,1,3,150,231,0,0,147,0,3.6,2,0,3,1
235,hdoamedwz96u4psut,54.0,1,4,122,286,0,2,116,1,3.2,2,2,3,3


Unnamed: 0,ID,age,sex,cp,restbps,chol,fbs,restecg,thalack,exang,oldpeak,slope,ca,thal,outcome
299,hdoamedn9erxppsut,68.0,1,4,144,193,1,0,141,0,34.0,2,2,7,2


In [None]:
display(df.loc[df['oldpeak'] == 6.2])

Unnamed: 0,ID,age,sex,cp,restbps,chol,fbs,restecg,thalack,exang,oldpeak,slope,ca,thal,outcome
91,hdoameduv295ppsut,62.0,0,4,160,164,0,2,145,0,6.2,3,3,7,3


In [None]:
df.loc['oldpeak', 299] = 3.4
#df.loc['oldpeak', 91]