## Pandas Project 

# Step 0 : Importing libraries and dataset

In [1]:
# Import 2 libraries
import pandas as pd 
import numpy as np


In [2]:
# Import our dataset
data = pd.read_csv('./heart.csv')


# Step 1 : Understand Info contained in Data & Examinine Data for Potential Issues

In [3]:
display(data.head())


Unnamed: 0,chest pain type (4 values),age in years,sex (1 = male; 0 = female),resting blood pressure,number of major vessels,exercise induced angina,oldpeak,slope of the peak,thal,target
0,Very High,63,1,Moderate,0,0,Low,0,1,1
1,High,37,1,Low,0,0,Moderate,0,2,1
2,Moderate,41,0,Low,0,0,Low,2,2,1
3,Moderate,56,1,Low,0,0,Very Low,2,2,1
4,Low,57,0,Low,0,1,Very Low,2,2,1


In [4]:
data.describe()


Unnamed: 0,age in years,sex (1 = male; 0 = female),number of major vessels,exercise induced angina,slope of the peak,thal,target
count,301.0,301.0,301.0,301.0,301.0,301.0,301.0
mean,54.378738,0.684385,0.72093,0.328904,1.398671,2.312292,0.541528
std,9.033793,0.465534,1.007568,0.470597,0.616872,0.612766,0.499102
min,29.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,0.0,1.0,2.0,0.0
50%,55.0,1.0,0.0,0.0,1.0,2.0,1.0
75%,61.0,1.0,1.0,1.0,2.0,3.0,1.0
max,77.0,1.0,4.0,1.0,2.0,3.0,1.0


In [5]:
data.shape


(301, 10)

In [6]:
data.dtypes

chest pain type (4 values)    object
age in years                   int64
sex (1 = male; 0 = female)     int64
resting blood pressure        object
number of major vessels        int64
exercise induced angina        int64
oldpeak                       object
slope of the peak              int64
thal                           int64
target                         int64
dtype: object

# Step 2 : Cleaning Data

### Missing Values

In [7]:
# Find how prevalent missing values are in our data 
null_cols = data.isnull().sum()
null_cols[null_cols > 0]
null_cols

chest pain type (4 values)    0
age in years                  0
sex (1 = male; 0 = female)    0
resting blood pressure        0
number of major vessels       0
exercise induced angina       0
oldpeak                       0
slope of the peak             0
thal                          0
target                        0
dtype: int64

### Incorrect Values 

In [8]:
# Find any incorrect data-type
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 10 columns):
chest pain type (4 values)    301 non-null object
age in years                  301 non-null int64
sex (1 = male; 0 = female)    301 non-null int64
resting blood pressure        301 non-null object
number of major vessels       301 non-null int64
exercise induced angina       301 non-null int64
oldpeak                       301 non-null object
slope of the peak             301 non-null int64
thal                          301 non-null int64
target                        301 non-null int64
dtypes: int64(7), object(3)
memory usage: 23.6+ KB


In [9]:
# All values seem correct
# Investigate why oldpeak is float64
print(data["oldpeak"])

0           Low
1      Moderate
2           Low
3      Very Low
4      Very Low
5      Very Low
6           Low
7      Very Low
8      Very Low
9           Low
10     Very Low
11     Very Low
12     Very Low
13          Low
14     Very Low
15          Low
16     Very Low
17     Moderate
18          Low
19          Low
20     Very Low
21     Very Low
22     Very Low
23     Very Low
24          Low
25     Very Low
26          Low
27     Very Low
28     Very Low
29     Very Low
         ...   
271    Very Low
272    Very Low
273    Very Low
274         Low
275    Very Low
276    Very Low
277    Moderate
278         Low
279    Very Low
280         Low
281    Very Low
282         Low
283         Low
284    Very Low
285    Very Low
286    Moderate
287         Low
288    Very Low
289        High
290    Moderate
291    Very Low
292    Moderate
293        High
294    Very Low
295    Very Low
296    Very Low
297    Very Low
298    Moderate
299    Very Low
300    Very Low
Name: oldpeak, Length: 3

In [10]:
# Makes sense after seing printed output since heart rate is very sensitive and even decimal places matters. This ST depression induced by exercise relative to rest.

### Low Variance Columns


In [11]:
low_variance = []

for col in data._get_numeric_data():
    minimum = min(data[col])
    ninety_perc = np.percentile(data[col], 90)
    if ninety_perc == minimum:
        low_variance.append(col)

print(low_variance)


[]


### Outliers -> Percentiles


In [12]:
#Find what cols still have low variance (though not as low as before)

low_variance2 = []

for col in data._get_numeric_data():
    minimum = min(data[col])
    ninety_perc = np.percentile(data[col], 85)
    if ninety_perc == minimum:
        low_variance2.append(col)

print(low_variance2)


[]


In [13]:
# Do research understand if "fbs" is outlier or not
# 1) What is Fbs?  A test to determine how much glucose (sugar) is in a blood sample after an overnight fast. The fasting blood glucose test is commonly used to detect diabetes mellitus. A blood sample is taken in a lab, physician's office, or hospital. The test is done in the morning, before the person has eaten. 
# 2) What is a normal Fbs? The US Federal Drug Administration allows home glucose monitors to have a variance of 15% in results. That means a reading of 100 mg/dl might actually be as low as 85 mg/dl or as high as 115, a huge variation
# Conc) Maintain column since variance is accepted in general health guidelines.

### Extreme Values -> IQR


In [14]:
stats = data.describe().transpose()
stats['IQR'] = stats['75%'] - stats['25%']
stats


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,IQR
age in years,301.0,54.378738,9.033793,29.0,48.0,55.0,61.0,77.0,13.0
sex (1 = male; 0 = female),301.0,0.684385,0.465534,0.0,0.0,1.0,1.0,1.0,1.0
number of major vessels,301.0,0.72093,1.007568,0.0,0.0,0.0,1.0,4.0,1.0
exercise induced angina,301.0,0.328904,0.470597,0.0,0.0,0.0,1.0,1.0,1.0
slope of the peak,301.0,1.398671,0.616872,0.0,1.0,1.0,2.0,2.0,1.0
thal,301.0,2.312292,0.612766,0.0,2.0,2.0,3.0,3.0,1.0
target,301.0,0.541528,0.499102,0.0,0.0,1.0,1.0,1.0,1.0


In [15]:
outliers = pd.DataFrame(columns=data.columns)

for col in stats.index:
    iqr = stats.at[col,'IQR']
    cutoff = iqr * 3
    lower = stats.at[col,'25%'] - cutoff
    upper = stats.at[col,'75%'] + cutoff
    results = data[(data[col] < lower) | 
                   (data[col] > upper)].copy()
    results['Outlier'] = col
    outliers = outliers.append(results)
    
outliers

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,Outlier,age in years,chest pain type (4 values),exercise induced angina,number of major vessels,oldpeak,resting blood pressure,sex (1 = male; 0 = female),slope of the peak,target,thal


In [16]:
#all but one value in outliers is fbs. Fbs stands for fasting blood sugar and data being very dispersed is normal, so we will only focus on understanding the other outlier.

In [17]:
#outlier with value col has maximum level of serum cholestoral in mg/dl in the whole dataset. It exceeds the mean (246) and distorts the data.

In [18]:
data = data.drop(index = 85)
data

Unnamed: 0,chest pain type (4 values),age in years,sex (1 = male; 0 = female),resting blood pressure,number of major vessels,exercise induced angina,oldpeak,slope of the peak,thal,target
0,Very High,63,1,Moderate,0,0,Low,0,1,1
1,High,37,1,Low,0,0,Moderate,0,2,1
2,Moderate,41,0,Low,0,0,Low,2,2,1
3,Moderate,56,1,Low,0,0,Very Low,2,2,1
4,Low,57,0,Low,0,1,Very Low,2,2,1
5,Low,57,1,Moderate,0,0,Very Low,1,1,1
6,Moderate,56,0,Moderate,0,0,Low,1,2,1
7,Moderate,44,1,Low,0,0,Very Low,2,3,1
8,High,52,1,High,0,0,Very Low,2,3,1
9,High,57,1,Moderate,0,0,Low,2,2,1


### Finding and Removing Duplicates


In [19]:
before = len(data)
data = data.drop_duplicates()
after = len(data)
print('Number of duplicate records dropped: ', str(before - after))


Number of duplicate records dropped:  4


# Step 3 : Manipulating Data

### Renaming Columns


In [20]:
data.columns


Index(['chest pain type (4 values)', 'age in years',
       'sex (1 = male; 0 = female)', 'resting blood pressure ',
       'number of major vessels', 'exercise induced angina', 'oldpeak',
       'slope of the peak', 'thal', 'target'],
      dtype='object')

In [21]:
data.columns = ['age in years', 'sex (1 = male; 0 = female)', 'chest pain type (4 values)', 'resting blood pressure ', 'chol', 'fasting blood sugar', 'restecg', 'maximum heart rate achieved','exercise induced angina', 'oldpeak', 'slope of the peak', 'number of major vessels', 'thal', 'target']

data.columns

ValueError: Length mismatch: Expected axis has 10 elements, new values have 14 elements

### Changing Column Order


In [None]:
#The "goal" field refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4. The rest in logical order preferred.

column_order = ['chest pain type (4 values)','age in years', 'sex (1 = male; 0 = female)', 'resting blood pressure ','number of major vessels','exercise induced angina', 'oldpeak', 'slope of the peak','thal', 'target'] 
                

data = data[column_order]
data.head()


### Binning Numeric Variables


In [None]:
mpg_labels = ['Low', 'Moderate', 'High', 'Very High']
bins = pd.cut(data['chest pain type (4 values)'],4, labels=mpg_labels)
bins.head(10)


In [None]:
mpg_labels2 = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']
bins2 = pd.cut(data['oldpeak'],5, labels=mpg_labels2)
bins2.head(10)

In [None]:
mpg_labels3 = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']
bins3 = pd.cut(data['resting blood pressure '],5, labels=mpg_labels3)
bins3.head(10)

### Substituting Binned Variables


In [22]:
data['chest pain type (4 values)'] = bins
data['oldpeak'] = bins2
data['resting blood pressure '] = bins3

data.head(20)

NameError: name 'bins' is not defined

# Step 4 : Export clean CSV version of data 

In [None]:
data.to_csv('./heart.csv', index=False)
