# Formatting and cleaning

## 1. Load basic modules and the csv file 

In [1]:
import pandas as pd
import numpy as np
import random as rd

# Load the raw screening data and discard the 'Unnamed: 0' and 'id' columns,
# which are not used anywhere else in this notebook.
# The drop is chained (no inplace=True) so that re-running this cell always
# rebuilds `data` from the file in a single expression.
data = (
    pd.read_csv('../data_raw/Health_Screening_Data.csv')
    .drop(columns=['Unnamed: 0', 'id'])
)

In [2]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,AgeinYr,BMI,BMICat,AgeGroup
0,18393,2,168,62.0,110,80,1,1,0,0,1,0,50,22.0,Normal,40-60
1,20228,1,156,85.0,140,90,3,1,0,0,1,1,55,34.9,Obese,40-60
2,18857,1,165,64.0,130,70,3,1,0,0,0,1,51,23.5,Normal,40-60
3,17623,2,169,82.0,150,100,1,1,0,0,1,1,48,28.7,Over Weight,40-60
4,17474,1,156,56.0,100,60,1,1,0,0,0,0,47,23.0,Normal,40-60


## 2. Importing functions from the Cleaning_functions module (in ../src/)

In [3]:
import sys
# Add ../src/ to the module search path so the project-local
# Cleaning_functions module can be imported below. The path is relative,
# so it resolves against the notebook's current working directory.
sys.path.append('../src/')
import Cleaning_functions as cf

## 3. Categorical variables

### 3.1 Gender

**The "gender" column is numerically coded (1 and 2); it is recategorized into "m" and "f".**

In [4]:
# Show the raw (numeric) gender codes before they are relabelled.
data[["gender"]].head(n=5)

Unnamed: 0,gender
0,2
1,1
2,1
3,2
4,1


**The correction is made to categorize 1='m' and 2='f'**

In [5]:
# Replace `data` with the frame returned by cf.correct_gender; the
# value_counts() in the next cell shows the column then holds 'm' and 'f'.
# NOTE(review): confirm the 1 -> 'm', 2 -> 'f' mapping against the dataset's
# data dictionary. In the head() preview the gender == 2 rows have the larger
# heights (168, 169 vs 156, 165, 156), and the Framingham score computed
# later dispatches on this label.
data = cf.correct_gender(data)

In [6]:
# Count how many rows carry each gender label after the correction.
data["gender"].value_counts()

m    45502
f    24458
Name: gender, dtype: int64

### 3.2 Cholesterol, Gluc, Smoke, Alco, Active

**These variables are coded as integers in {0, 1, 2, 3}, depending on the variable.**

In [7]:
# Columns that hold integer-coded categories; previewed here before mapping.
categorical_variables = [
    "cholesterol",
    "gluc",
    "smoke",
    "alco",
    "active",
]
data.loc[:, categorical_variables].head()

Unnamed: 0,cholesterol,gluc,smoke,alco,active
0,1,1,0,0,1
1,3,1,0,0,1
2,3,1,0,0,0
3,1,1,0,0,1
4,1,1,0,0,0


**The numeric codes of the categorical variables are mapped to their corresponding labels.**

In [8]:
# Human-readable label for each integer code, per categorical column.
# The label for cholesterol code 3 is 'high' so that it agrees with the
# values displayed after cf.correct_categorical runs (it was 'alto').
# NOTE(review): this dict is not passed to cf.correct_categorical in the next
# cell, so it documents the mapping rather than driving it. Confirm it matches
# the mapping implemented inside the helper.
info = {
    'cholesterol': {1: 'normal', 2: 'bordering', 3: 'high'},
    'gluc': {1: 'normal', 2: 'prediabetes', 3: 'diabetes'},
    'smoke': {1: 'yes', 0: 'no'},
    'alco': {1: 'yes', 0: 'no'},
    'active': {1: 'yes', 0: 'no'},
}

In [9]:
# Replace `data` with the frame returned by cf.correct_categorical.
# NOTE(review): only `data` is passed, so the code-to-label mapping applied
# here is whatever the helper defines internally, not the `info` dict from
# the previous cell. Confirm the two agree.
data = cf.correct_categorical(data)

In [10]:
# Same column list as in the earlier preview cell, repeated so this cell
# runs on its own; shows the columns after the codes were replaced by labels.
categorical_variables = [
    "cholesterol",
    "gluc",
    "smoke",
    "alco",
    "active",
]
data.loc[:, categorical_variables].head()

Unnamed: 0,cholesterol,gluc,smoke,alco,active
0,normal,normal,no,no,yes
1,high,normal,no,no,yes
2,high,normal,no,no,no
3,normal,normal,no,no,yes
4,normal,normal,no,no,no


## 4. Continuous variables

### 4.1 Blood Pressure

**Abnormal systolic blood pressure values are corrected and imputed.**

In [11]:
# Replace `data` with the frame returned by cf.systolic, which receives the
# names of both pressure columns. The correction/imputation rules live in the
# helper and are not visible in this notebook.
data = cf.systolic(data, "ap_hi", "ap_lo")

**Diastolic pressure values containing between 4 and 5 digits are eliminated due to excess zeros (they are interpreted as typing errors).**

In [12]:
# Apply cf.remove_zeros to the diastolic column only ("ap_lo").
# NOTE(review): the 4-to-5-digit criterion described above is not passed as an
# argument, so it is presumably implemented inside the helper. Confirm.
data = cf.remove_zeros(data, "ap_lo")

**Abnormal diastolic blood pressure values are corrected and imputed.**

In [13]:
# Replace `data` with the frame returned by cf.dyastolic (helper name spelled
# as in the module), which receives the names of both pressure columns. The
# correction/imputation rules are defined in the helper, not in this notebook.
data = cf.dyastolic(data, "ap_hi", "ap_lo")

**Systolic blood pressure should be higher than diastolic. A check is made to ensure that this condition is met. If it is not met, then the values are reversed.**

In [14]:
# Run the systolic-vs-diastolic consistency check on the two pressure columns
# and keep the frame the helper returns.
data = cf.check_bp(data, "ap_hi", "ap_lo")

**We consider the upper limit of diastolic pressure to be 110. Diastolic pressure values greater than 110 will be replaced by 110.**

In [15]:
# Apply the final diastolic adjustment to "ap_lo".
# NOTE(review): the 110 upper limit described above is not passed as an
# argument, so it is presumably hard-coded in cf.dyastolic_final. Confirm.
data = cf.dyastolic_final(data, "ap_lo")

**NaN values are found in the ap_hi variable, so the rows that contain missing values are removed.**

In [16]:
# Drop incomplete rows. dropna() without `subset` removes every row that has
# a NaN in ANY column, not only in ap_hi.
# The result is reassigned instead of using inplace=True, which keeps the
# cell consistent with the `data = ...` pattern used by the other cells.
data = data.dropna()

### 4.2 Age

**The "age" and "AgeGroup" columns are removed as they are not considered useful. In turn, the "AgeCat" column is generated.**

In [17]:
# Replace `data` with the frame returned by cf.correct_age. The preview in the
# next cell shows the resulting "AgeCat" column holding half-open decade
# intervals such as "[50, 60)" alongside "AgeinYr".
data = cf.correct_age(data)

In [18]:
# Compare the age in years with the generated age category.
data[["AgeinYr", "AgeCat"]].head(n=5)

Unnamed: 0,AgeinYr,AgeCat
0,50,"[50, 60)"
1,55,"[50, 60)"
2,51,"[50, 60)"
3,48,"[40, 50)"
4,47,"[40, 50)"


### 4.3 Height

**Removing outliers from the "height" column.**

In [19]:
# Height range before outlier removal: maximum first, then minimum.
print(data["height"].max(), data["height"].min(), sep="\n")

250
55


In [20]:
# Filter the "height" column using 150 and 200 as the lower and upper bounds
# passed to cf.remove_outliers; the helper prints the frame shape before and
# after.
# NOTE(review): the min/max printed in the next cell are 151 and 198, which
# suggests the bounds are exclusive. Confirm in cf.remove_outliers.
data = cf.remove_outliers(data, "height", 150, 200)

Shape of the initial dataframe: (69949, 15)
Shape of the dataframe after removing outliers: (67359, 15)


In [21]:
# Height range after outlier removal: maximum first, then minimum.
print(data["height"].max(), data["height"].min(), sep="\n")

198
151


### 4.4 Weight

In [22]:
# Weight range before correction: maximum first, then minimum.
print(data["weight"].max(), data["weight"].min(), sep="\n")

200.0
10.0


In [23]:
# Replace `data` with the frame returned by cf.correct_weight. The min/max
# checks around this cell show the minimum weight moving from 10.0 to 50.0
# while the maximum stays at 200.0.
# NOTE(review): whether low weights are dropped or replaced is not visible
# here. See cf.correct_weight.
data = cf.correct_weight(data)

In [24]:
# Weight range after correction: maximum first, then minimum.
print(data["weight"].max(), data["weight"].min(), sep="\n")

200.0
50.0


### 4.5 BMI

**Since the weight and height columns were corrected, the BMI variable is recalculated, as is its categorization.**

In [25]:
# Replace `data` with the frame returned by cf.create_BMI; the next cell
# shows the resulting "bmi" and "bmi_cat" columns.
# NOTE(review): confirm that the original "BMI" / "BMICat" columns from the
# raw file are removed by the helper, so stale values do not reach the
# exported files.
data = cf.create_BMI(data)

In [26]:
# Inspect the recalculated BMI and its category.
data[["bmi", "bmi_cat"]].head(n=5)

Unnamed: 0,bmi,bmi_cat
0,21.97,Normal Weight
1,34.93,Obesity
2,23.51,Normal Weight
3,28.71,Overweight
4,23.01,Normal Weight


### 4.6 Framingham Score

In [27]:
def framingham(row):
    """Compute the Framingham score for a single row.

    Rows whose 'gender' value is 'm' are scored with cf.framingham_men;
    every other row is scored with cf.framingham_women.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must expose a 'gender' entry; it is forwarded unchanged to the
        selected scoring helper.

    Returns
    -------
    Whatever the selected cf scoring helper returns for this row.
    """
    scorer = cf.framingham_men if row['gender'] == 'm' else cf.framingham_women
    return scorer(row)

In [28]:
# Call framingham once per row and store the result in the 'fram' column.
data['fram'] = data.apply(framingham, axis='columns')

### 4.7 European Score

In [29]:
# Build the ESC lookup table once, then hand it to cf.esc_score as an extra
# positional argument for every row; results go into the 'esc' column.
table = cf.create_table_ESC()
data['esc'] = data.apply(cf.esc_score, axis=1, args=(table,))

## 5. Saving to data_clean.csv

In [30]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
data.to_csv(path_or_buf='../documents/data_clean.csv', index=False)

## 6. Saving to data_to_model.csv

**The columns that will not be used as a source of information for the models are eliminated and data_to_model.csv is generated**

In [33]:
# Build the modelling frame by dropping the columns that are not used as
# model inputs, then write it to disk without the row index.
# NOTE(review): confirm no other non-model columns (e.g. the raw "BMI" /
# "BMICat" from the source file) are still present at this point.
data_model = data.drop(columns=['height', 'weight', 'AgeCat', 'bmi_cat'])
data_model.to_csv('../documents/data_to_model.csv', index=False)

# Preview last: only a cell's final expression is displayed, so the head()
# call had no visible effect while it sat in the middle of the cell.
data_model.head()