In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

# Reading File

In [2]:
data = pd.read_csv("ExtraTask.csv")
data.head(20)

Unnamed: 0.1,Unnamed: 0,MaskedID,date,pocType,numericvalue,unitname,unitindice
0,0,100005,2011-09-20,Weight,82.5,kilogram,kg
1,1,100005,2011-09-20,Weight,82.5,kilogram,kg
2,2,100005,2011-11-01,Weight,81.8,kilogram,kg
3,3,100005,2011-11-01,Weight,81.8,kilogram,kg
4,4,100005,2012-02-14,Weight,81.2,kilogram,kg
5,5,100005,2012-02-14,Weight,81.2,kilogram,kg
6,6,100005,2012-05-15,Weight,81.8,kilogram,kg
7,7,100005,2012-05-15,Weight,81.8,kilogram,kg
8,8,100005,2012-06-19,Weight,81.1,kilogram,kg
9,9,100005,2012-10-09,Weight,79.2,kilogram,kg


# Dropping 'unitname' and 'unitindice' columns

In [3]:
new_data = data.drop(['unitname', 'unitindice'], axis=1)
new_data.head(30)

Unnamed: 0.1,Unnamed: 0,MaskedID,date,pocType,numericvalue
0,0,100005,2011-09-20,Weight,82.5
1,1,100005,2011-09-20,Weight,82.5
2,2,100005,2011-11-01,Weight,81.8
3,3,100005,2011-11-01,Weight,81.8
4,4,100005,2012-02-14,Weight,81.2
5,5,100005,2012-02-14,Weight,81.2
6,6,100005,2012-05-15,Weight,81.8
7,7,100005,2012-05-15,Weight,81.8
8,8,100005,2012-06-19,Weight,81.1
9,9,100005,2012-10-09,Weight,79.2


# Checking unique values in column 'pocType'

In [4]:
unique = data.pocType.unique().tolist()
unique

['Weight',
 'Heart rate',
 'Height',
 'Waist circumference',
 'BMI',
 '[POC] Hemoglobin A1c']

# Selecting co-responding row data w.r.t unique values in column 'pocType'

In [5]:
weight = new_data.loc[new_data['pocType'] == 'Weight', 'numericvalue']
weight

0      82.5
1      82.5
2      81.8
3      81.8
4      81.2
       ... 
995    71.0
996    71.0
997    70.0
998    70.0
999    70.0
Name: numericvalue, Length: 1000, dtype: float64

In [6]:
heart_rate = new_data.loc[new_data['pocType'] == 'Heart rate', 'numericvalue']
heart_rate

1000    80.0
1001    80.0
1002    76.0
1003    76.0
1004    79.0
        ... 
1995    85.0
1996    85.0
1997    90.0
1998    90.0
1999    50.0
Name: numericvalue, Length: 1000, dtype: float64

In [7]:
height = new_data.loc[new_data['pocType'] == 'Height', 'numericvalue']
height

2000    154.0
2001    154.0
2002    154.0
2003    154.0
2004    154.0
        ...  
2995    161.0
2996    161.0
2997    161.0
2998    161.0
2999    162.0
Name: numericvalue, Length: 1000, dtype: float64

In [8]:
bmi = new_data.loc[new_data['pocType'] == 'BMI', 'numericvalue']
bmi

4000    34.79
4001    34.79
4002    34.20
4003    33.40
4004    33.40
        ...  
4995    25.91
4996    35.38
4997    24.86
4998    24.86
4999    25.21
Name: numericvalue, Length: 1000, dtype: float64

In [9]:
waist_circumference = new_data.loc[new_data['pocType'] == 'Waist Circumference', 'numericvalue']
waist_circumference

Series([], Name: numericvalue, dtype: float64)

In [10]:
hemoglobin = new_data.loc[new_data['pocType'] == '[POC] Hemoglobin A1c', 'numericvalue']
hemoglobin

5000    6.2
5001    6.2
5002    7.8
5003    7.8
5004    7.8
       ... 
5995    7.8
5996    7.2
5997    7.8
5998    7.8
5999    8.1
Name: numericvalue, Length: 1000, dtype: float64

# Adding new columns

In [11]:
new_data["Weight"] = weight
new_data["Heart Rate"] = heart_rate
new_data["Height"] = height
new_data["BMI"] = bmi
new_data["Hemoglobin A1c"] = hemoglobin
new_data["Waist Circumference"] = waist_circumference

new_data

Unnamed: 0.1,Unnamed: 0,MaskedID,date,pocType,numericvalue,Weight,Heart Rate,Height,BMI,Hemoglobin A1c,Waist Circumference
0,0,100005,2011-09-20,Weight,82.5,82.5,,,,,
1,1,100005,2011-09-20,Weight,82.5,82.5,,,,,
2,2,100005,2011-11-01,Weight,81.8,81.8,,,,,
3,3,100005,2011-11-01,Weight,81.8,81.8,,,,,
4,4,100005,2012-02-14,Weight,81.2,81.2,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
5995,5995,115763,2018-02-13,[POC] Hemoglobin A1c,7.8,,,,,7.8,
5996,5996,123412,2018-02-13,[POC] Hemoglobin A1c,7.2,,,,,7.2,
5997,5997,118752,2018-02-13,[POC] Hemoglobin A1c,7.8,,,,,7.8,
5998,5998,118752,2018-02-13,[POC] Hemoglobin A1c,7.8,,,,,7.8,


# Dropping date, pocType, and numericvalue columns

In [12]:
new_data = new_data.drop(['date'], axis=1)
new_data = new_data.drop(['pocType'], axis=1)
new_data = new_data.drop(['numericvalue'], axis=1)

new_data

Unnamed: 0.1,Unnamed: 0,MaskedID,Weight,Heart Rate,Height,BMI,Hemoglobin A1c,Waist Circumference
0,0,100005,82.5,,,,,
1,1,100005,82.5,,,,,
2,2,100005,81.8,,,,,
3,3,100005,81.8,,,,,
4,4,100005,81.2,,,,,
...,...,...,...,...,...,...,...,...
5995,5995,115763,,,,,7.8,
5996,5996,123412,,,,,7.2,
5997,5997,118752,,,,,7.8,
5998,5998,118752,,,,,7.8,


# Replacing Nan cells with their mean values 

In [13]:
new_data['Weight'].fillna((new_data['Weight'].mean()), inplace=True)
new_data['Heart Rate'].fillna((new_data['Heart Rate'].mean()), inplace=True)
new_data['Height'].fillna((new_data['Height'].mean()), inplace=True)
new_data['BMI'].fillna((new_data['BMI'].mean()), inplace=True)
new_data['Hemoglobin A1c'].fillna((new_data['Hemoglobin A1c'].mean()), inplace=True)
new_data['Waist Circumference'].fillna((new_data['Waist Circumference'].mean()), inplace=True)


new_data

Unnamed: 0.1,Unnamed: 0,MaskedID,Weight,Heart Rate,Height,BMI,Hemoglobin A1c,Waist Circumference
0,0,100005,82.5000,76.553,161.5827,32.37408,8.46871,
1,1,100005,82.5000,76.553,161.5827,32.37408,8.46871,
2,2,100005,81.8000,76.553,161.5827,32.37408,8.46871,
3,3,100005,81.8000,76.553,161.5827,32.37408,8.46871,
4,4,100005,81.2000,76.553,161.5827,32.37408,8.46871,
...,...,...,...,...,...,...,...,...
5995,5995,115763,83.7471,76.553,161.5827,32.37408,7.80000,
5996,5996,123412,83.7471,76.553,161.5827,32.37408,7.20000,
5997,5997,118752,83.7471,76.553,161.5827,32.37408,7.80000,
5998,5998,118752,83.7471,76.553,161.5827,32.37408,7.80000,
