## Summary

We need to handle the test dataset in the same way as we handled the training dataset. 

So first we wrangle.

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

from wrangle_helper import split_row, split_tup, swap_job_living, swap_sex_age

%matplotlib inline

In [2]:
stk = pd.read_csv("../data/test.csv")
stk.head()

Unnamed: 0,id,sex and age,high_BP,heart_condition_detected_2017,married,job_status and living_area,average_blood_sugar,BMI,smoker_status,TreatmentA,TreatmentB,TreatmentC,TreatmentD
0,33327,"F, 36",0.0,0.0,1.0,private_sector?Remote,76.05,33.4,active_smoker,,,,
1,839,"F, 40",0.0,0.0,1.0,City?government,73.77,30.1,non-smoker,,,,
2,11127,"M, 59",0.0,0.0,1.0,business_owner?Remote,62.95,30.8,,,,,
3,20768,"33, F",0.0,0.0,1.0,private_sector?City,68.81,36.5,quit,,,,
4,37774,"F, 22",0.0,0.0,0.0,private_sector?City,122.89,30.8,active_smoker,,,,


In [3]:
stk.shape

(8718, 13)

<div class="alert alert-block alert-warning">
Highly Imbalanced Dataset!
</div>

In [4]:
stk.columns

Index(['id', 'sex and age', 'high_BP', 'heart_condition_detected_2017',
       'married', 'job_status and living_area', 'average_blood_sugar', 'BMI',
       'smoker_status', 'TreatmentA', 'TreatmentB', 'TreatmentC',
       'TreatmentD'],
      dtype='object')

In [5]:
stk.shape

(8718, 13)

In [6]:
stk["job_status and living_area"].unique()

array(['private_sector?Remote', 'City?government',
       'business_owner?Remote', 'private_sector?City',
       'business_owner?City', 'government?City', 'City?private_sector',
       'parental_leave?Remote', 'government?Remote',
       'Remote?private_sector', 'parental_leave?City',
       'City?parental_leave', 'Remote?parental_leave',
       'Remote?business_owner', 'unemployed?Remote',
       'City?business_owner', 'Remote?government', 'goVT.?Remote',
       '?City', 'remote?private_sector', 'GOVERNMENT?Remote',
       'parental_leave?remote', 'City?unemployed',
       'private_sector?remote', 'private_sector???', 'unemployed?City',
       nan, 'govt.?Remote', 'BUSINESS OWNER?City', 'private sector?City',
       'biz?City', 'city?private_sector', 'private_sector?CITY',
       'Private?Remote', 'r?private_sector', 'c?business_owner',
       'governMENT?City', 'City?GOVT.', 'Remote?unemployed',
       'business owner?Remote'], dtype=object)

In [7]:
stk["sex and age"].unique()

array(['F, 36', 'F, 40', 'M, 59', '33, F', 'F, 22', 'M, 60', 'F, 83',
       'M, 47', 'F, 82', 'F, 49', 'M, 58', 'M, 82', 'M, 4.88', 'F, 38',
       'M, 50', 'F, 84', 'F, 68', 'M, 40', 'M, 52', 'M, 62', 'M, 35',
       '32, F', 'M, 14', 'F, 35', 'F, 43', 'M, 43', 'M, 12', 'F, 55',
       'F, 33', 'F, 30', '36, M', 'F, 56', 'F, 78', 'M, 69', 'M, 48',
       'F, 64', 'F, 16', 'F, 63', 'F, 58', 'M, 26', 'F, 29', 'F, 61',
       'M, 79', 'M, 20', 'F, 5', 'F, 42', 'F, 72', 'F, 48', 'M, 73',
       'M, 21', 'M, 55', 'M, 34', 'F, 27', 'F, 41', 'F, 52', 'M, 31',
       'F, 47', 'F, 18', 'M, 85', 'M, 78', 'M, 3.72', 'M, 51', '50, M',
       'F, 4.08', 'F, 71', '63, F', 'F, 76', 'M, 45', 'M, 67', 'F, 60',
       'M, 19', 'F, 45', 'M, 17', 'F, 46', 'M, 7', 'F, 62', 'M, 65',
       'F, 66', 'F, 81', 'F, 50', '65, F', 'M, 54', 'F, 39', 'F, 3.32',
       'M, 6', 'F, 53', 'F, 28', 'F, 37', 'F, 57', 'F, 24', 'F, 65',
       'F, 26', 'F, 20', 'M, 8', 'F, 32', 'F, 80', '51, F', 'F, 4',
       'F, 59', '

Looks like there are some variables that need to be wrangled.

### Data Wrangling

## Wrangling Begins Here

<div class="alert alert-block alert-info">
Check Job status and living area
</div>

In [8]:
stk["job_status_living_area"] = stk["job_status and living_area"].apply(split_row, args="?")
stk["job_status_living_area"].head()

0    (private_sector, Remote)
1          (City, government)
2    (business_owner, Remote)
3      (private_sector, City)
4      (private_sector, City)
Name: job_status_living_area, dtype: object

In [9]:
stk['job_status'], stk['living_area'] = zip(*stk['job_status_living_area'].map(split_tup))
stk[["job_status_living_area", "job_status", "living_area"]].head()

Unnamed: 0,job_status_living_area,job_status,living_area
0,"(private_sector, Remote)",private_sector,Remote
1,"(City, government)",City,government
2,"(business_owner, Remote)",business_owner,Remote
3,"(private_sector, City)",private_sector,City
4,"(private_sector, City)",private_sector,City


---
__Living Area__
---

Make everything lowercase, remove weird variables, and correct spelling

In [10]:
stk["living_area"] = stk["living_area"].apply(lambda x: x.lower())

In [11]:
print(stk["living_area"].value_counts())
print(stk["living_area"].unique())

city              3756
remote            3738
private_sector     719
business_owner     179
parental_leave     170
government         138
a                   10
unemployed           6
govt.                1
                     1
Name: living_area, dtype: int64
['remote' 'government' 'city' 'private_sector' 'parental_leave'
 'business_owner' 'unemployed' '' 'a' 'govt.']


In [12]:
stk.loc[stk["living_area"] == "privattte", "living_area"] = "private_sector"
print(stk["living_area"].value_counts())

city              3756
remote            3738
private_sector     719
business_owner     179
parental_leave     170
government         138
a                   10
unemployed           6
govt.                1
                     1
Name: living_area, dtype: int64


In [13]:
stk[stk["living_area"] == "government"]

Unnamed: 0,id,sex and age,high_BP,heart_condition_detected_2017,married,job_status and living_area,average_blood_sugar,BMI,smoker_status,TreatmentA,TreatmentB,TreatmentC,TreatmentD,job_status_living_area,job_status,living_area
1,839,"F, 40",0.0,0.0,1.0,City?government,73.77,30.1,non-smoker,,,,,"(City, government)",City,government
20,2813,"M, 62",0.0,0.0,1.0,City?government,74.64,33.7,non-smoker,,,,,"(City, government)",City,government
122,24579,"M, 52",0.0,0.0,0.0,City?government,80.10,24.8,active_smoker,,,,,"(City, government)",City,government
159,24924,"F, 78",1.0,0.0,1.0,City?government,97.31,28.8,non-smoker,,,,,"(City, government)",City,government
172,625,"M, 44",0.0,0.0,,City?government,111.58,29.9,non-smoker,,,,,"(City, government)",City,government
208,2247,"F, 45",0.0,0.0,1.0,City?government,61.31,27.5,,,,,,"(City, government)",City,government
222,2181,"F, 72",1.0,0.0,1.0,City?government,116.80,,non-smoker,,,,,"(City, government)",City,government
288,2874,"M, 63",1.0,0.0,1.0,City?government,217.63,22.5,active_smoker,,,,,"(City, government)",City,government
315,3030,"F, 27",0.0,0.0,0.0,City?government,109.46,22.1,non-smoker,,,,,"(City, government)",City,government
520,24156,"M, 68",0.0,1.0,1.0,Remote?government,203.59,31.5,non-smoker,0.0,0.0,0.0,0.0,"(Remote, government)",Remote,government


<div class="alert alert-block alert-warning">
It appears that living_area and job_status are sometimes filled in the wrong columns
</div>

Hence, we need to write a function to swap them.

But before we do that, we designate the variables we want to keep.

In [14]:
living_area_keep = ["city", "remote", "private_sector", "business_owner",
                    "parental_leave", "government", "unemployed"]

In [15]:
stk = stk[(stk["living_area"].isin(living_area_keep))]
stk.shape

(8706, 16)

---
__Job status__
---

We take a similar approach to the `job_status`.

In [16]:
stk["job_status"] = stk["job_status"].apply(lambda x: x.lower())

In [17]:
print(stk["job_status"].value_counts())
print(stk["job_status"].unique())

private_sector    4293
business_owner    1143
parental_leave    1085
government         933
city               621
remote             589
unemployed          28
private sector       3
                     3
business owner       2
govt.                2
r                    1
private              1
biz                  1
c                    1
Name: job_status, dtype: int64
['private_sector' 'city' 'business_owner' 'government' 'parental_leave'
 'remote' 'unemployed' 'govt.' '' 'business owner' 'private sector' 'biz'
 'private' 'r' 'c']


In [18]:
stk.loc[stk["job_status"] == "parental leave", "job_status"] = "parental_leave"
stk.loc[stk["job_status"] == "private sector", "job_status"] = "private_sector"
stk.loc[stk["job_status"] == "private", "job_status"] = "private_sector"
stk.loc[stk["job_status"] == "govt.", "job_status"] = "government"
stk.loc[stk["job_status"] == "remotee", "job_status"] = "remote"
print(stk["job_status"].value_counts())

private_sector    4297
business_owner    1143
parental_leave    1085
government         935
city               621
remote             589
unemployed          28
                     3
business owner       2
r                    1
biz                  1
c                    1
Name: job_status, dtype: int64


In [19]:
# Variables to be dropped
job_status_keep = ["private_sector", "business_owner", "parental_leave", 
                   "government", "city", "remote", "unemployed"]

In [20]:
stk = stk[(stk["job_status"].isin(job_status_keep))]
stk.shape

(8698, 16)

__Now let's swap the variables around__

In [21]:
job_status_correct = ["private_sector", "business_owner", "parental_leave", 
                   "government", "unemployed"]
living_area_correct = ["city", "remote"]

In [22]:
stk["job_status_corr"], stk["living_area_corr"] = zip(*stk.apply(lambda x:\
            swap_job_living(x.job_status, x.living_area,
                           job_status_correct, living_area_correct), axis=1))

job: private_sector
area: remote
don't swap
job: city
area: government
swap
job: business_owner
area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: business_owner
area: city
don't swap
job: business_owner
area: city
don't swap
job: government
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: city
area: private_sector
swap
job: business_owner
area: city
don't swap
job: parental_leave
area: remote
don't swap
job: government
area: remote
don't swap
job: private_sector
area: city
don't swap
job: remote
area: private_sector
swap
job: remote
area: private_sector
swap
job: government
area: city
don't swap
job: business_owner
area: city
don't swap
job: private_sector
area: city
don't swap
job: city
area: government
swap
job: private_sector
area: city
don't swap
job: remote
area: private_sector
swap
job: private_sector
area: remote
don't swap
job: parental_leave
area: city
d

area: remote
don't swap
job: city
area: private_sector
swap
job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: government
area: city
don't swap
job: government
area: city
don't swap
job: city
area: private_sector
swap
job: government
area: remote
don't swap
job: remote
area: private_sector
swap
job: government
area: city
don't swap
job: business_owner
area: remote
don't swap
job: private_sector
area: city
don't swap
job: government
area: remote
don't swap
job: business_owner
area: city
don't swap
job: business_owner
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: city
area: government
swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: business_owner
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: business_owner
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't

area: city
don't swap
job: parental_leave
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: business_owner
area: remote
don't swap
job: private_sector
area: city
don't swap
job: parental_leave
area: city
don't swap
job: government
area: remote
don't swap
job: business_owner
area: remote
don't swap
job: government
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: business_owner
area: city
don't swap
job: parental_leave
area: remote
don't swap
job: private_sector
area: city
don't swap
job: government
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: city
area: government
swap
job: business_owner
area: city
don't swap
job: private_sector
area: remote
don't swap
job: government
area: city
don't swap
job: private_sector
area: remote
don't swap
job: parental_leave
area: remote
don't swap
job: city
area: private_sector
swap
job: government
area: city
don't swap
job: city
area: business_o

job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: business_owner
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: city
area: private_sector
swap
job: private_sector
area: remote
don't swap
job: parental_leave
area: city
don't swap
job: private_sector
area: remote
don't swap
job: remote
area: private_sector
swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: remote
area: private_sector
swap
job: private_sector
area: city
don't swap
job: government
area: remote
don't swap
job: city
area: private_sector
swap
job: remote
area: private_sector
swap
job: parental_leave
area: city
don't swap
job: parental_leave
area: city
don't swap
job: remote
area: private_sector
swap
job: private_sector
area: city
don't swap
job: city
area: government
swap
job: parental_leave
area: city
don't swap
job: business_owner


area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: business_owner
area: city
don't swap
job: private_sector
area: remote
don't swap
job: government
area: city
don't swap
job: business_owner
area: remote
don't swap
job: city
area: parental_leave
swap
job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: parental_leave
area: remote
don't swap
job: government
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: government
area: remote
don't swap
job: parental_leave
area: city
don't swap
job: private_sector
area: city
don't swap
job: priva

don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: parental_leave
area: city
don't swap
job: city
area: private_sector
swap
job: remote
area: private_sector
swap
job: private_sector
area: remote
don't swap
job: remote
area: government
swap
job: private_sector
area: remote
don't swap
job: government
area: city
don't swap
job: private_sector
area: remote
don't swap
job: government
area: city
don't swap
job: private_sector
area: city
don't swap
job: government
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: remote
area: parental_leave
swap
job: remote
area: private_sector
swap
job: private_sector
area: remote
don't swap
job: city
area: business_owner
swap
job: business_owner
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: priv

area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: city
area: parental_leave
swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: parental_leave
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: parental_leave
area: city
don't swap
job: remote
area: parental_leave
swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: government
area: remote
don't swap
job: private_sector
area: city
don't swap
job: parental_leave
area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: paren

job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: parental_leave
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: business_owner
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: remote
area: private_sector
swap
job: private_sector
area: remote
don't swap
job: government
area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: remote
area: private_sector
swap
job: business_owner
area: city
don't swap
job: government
area: city
don't swap
job: private_sector
area: city
don't swap
job: remote
area: business_owner
swap
job: government
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: government
area: city
don't swap
job: government
area: remote
don't swap
jo

area: remote
don't swap
job: private_sector
area: city
don't swap
job: city
area: private_sector
swap
job: parental_leave
area: remote
don't swap
job: private_sector
area: city
don't swap
job: parental_leave
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: parental_leave
area: city
don't swap
job: private_sector
area: remote
don't swap
job: government
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: city
area: government
swap
job: private_sector
area: city
don't swap
job: remote
area: private_sector
swap
job: government
area: city
don't swap
job: remote
area: government
swap
job: government
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: government
area: city
don't swap
job: private_sector
area: remote
don't swap
job: government
area: city
don't swap
job: remote
area: private_sector
swap
jo

area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: government
area: remote
don't swap
job: city
area: private_sector
swap
job: remote
area: parental_leave
swap
job: government
area: remote
don't swap
job: business_owner
area: remote
don't swap
job: city
area: private_sector
swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: city
area: parental_leave
swap
job: private_sector
area: remote
don't swap
job: business_owner
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: parental_leave
area: remote
don't swap
job: business_owner
area: remote
don't swap
job: city
area: private_sector
swap
job: private_sector
area: city
don't swap
job: business_owner
area: remote
don't swap
job: government
area: remote
don't swap
job: government
area: city
don't swap
job: business_owner
area: city
don't swap
job: private_sector
area: city

don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: government
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: business_owner
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: remote
area: private_sector
swap
job: parental_leave
area: city
don't swap
job: government
area: remote
don't swap
job: parental_leave
area: city
don't swap
job: private_sector
area: city
don't swap
job: city
area: parental_leave
swap
job: city
area: private_sector
swap
job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: government
area: city
don't swap
job: city
area: parental_leave
swap
job: private_sector
area: remote
don't swap
job: business_owner
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: government
area: city
don't swap
job: parental_leave
area: city
don't sw

area: city
don't swap
job: parental_leave
area: remote
don't swap
job: private_sector
area: city
don't swap
job: city
area: business_owner
swap
job: parental_leave
area: remote
don't swap
job: parental_leave
area: city
don't swap
job: government
area: remote
don't swap
job: business_owner
area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: parental_leave
area: remote
don't swap
job: parental_leave
area: city
don't swap
job: remote
area: private_sector
swap
job: private_sector
area: city
don't swap
job: remote
area: private_sector
swap
job: remote
area: business_owner
swap
job: private_sector
area: remote
don't swap
job: remote
area: private_sector
swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: parental_leav

job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: parental_leave
area: city
don't swap
job: private_sector
area: remote
don't swap
job: business_owner
area: remote
don't swap
job: private_sector
area: city
don't swap
job: business_owner
area: remote
don't swap
job: private_sector
area: city
don't swap
job: parental_leave
area: city
don't swap
job: remote
area: business_owner
swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: parental_leave
area: city
don't swap
job: private_sector
area: city
don't swap
job: parental_leave
area: remote
don't swap
job: government
area: remote
don't swap
job: city
area: private_sector
swap
job: parental_leave
area: city
don't swap
job: business_owner
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
d

area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: business_owner
area: city
don't swap
job: business_owner
area: city
don't swap
job: parental_leave
area: city
don't swap
job: government
area: remote
don't swap
job: government
area: city
don't swap
job: business_owner
area: remote
don't swap
job: government
area: city
don't swap
job: parental_leave
area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: remote
area: private_sector
swap
job: private_sector
area: remote
don't swap
job: business_owner
area: city
don't swap
job: remote
area: government
swap
job: private_sector
area: city
don't swap
job: government
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: government
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: re

area: remote
don't swap
job: parental_leave
area: city
don't swap
job: parental_leave
area: remote
don't swap
job: city
area: business_owner
swap
job: city
area: private_sector
swap
job: private_sector
area: remote
don't swap
job: remote
area: private_sector
swap
job: remote
area: business_owner
swap
job: parental_leave
area: remote
don't swap
job: government
area: city
don't swap
job: parental_leave
area: city
don't swap
job: city
area: private_sector
swap
job: government
area: remote
don't swap
job: private_sector
area: city
don't swap
job: government
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: parental_leave
area: remote
don't swap
job: parental_leave
area: city
don't swap
job: private_sector
area: remote
don't swap
job: business_owner
area: remote
don't swap
job: business_owner
area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: business_owner
area: city
d

don't swap
job: business_owner
area: city
don't swap
job: government
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: business_owner
area: city
don't swap
job: parental_leave
area: city
don't swap
job: parental_leave
area: city
don't swap
job: remote
area: private_sector
swap
job: government
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: parental_leave
area: city
don't swap
job: private_sector
area: remote
don't swap
job: remote
area: parental_leave
swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: business_owner
area: city
don't swap
job: remote
area: private_sector
swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city


area: private_sector
swap
job: private_sector
area: city
don't swap
job: remote
area: private_sector
swap
job: private_sector
area: remote
don't swap
job: private_sector
area: city
don't swap
job: city
area: private_sector
swap
job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: city
area: private_sector
swap
job: government
area: city
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: city
area: parental_leave
swap
job: private_sector
area: remote
don't swap
job: business_owner
area: city
don't swap
job: private_sector
area: city
don't swap
job: parental_leave
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: city
area: business_owner
swap
job: remote
area: parental_leave
swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: government
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remo

area: remote
don't swap
job: government
area: remote
don't swap
job: private_sector
area: city
don't swap
job: government
area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: government
area: city
don't swap
job: parental_leave
area: city
don't swap
job: city
area: private_sector
swap
job: parental_leave
area: remote
don't swap
job: business_owner
area: remote
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: government
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: government
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: government
area: remote
don't swap
job: business_owner
area: city
don't swap
job: parental_leave
area: city
don't swap
job: city
area: business_owner
swap
job: unemployed
area: city
don't swap
job: parental_leave
area: city
don't swap
job: private_sector
area: 

area: city
don't swap
job: government
area: city
don't swap
job: business_owner
area: city
don't swap
job: parental_leave
area: remote
don't swap
job: private_sector
area: city
don't swap
job: remote
area: private_sector
swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: remote
don't swap
job: business_owner
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: private_sector
area: city
don't swap
job: parental_leave
area: remote
don't swap
job: city
area: private_sector
swap
job: remote
area: private_sector
swap
job: city
area: private_sector
swap
job: government
area: remote
don't swap
job: private_sector
area: remote
don't swap
job: business_owner
area: city
don't swap
job: city
area: private_sector
swap
job: parental_leave
area: remote
don't swap
job: parental_leave
area: city
don

In [23]:
print(stk["job_status_corr"].value_counts())
print(stk["living_area_corr"].value_counts())

private_sector    5015
business_owner    1321
parental_leave    1255
government        1073
unemployed          34
Name: job_status_corr, dtype: int64
city      4372
remote    4326
Name: living_area_corr, dtype: int64


### Sex and Age

<div class="alert alert-block alert-info">
Sex and Age variable is a little messy. Let's clean this up.
</div>

In [24]:
stk["sex and age"].unique()

array(['F, 36', 'F, 40', 'M, 59', '33, F', 'F, 22', 'M, 60', 'F, 83',
       'M, 47', 'F, 82', 'F, 49', 'M, 58', 'M, 82', 'M, 4.88', 'F, 38',
       'M, 50', 'F, 84', 'F, 68', 'M, 40', 'M, 52', 'M, 62', 'M, 35',
       '32, F', 'M, 14', 'F, 35', 'F, 43', 'M, 43', 'M, 12', 'F, 55',
       'F, 33', 'F, 30', '36, M', 'F, 56', 'F, 78', 'M, 69', 'M, 48',
       'F, 64', 'F, 16', 'F, 63', 'F, 58', 'M, 26', 'F, 29', 'F, 61',
       'M, 79', 'M, 20', 'F, 5', 'F, 42', 'F, 72', 'F, 48', 'M, 73',
       'M, 21', 'M, 55', 'M, 34', 'F, 27', 'F, 41', 'F, 52', 'M, 31',
       'F, 47', 'F, 18', 'M, 85', 'M, 78', 'M, 3.72', 'M, 51', '50, M',
       'F, 4.08', 'F, 71', '63, F', 'F, 76', 'M, 45', 'M, 67', 'F, 60',
       'M, 19', 'F, 45', 'M, 17', 'F, 46', 'M, 7', 'F, 62', 'M, 65',
       'F, 66', 'F, 81', 'F, 50', '65, F', 'M, 54', 'F, 39', 'F, 3.32',
       'M, 6', 'F, 53', 'F, 28', 'F, 37', 'F, 57', 'F, 24', 'F, 65',
       'F, 26', 'F, 20', 'M, 8', 'F, 32', 'F, 80', '51, F', 'F, 4',
       'F, 59', '

In [25]:
stk["sex_and_age"] = stk["sex and age"].apply(split_row, args=",")
stk["sex"], stk["age"] = zip(*stk['sex_and_age'].map(split_tup))
stk.head()

Unnamed: 0,id,sex and age,high_BP,heart_condition_detected_2017,married,job_status and living_area,average_blood_sugar,BMI,smoker_status,TreatmentA,...,TreatmentC,TreatmentD,job_status_living_area,job_status,living_area,job_status_corr,living_area_corr,sex_and_age,sex,age
0,33327,"F, 36",0.0,0.0,1.0,private_sector?Remote,76.05,33.4,active_smoker,,...,,,"(private_sector, Remote)",private_sector,remote,private_sector,remote,"(F, 36)",F,36
1,839,"F, 40",0.0,0.0,1.0,City?government,73.77,30.1,non-smoker,,...,,,"(City, government)",city,government,government,city,"(F, 40)",F,40
2,11127,"M, 59",0.0,0.0,1.0,business_owner?Remote,62.95,30.8,,,...,,,"(business_owner, Remote)",business_owner,remote,business_owner,remote,"(M, 59)",M,59
3,20768,"33, F",0.0,0.0,1.0,private_sector?City,68.81,36.5,quit,,...,,,"(private_sector, City)",private_sector,city,private_sector,city,"(33, F)",33,F
4,37774,"F, 22",0.0,0.0,0.0,private_sector?City,122.89,30.8,active_smoker,,...,,,"(private_sector, City)",private_sector,city,private_sector,city,"(F, 22)",F,22


### Sex Variable

Now let's deal with the `sex` variable separately

In [26]:
stk["sex"] = stk["sex"].apply(lambda x: x.lower())

print(stk["sex"].value_counts())
print(stk["sex"].unique())

stk.loc[stk["sex"] == "male", "sex"] = "m"
stk.loc[stk["sex"] == "mmale", "sex"] = "m"
stk.loc[stk["sex"] == "mm", "sex"] = "m"
stk.loc[stk["sex"] == "female", "sex"] = "f"
stk.loc[stk["sex"] == "femalle", "sex"] = "f"

f         4808
m         3275
39          16
64          16
5           15
63          14
67          14
55          14
52          12
24          12
44          11
43          11
22          11
42          11
57          10
54          10
53          10
81          10
65          10
23          10
68          10
27           9
61           9
32           9
59           9
51           9
46           9
17           9
n            9
69           9
          ... 
70           4
9            4
other        4
49           4
29           4
7            4
18           3
26           3
74           3
28           3
8            3
20           3
male         3
66           3
79           2
3.64         2
3.8          2
73           2
13           2
4.72         1
4.48         1
4.4          1
female       1
3.56         1
3.08         1
4.24         1
3.88         1
4.56         1
4            1
3.72         1
Name: sex, Length: 99, dtype: int64
['f' 'm' '33' '32' '36' '50' '63' '65' '51' '35' 

In [27]:
stk[stk["sex"] == "n"][["sex and age", "sex", "age"]].head()

Unnamed: 0,sex and age,sex,age
1663,,n,a
2253,,n,a
2257,,n,a
2503,,n,a
2943,,n,a


In [28]:
sex_not_keep = ["", "other", "n"] 

# We can't remove the unwanted numbers out until we swap them with age
# So we instead filter those that we don't want to keep. 

In [29]:
stk = stk[~stk["sex"].isin(sex_not_keep)]
stk.shape

(8685, 21)

### Age Variable

In [30]:
stk["age"] = stk["age"].apply(lambda x: x.lower())
stk["age"] = stk["age"].apply(lambda x: x.strip())

print(stk["age"].value_counts())
print(stk["age"].unique())

f              370
m              228
54             150
55             145
50             143
48             139
81             139
53             135
47             134
59             131
62             130
63             129
49             129
43             126
51             125
52             121
58             120
44             120
56             120
60             119
61             116
45             115
64             113
57             113
41             113
42             111
66             110
8              108
34             107
36             104
              ... 
77              47
3.56            15
3.32            13
3.64            13
3.72            12
3.24            12
4.08            12
3.8             12
4.56            12
4.4             11
3.88            11
3.48            11
4.88            10
4.32            10
4.24             9
4.64             9
4.72             9
4.16             8
4.8              8
4                7
4.48             5
3.4         

In [31]:
stk.loc[stk["age"] == 'sixteen', "age"] = '16'
stk.loc[stk["age"] == 'eleven', "age"] = '11'
stk.loc[stk["age"] == 'fifty', "age"] = '50'
stk.loc[stk["age"] == 'seventy seven', "age"] = '77'
stk.loc[stk["age"] == 'sixteen', "age"] = '16'
stk.loc[stk["age"] == 'forty', "age"] = '40'
stk.loc[stk["age"] == 'sixx', "age"] = '6'
stk.loc[stk["age"] == 'eight', "age"] = '8'
stk.loc[stk["age"] == 'seven', "age"] = '7'
stk.loc[stk["age"] == 'eighty fivee', "age"] = '85'
stk.loc[stk["age"] == 'six', "age"] = '6'
stk.loc[stk["age"] == 'thirtythree', "age"] = '33'
stk.loc[stk["age"] == 'fifteen', "age"] = '15'
stk.loc[stk["age"] == 'sixty-seven', "age"] = '67'
stk.loc[stk["age"] == 'eight', "age"] = '8'
stk.loc[stk["age"] == '50', "age"] = '50'
stk.loc[stk["age"] == 'sixty three', "age"] = '63'
stk.loc[stk["age"] == 'twenty two', "age"] = '22'

In [32]:
stk.columns

Index(['id', 'sex and age', 'high_BP', 'heart_condition_detected_2017',
       'married', 'job_status and living_area', 'average_blood_sugar', 'BMI',
       'smoker_status', 'TreatmentA', 'TreatmentB', 'TreatmentC', 'TreatmentD',
       'job_status_living_area', 'job_status', 'living_area',
       'job_status_corr', 'living_area_corr', 'sex_and_age', 'sex', 'age'],
      dtype='object')

In [33]:
print(stk["age"].value_counts())
print(stk["age"].unique())

f       370
m       228
54      150
55      145
50      144
81      139
48      139
53      135
47      134
59      131
63      130
62      130
49      129
43      126
51      125
52      121
56      120
44      120
58      120
60      119
61      116
45      115
41      113
57      113
64      113
42      111
66      110
8       109
34      107
67      104
       ... 
79       61
76       57
10       56
7        56
14       54
9        54
77       47
3.56     15
3.32     13
3.64     13
3.72     12
3.24     12
4.08     12
4.56     12
3.8      12
3.48     11
4.4      11
3.88     11
4.32     10
4.88     10
4.64      9
4.72      9
4.24      9
4.16      8
4.8       8
4         7
4.48      5
3.4       5
3.16      4
3.08      3
Name: age, Length: 106, dtype: int64
['36' '40' '59' 'f' '22' '60' '83' '47' '82' '49' '58' '4.88' '38' '50'
 '84' '68' '52' '62' '35' '14' '43' '12' '55' '33' '30' 'm' '56' '78' '69'
 '48' '64' '16' '63' '26' '29' '61' '79' '20' '5' '42' '72' '73' '21' '34'
 '27' '41

In [34]:
age_not_keep = ['', 'other', 'a']

In [35]:
stk = stk[~stk["age"].isin(age_not_keep)]
stk.shape

(8685, 21)

In [36]:
sex_correct = ["f", "m"]

stk["sex_corr"], stk["age_corr"] = zip(*stk.apply(lambda x:\
            swap_sex_age(x.sex, x.age, sex_correct), axis=1))

sex: f
age: 36
don't swap
sex: f
age: 40
don't swap
sex: m
age: 59
don't swap
sex: 33
age: f
swap
sex: f
age: 22
don't swap
sex: m
age: 60
don't swap
sex: f
age: 83
don't swap
sex: m
age: 47
don't swap
sex: f
age: 82
don't swap
sex: f
age: 49
don't swap
sex: m
age: 58
don't swap
sex: m
age: 82
don't swap
sex: m
age: 4.88
don't swap
sex: m
age: 58
don't swap
sex: f
age: 38
don't swap
sex: m
age: 50
don't swap
sex: f
age: 84
don't swap
sex: f
age: 68
don't swap
sex: m
age: 40
don't swap
sex: m
age: 52
don't swap
sex: m
age: 62
don't swap
sex: f
age: 68
don't swap
sex: m
age: 35
don't swap
sex: 32
age: f
swap
sex: m
age: 14
don't swap
sex: m
age: 40
don't swap
sex: f
age: 35
don't swap
sex: f
age: 43
don't swap
sex: m
age: 47
don't swap
sex: m
age: 43
don't swap
sex: m
age: 12
don't swap
sex: f
age: 55
don't swap
sex: f
age: 33
don't swap
sex: f
age: 30
don't swap
sex: 36
age: m
swap
sex: f
age: 56
don't swap
sex: f
age: 78
don't swap
sex: m
age: 69
don't swap
sex: m
age: 48
don't swap
se

age: 35
don't swap
sex: m
age: 52
don't swap
sex: m
age: 37
don't swap
sex: m
age: 57
don't swap
sex: f
age: 39
don't swap
sex: f
age: 55
don't swap
sex: f
age: 60
don't swap
sex: m
age: 58
don't swap
sex: 42
age: m
swap
sex: f
age: 69
don't swap
sex: m
age: 16
don't swap
sex: f
age: 73
don't swap
sex: m
age: 7
don't swap
sex: f
age: 55
don't swap
sex: f
age: 71
don't swap
sex: f
age: 24
don't swap
sex: f
age: 67
don't swap
sex: f
age: 48
don't swap
sex: 52
age: m
swap
sex: m
age: 47
don't swap
sex: f
age: 52
don't swap
sex: m
age: 84
don't swap
sex: f
age: 30
don't swap
sex: f
age: 38
don't swap
sex: m
age: 29
don't swap
sex: 57
age: f
swap
sex: m
age: 74
don't swap
sex: 41
age: f
swap
sex: m
age: 66
don't swap
sex: f
age: 27
don't swap
sex: f
age: 16
don't swap
sex: m
age: 3.72
don't swap
sex: f
age: 28
don't swap
sex: f
age: 62
don't swap
sex: f
age: 81
don't swap
sex: f
age: 65
don't swap
sex: f
age: 56
don't swap
sex: 52
age: f
swap
sex: m
age: 27
don't swap
sex: f
age: 41
don't s

sex: m
age: 62
don't swap
sex: f
age: 6
don't swap
sex: m
age: 81
don't swap
sex: f
age: 36
don't swap
sex: f
age: 52
don't swap
sex: f
age: 50
don't swap
sex: 85
age: f
swap
sex: m
age: 27
don't swap
sex: f
age: 37
don't swap
sex: m
age: 4.4
don't swap
sex: 69
age: f
swap
sex: m
age: 84
don't swap
sex: m
age: 18
don't swap
sex: f
age: 12
don't swap
sex: f
age: 11
don't swap
sex: m
age: 61
don't swap
sex: f
age: 36
don't swap
sex: 36
age: f
swap
sex: f
age: 27
don't swap
sex: f
age: 70
don't swap
sex: m
age: 78
don't swap
sex: f
age: 70
don't swap
sex: m
age: 28
don't swap
sex: m
age: 31
don't swap
sex: f
age: 11
don't swap
sex: m
age: 43
don't swap
sex: m
age: 34
don't swap
sex: m
age: 56
don't swap
sex: f
age: 50
don't swap
sex: f
age: 63
don't swap
sex: f
age: 68
don't swap
sex: m
age: 77
don't swap
sex: f
age: 44
don't swap
sex: m
age: 37
don't swap
sex: m
age: 8
don't swap
sex: f
age: 62
don't swap
sex: m
age: 83
don't swap
sex: f
age: 74
don't swap
sex: m
age: 12
don't swap
sex: 

age: m
swap
sex: f
age: 18
don't swap
sex: m
age: 84
don't swap
sex: 3.64
age: m
swap
sex: m
age: 18
don't swap
sex: m
age: 36
don't swap
sex: f
age: 82
don't swap
sex: m
age: 54
don't swap
sex: f
age: 54
don't swap
sex: m
age: 51
don't swap
sex: m
age: 9
don't swap
sex: m
age: 48
don't swap
sex: f
age: 77
don't swap
sex: f
age: 15
don't swap
sex: f
age: 18
don't swap
sex: f
age: 56
don't swap
sex: m
age: 10
don't swap
sex: f
age: 4.4
don't swap
sex: m
age: 73
don't swap
sex: f
age: 77
don't swap
sex: m
age: 54
don't swap
sex: m
age: 65
don't swap
sex: f
age: 72
don't swap
sex: f
age: 73
don't swap
sex: f
age: 62
don't swap
sex: f
age: 29
don't swap
sex: m
age: 60
don't swap
sex: m
age: 43
don't swap
sex: f
age: 14
don't swap
sex: f
age: 30
don't swap
sex: m
age: 64
don't swap
sex: f
age: 43
don't swap
sex: m
age: 13
don't swap
sex: m
age: 78
don't swap
sex: f
age: 49
don't swap
sex: f
age: 44
don't swap
sex: m
age: 35
don't swap
sex: m
age: 67
don't swap
sex: f
age: 61
don't swap
sex:

don't swap
sex: m
age: 28
don't swap
sex: m
age: 74
don't swap
sex: m
age: 19
don't swap
sex: m
age: 69
don't swap
sex: f
age: 32
don't swap
sex: 59
age: m
swap
sex: m
age: 58
don't swap
sex: m
age: 33
don't swap
sex: m
age: 39
don't swap
sex: m
age: 57
don't swap
sex: f
age: 36
don't swap
sex: f
age: 52
don't swap
sex: f
age: 37
don't swap
sex: m
age: 11
don't swap
sex: m
age: 71
don't swap
sex: m
age: 10
don't swap
sex: f
age: 30
don't swap
sex: f
age: 32
don't swap
sex: f
age: 71
don't swap
sex: f
age: 38
don't swap
sex: f
age: 38
don't swap
sex: f
age: 30
don't swap
sex: f
age: 84
don't swap
sex: m
age: 59
don't swap
sex: f
age: 45
don't swap
sex: f
age: 54
don't swap
sex: m
age: 4.4
don't swap
sex: m
age: 11
don't swap
sex: f
age: 51
don't swap
sex: m
age: 46
don't swap
sex: f
age: 72
don't swap
sex: m
age: 44
don't swap
sex: m
age: 5
don't swap
sex: f
age: 46
don't swap
sex: m
age: 21
don't swap
sex: f
age: 50
don't swap
sex: m
age: 3.88
don't swap
sex: f
age: 35
don't swap
sex: 

don't swap
sex: f
age: 4.64
don't swap
sex: f
age: 19
don't swap
sex: m
age: 53
don't swap
sex: m
age: 35
don't swap
sex: f
age: 52
don't swap
sex: f
age: 55
don't swap
sex: m
age: 66
don't swap
sex: m
age: 50
don't swap
sex: f
age: 31
don't swap
sex: 45
age: f
swap
sex: m
age: 80
don't swap
sex: f
age: 31
don't swap
sex: f
age: 54
don't swap
sex: f
age: 82
don't swap
sex: f
age: 26
don't swap
sex: m
age: 4.48
don't swap
sex: f
age: 14
don't swap
sex: m
age: 78
don't swap
sex: m
age: 31
don't swap
sex: m
age: 23
don't swap
sex: m
age: 74
don't swap
sex: f
age: 61
don't swap
sex: m
age: 53
don't swap
sex: f
age: 46
don't swap
sex: m
age: 82
don't swap
sex: f
age: 66
don't swap
sex: f
age: 53
don't swap
sex: f
age: 28
don't swap
sex: m
age: 68
don't swap
sex: 66
age: f
swap
sex: m
age: 24
don't swap
sex: m
age: 33
don't swap
sex: m
age: 5
don't swap
sex: f
age: 49
don't swap
sex: m
age: 23
don't swap
sex: f
age: 71
don't swap
sex: m
age: 22
don't swap
sex: f
age: 8
don't swap
sex: f
age:

age: 63
don't swap
sex: f
age: 54
don't swap
sex: f
age: 47
don't swap
sex: m
age: 47
don't swap
sex: m
age: 14
don't swap
sex: f
age: 34
don't swap
sex: m
age: 7
don't swap
sex: m
age: 19
don't swap
sex: m
age: 54
don't swap
sex: f
age: 51
don't swap
sex: f
age: 42
don't swap
sex: f
age: 27
don't swap
sex: f
age: 64
don't swap
sex: f
age: 78
don't swap
sex: f
age: 50
don't swap
sex: f
age: 71
don't swap
sex: m
age: 44
don't swap
sex: f
age: 20
don't swap
sex: f
age: 52
don't swap
sex: f
age: 13
don't swap
sex: m
age: 70
don't swap
sex: f
age: 61
don't swap
sex: f
age: 11
don't swap
sex: f
age: 3.16
don't swap
sex: f
age: 46
don't swap
sex: f
age: 4.24
don't swap
sex: m
age: 39
don't swap
sex: f
age: 16
don't swap
sex: f
age: 27
don't swap
sex: f
age: 38
don't swap
sex: m
age: 39
don't swap
sex: f
age: 59
don't swap
sex: f
age: 11
don't swap
sex: f
age: 26
don't swap
sex: f
age: 38
don't swap
sex: 65
age: m
swap
sex: m
age: 18
don't swap
sex: m
age: 4.88
don't swap
sex: f
age: 52
don't

age: 20
don't swap
sex: f
age: 51
don't swap
sex: 24
age: f
swap
sex: 24
age: f
swap
sex: f
age: 14
don't swap
sex: f
age: 60
don't swap
sex: f
age: 35
don't swap
sex: f
age: 85
don't swap
sex: m
age: 51
don't swap
sex: f
age: 42
don't swap
sex: m
age: 63
don't swap
sex: f
age: 25
don't swap
sex: f
age: 8
don't swap
sex: m
age: 61
don't swap
sex: m
age: 36
don't swap
sex: m
age: 20
don't swap
sex: f
age: 34
don't swap
sex: m
age: 18
don't swap
sex: m
age: 6
don't swap
sex: m
age: 14
don't swap
sex: m
age: 44
don't swap
sex: f
age: 33
don't swap
sex: m
age: 22
don't swap
sex: m
age: 85
don't swap
sex: f
age: 39
don't swap
sex: m
age: 57
don't swap
sex: f
age: 36
don't swap
sex: m
age: 16
don't swap
sex: m
age: 11
don't swap
sex: m
age: 18
don't swap
sex: f
age: 12
don't swap
sex: m
age: 56
don't swap
sex: f
age: 44
don't swap
sex: m
age: 48
don't swap
sex: m
age: 71
don't swap
sex: f
age: 4
don't swap
sex: f
age: 12
don't swap
sex: m
age: 47
don't swap
sex: f
age: 48
don't swap
sex: f
a

age: 37
don't swap
sex: f
age: 10
don't swap
sex: f
age: 21
don't swap
sex: f
age: 12
don't swap
sex: f
age: 63
don't swap
sex: f
age: 35
don't swap
sex: f
age: 80
don't swap
sex: 64
age: f
swap
sex: f
age: 60
don't swap
sex: f
age: 42
don't swap
sex: m
age: 49
don't swap
sex: f
age: 57
don't swap
sex: f
age: 76
don't swap
sex: f
age: 32
don't swap
sex: m
age: 15
don't swap
sex: f
age: 74
don't swap
sex: f
age: 40
don't swap
sex: f
age: 29
don't swap
sex: f
age: 69
don't swap
sex: m
age: 40
don't swap
sex: f
age: 22
don't swap
sex: f
age: 31
don't swap
sex: f
age: 47
don't swap
sex: 57
age: f
swap
sex: f
age: 29
don't swap
sex: f
age: 21
don't swap
sex: m
age: 3.8
don't swap
sex: m
age: 81
don't swap
sex: f
age: 33
don't swap
sex: f
age: 26
don't swap
sex: f
age: 37
don't swap
sex: m
age: 59
don't swap
sex: f
age: 67
don't swap
sex: m
age: 46
don't swap
sex: f
age: 58
don't swap
sex: m
age: 39
don't swap
sex: m
age: 37
don't swap
sex: m
age: 55
don't swap
sex: f
age: 7
don't swap
sex: 

age: 54
don't swap
sex: f
age: 41
don't swap
sex: m
age: 43
don't swap
sex: 27
age: m
swap
sex: f
age: 34
don't swap
sex: f
age: 18
don't swap
sex: 81
age: f
swap
sex: m
age: 60
don't swap
sex: f
age: 52
don't swap
sex: f
age: 31
don't swap
sex: f
age: 54
don't swap
sex: f
age: 5
don't swap
sex: m
age: 54
don't swap
sex: m
age: 68
don't swap
sex: f
age: 56
don't swap
sex: f
age: 50
don't swap
sex: m
age: 81
don't swap
sex: f
age: 40
don't swap
sex: m
age: 10
don't swap
sex: f
age: 25
don't swap
sex: m
age: 47
don't swap
sex: m
age: 24
don't swap
sex: m
age: 78
don't swap
sex: f
age: 25
don't swap
sex: m
age: 52
don't swap
sex: m
age: 72
don't swap
sex: f
age: 61
don't swap
sex: m
age: 61
don't swap
sex: f
age: 27
don't swap
sex: f
age: 12
don't swap
sex: 5
age: m
swap
sex: m
age: 8
don't swap
sex: f
age: 21
don't swap
sex: f
age: 37
don't swap
sex: f
age: 57
don't swap
sex: f
age: 60
don't swap
sex: f
age: 48
don't swap
sex: f
age: 25
don't swap
sex: f
age: 11
don't swap
sex: f
age: 23

age: 19
don't swap
sex: m
age: 43
don't swap
sex: f
age: 45
don't swap
sex: f
age: 49
don't swap
sex: m
age: 65
don't swap
sex: f
age: 76
don't swap
sex: f
age: 55
don't swap
sex: f
age: 48
don't swap
sex: m
age: 85
don't swap
sex: f
age: 67
don't swap
sex: f
age: 54
don't swap
sex: m
age: 61
don't swap
sex: f
age: 55
don't swap
sex: m
age: 40
don't swap
sex: f
age: 9
don't swap
sex: m
age: 67
don't swap
sex: f
age: 51
don't swap
sex: f
age: 81
don't swap
sex: f
age: 82
don't swap
sex: f
age: 64
don't swap
sex: f
age: 58
don't swap
sex: m
age: 60
don't swap
sex: m
age: 54
don't swap
sex: f
age: 85
don't swap
sex: f
age: 23
don't swap
sex: f
age: 65
don't swap
sex: f
age: 20
don't swap
sex: f
age: 75
don't swap
sex: m
age: 64
don't swap
sex: f
age: 57
don't swap
sex: f
age: 26
don't swap
sex: m
age: 53
don't swap
sex: m
age: 63
don't swap
sex: f
age: 57
don't swap
sex: f
age: 72
don't swap
sex: m
age: 39
don't swap
sex: f
age: 66
don't swap
sex: f
age: 4.24
don't swap
sex: f
age: 27
don

don't swap
sex: f
age: 7
don't swap
sex: f
age: 33
don't swap
sex: m
age: 60
don't swap
sex: m
age: 27
don't swap
sex: m
age: 63
don't swap
sex: f
age: 74
don't swap
sex: f
age: 33
don't swap
sex: m
age: 27
don't swap
sex: m
age: 38
don't swap
sex: f
age: 40
don't swap
sex: m
age: 60
don't swap
sex: f
age: 24
don't swap
sex: m
age: 51
don't swap
sex: f
age: 50
don't swap
sex: m
age: 59
don't swap
sex: f
age: 8
don't swap
sex: m
age: 16
don't swap
sex: m
age: 82
don't swap
sex: f
age: 51
don't swap
sex: f
age: 52
don't swap
sex: f
age: 46
don't swap
sex: f
age: 12
don't swap
sex: m
age: 8
don't swap
sex: f
age: 27
don't swap
sex: f
age: 56
don't swap
sex: f
age: 61
don't swap
sex: f
age: 59
don't swap
sex: f
age: 24
don't swap
sex: f
age: 79
don't swap
sex: m
age: 43
don't swap
sex: m
age: 81
don't swap
sex: m
age: 9
don't swap
sex: f
age: 25
don't swap
sex: m
age: 66
don't swap
sex: f
age: 35
don't swap
sex: f
age: 29
don't swap
sex: f
age: 58
don't swap
sex: f
age: 36
don't swap
sex: 

age: 68
don't swap
sex: m
age: 31
don't swap
sex: m
age: 37
don't swap
sex: m
age: 66
don't swap
sex: f
age: 24
don't swap
sex: m
age: 21
don't swap
sex: f
age: 71
don't swap
sex: 17
age: f
swap
sex: m
age: 55
don't swap
sex: m
age: 50
don't swap
sex: f
age: 49
don't swap
sex: f
age: 67
don't swap
sex: m
age: 69
don't swap
sex: f
age: 61
don't swap
sex: m
age: 73
don't swap
sex: 17
age: f
swap
sex: m
age: 10
don't swap
sex: f
age: 43
don't swap
sex: f
age: 57
don't swap
sex: f
age: 61
don't swap
sex: f
age: 27
don't swap
sex: 24
age: f
swap
sex: m
age: 80
don't swap
sex: m
age: 58
don't swap
sex: f
age: 40
don't swap
sex: m
age: 63
don't swap
sex: f
age: 24
don't swap
sex: f
age: 82
don't swap
sex: m
age: 17
don't swap
sex: f
age: 7
don't swap
sex: f
age: 55
don't swap
sex: f
age: 9
don't swap
sex: 18
age: f
swap
sex: f
age: 71
don't swap
sex: f
age: 52
don't swap
sex: f
age: 58
don't swap
sex: m
age: 39
don't swap
sex: f
age: 32
don't swap
sex: m
age: 61
don't swap
sex: m
age: 47
don'

age: 84
don't swap
sex: f
age: 85
don't swap
sex: m
age: 26
don't swap
sex: f
age: 81
don't swap
sex: m
age: 18
don't swap
sex: m
age: 40
don't swap
sex: m
age: 5
don't swap
sex: f
age: 52
don't swap
sex: f
age: 44
don't swap
sex: f
age: 40
don't swap
sex: f
age: 80
don't swap
sex: f
age: 48
don't swap
sex: f
age: 64
don't swap
sex: f
age: 9
don't swap
sex: m
age: 71
don't swap
sex: m
age: 50
don't swap
sex: m
age: 11
don't swap
sex: f
age: 41
don't swap
sex: f
age: 73
don't swap
sex: m
age: 4.8
don't swap
sex: f
age: 48
don't swap
sex: f
age: 53
don't swap
sex: f
age: 60
don't swap
sex: f
age: 32
don't swap
sex: f
age: 70
don't swap
sex: f
age: 29
don't swap
sex: f
age: 35
don't swap
sex: m
age: 81
don't swap
sex: f
age: 33
don't swap
sex: f
age: 65
don't swap
sex: f
age: 68
don't swap
sex: f
age: 84
don't swap
sex: f
age: 29
don't swap
sex: m
age: 49
don't swap
sex: 55
age: f
swap
sex: m
age: 25
don't swap
sex: f
age: 61
don't swap
sex: m
age: 15
don't swap
sex: m
age: 16
don't swap


sex: m
age: 73
don't swap
sex: f
age: 18
don't swap
sex: m
age: 58
don't swap
sex: m
age: 42
don't swap
sex: f
age: 8
don't swap
sex: f
age: 50
don't swap
sex: m
age: 81
don't swap
sex: m
age: 77
don't swap
sex: f
age: 51
don't swap
sex: m
age: 43
don't swap
sex: f
age: 49
don't swap
sex: 37
age: m
swap
sex: f
age: 31
don't swap
sex: 4.48
age: m
swap
sex: f
age: 27
don't swap
sex: m
age: 10
don't swap
sex: f
age: 48
don't swap
sex: f
age: 65
don't swap
sex: m
age: 72
don't swap
sex: m
age: 53
don't swap
sex: f
age: 51
don't swap
sex: m
age: 69
don't swap
sex: f
age: 49
don't swap
sex: f
age: 54
don't swap
sex: f
age: 50
don't swap
sex: f
age: 70
don't swap
sex: f
age: 46
don't swap
sex: f
age: 83
don't swap
sex: m
age: 26
don't swap
sex: f
age: 64
don't swap
sex: f
age: 44
don't swap
sex: f
age: 8
don't swap
sex: f
age: 22
don't swap
sex: f
age: 5
don't swap
sex: m
age: 85
don't swap
sex: m
age: 30
don't swap
sex: f
age: 67
don't swap
sex: m
age: 43
don't swap
sex: f
age: 35
don't swap

age: 58
don't swap
sex: f
age: 44
don't swap
sex: f
age: 59
don't swap
sex: f
age: 70
don't swap
sex: 11
age: f
swap
sex: f
age: 47
don't swap
sex: m
age: 8
don't swap
sex: f
age: 31
don't swap
sex: f
age: 60
don't swap
sex: m
age: 63
don't swap
sex: f
age: 65
don't swap
sex: m
age: 47
don't swap
sex: m
age: 65
don't swap
sex: f
age: 80
don't swap
sex: f
age: 10
don't swap
sex: m
age: 25
don't swap
sex: f
age: 13
don't swap
sex: m
age: 22
don't swap
sex: m
age: 17
don't swap
sex: m
age: 59
don't swap
sex: f
age: 44
don't swap
sex: f
age: 18
don't swap
sex: f
age: 45
don't swap
sex: f
age: 45
don't swap
sex: f
age: 22
don't swap
sex: m
age: 65
don't swap
sex: m
age: 83
don't swap
sex: f
age: 55
don't swap
sex: f
age: 43
don't swap
sex: 39
age: f
swap
sex: f
age: 5
don't swap
sex: m
age: 56
don't swap
sex: f
age: 30
don't swap
sex: f
age: 34
don't swap
sex: m
age: 11
don't swap
sex: f
age: 37
don't swap
sex: f
age: 23
don't swap
sex: f
age: 14
don't swap
sex: m
age: 11
don't swap
sex: m


In [37]:
stk[~stk["sex_corr"].isin(['f', 'm'])]

Unnamed: 0,id,sex and age,high_BP,heart_condition_detected_2017,married,job_status and living_area,average_blood_sugar,BMI,smoker_status,TreatmentA,...,job_status_living_area,job_status,living_area,job_status_corr,living_area_corr,sex_and_age,sex,age,sex_corr,age_corr


In [38]:
print(stk["sex_corr"].value_counts())
print(stk["age_corr"].unique())

f    5179
m    3506
Name: sex_corr, dtype: int64
['36' '40' '59' '33' '22' '60' '83' '47' '82' '49' '58' '4.88' '38' '50'
 '84' '68' '52' '62' '35' '32' '14' '43' '12' '55' '30' '56' '78' '69'
 '48' '64' '16' '63' '26' '29' '61' '79' '20' '5' '42' '72' '73' '21' '34'
 '27' '41' '31' '18' '85' '3.72' '51' '4.08' '71' '76' '45' '67' '19' '17'
 '46' '7' '65' '66' '81' '54' '39' '3.32' '6' '53' '28' '37' '57' '24' '8'
 '80' '4' '13' '44' '4.72' '25' '3.88' '74' '77' '4.64' '15' '9' '4.24'
 '23' '70' '11' '3.56' '75' '4.32' '3.48' '3.64' '4.4' '4.16' '10' '3.8'
 '4.48' '3.08' '4.56' '3.16' '4.8' '3.24' '3.4']


We're not done yet. We need to convert these ages to floats. The remaining variables are to be cleaned as well, although I suspect that won't be as difficult as the first 2. 

In [39]:
stk["age_corr"] = stk["age_corr"].apply(lambda x: int(float(x)))
print(stk["age_corr"].unique())

[36 40 59 33 22 60 83 47 82 49 58  4 38 50 84 68 52 62 35 32 14 43 12 55
 30 56 78 69 48 64 16 63 26 29 61 79 20  5 42 72 73 21 34 27 41 31 18 85
  3 51 71 76 45 67 19 17 46  7 65 66 81 54 39  6 53 28 37 57 24  8 80 13
 44 25 74 77 15  9 23 70 11 75 10]


In [40]:
print(stk.shape)
stk.head()

(8685, 23)


Unnamed: 0,id,sex and age,high_BP,heart_condition_detected_2017,married,job_status and living_area,average_blood_sugar,BMI,smoker_status,TreatmentA,...,job_status_living_area,job_status,living_area,job_status_corr,living_area_corr,sex_and_age,sex,age,sex_corr,age_corr
0,33327,"F, 36",0.0,0.0,1.0,private_sector?Remote,76.05,33.4,active_smoker,,...,"(private_sector, Remote)",private_sector,remote,private_sector,remote,"(F, 36)",f,36,f,36
1,839,"F, 40",0.0,0.0,1.0,City?government,73.77,30.1,non-smoker,,...,"(City, government)",city,government,government,city,"(F, 40)",f,40,f,40
2,11127,"M, 59",0.0,0.0,1.0,business_owner?Remote,62.95,30.8,,,...,"(business_owner, Remote)",business_owner,remote,business_owner,remote,"(M, 59)",m,59,m,59
3,20768,"33, F",0.0,0.0,1.0,private_sector?City,68.81,36.5,quit,,...,"(private_sector, City)",private_sector,city,private_sector,city,"(33, F)",33,f,f,33
4,37774,"F, 22",0.0,0.0,0.0,private_sector?City,122.89,30.8,active_smoker,,...,"(private_sector, City)",private_sector,city,private_sector,city,"(F, 22)",f,22,f,22


### The remaining variables

<div class="alert alert-block alert-info">
High_BP
</div>

In [41]:
print(stk["high_BP"].unique())
print("---")
print(stk["high_BP"].value_counts())
print("---")
print("NaN values: {}".format(stk[stk["high_BP"].isnull()].shape[0]))

stk = stk[stk["high_BP"].isin(['0', '1'])]
print(stk.shape)

[ 0.  1. nan]
---
0.0    7871
1.0     808
Name: high_BP, dtype: int64
---
NaN values: 6
(8679, 23)


<div class="alert alert-block alert-info">
heart_condition_detected_2017
</div>

In [42]:
print(stk["heart_condition_detected_2017"].unique())
print("---")
print(stk["heart_condition_detected_2017"].value_counts())
print("---")
print("NaN values: {}".format(stk[stk["heart_condition_detected_2017"].isnull()].shape[0]))

stk = stk[stk["heart_condition_detected_2017"].isin(['0', '1'])]
print(stk.shape)

[ 0.  1. nan]
---
0.0    8265
1.0     412
Name: heart_condition_detected_2017, dtype: int64
---
NaN values: 2
(8677, 23)


In [43]:
print(stk["married"].unique())
print("---")
print(stk["married"].value_counts())
print("---")
print("NaN values: {}".format(stk[stk["married"].isnull()].shape[0]))

stk = stk[stk["married"].isin(['0', '1'])]
print(stk.shape)

[ 1.  0. nan]
---
1.0    5617
0.0    3059
Name: married, dtype: int64
---
NaN values: 1
(8676, 23)


<div class="alert alert-block alert-info">
smoker_status
</div>

In [44]:
print(stk["smoker_status"].unique())
print(stk["smoker_status"].value_counts())
print("NaN values: {}".format(stk[stk["smoker_status"].isnull()].shape[0]))

smoker_status_keep = ["non-smoker", "quit", "active_smoker"]
stk = stk[stk["smoker_status"].isin(['non-smoker', 'quit', 'active_smoker'])]
stk.dropna(axis=0, how="all", subset=["smoker_status"], inplace=True)
print(stk.shape)

['active_smoker' 'non-smoker' nan 'quit' '?' '>']
non-smoker       3261
quit             1476
active_smoker    1246
?                   3
>                   1
Name: smoker_status, dtype: int64
NaN values: 2689
(5983, 23)


<div class="alert alert-block alert-warning">
Approximately one-third of the column is null values. However, the imbalance of classes
remain similar even after dropping these so I think it will be fine.

I am assuming that being a smoker will have some correlation with stroke risk.
</div>

In [45]:
print(stk["average_blood_sugar"].unique()[:5])
print(stk["average_blood_sugar"].value_counts().iloc[:5])
print("NaN values: {}".format(stk[stk["average_blood_sugar"].isnull()].shape[0]))

stk.dropna(axis=0, how="all", subset=["average_blood_sugar"], inplace=True)

[ 76.05  73.77  68.81 122.89 116.97]
77.13     7
106.82    6
69.29     6
84.77     6
85.98     6
Name: average_blood_sugar, dtype: int64
NaN values: 0


In [46]:
stk[stk["BMI"]=="."]

Unnamed: 0,id,sex and age,high_BP,heart_condition_detected_2017,married,job_status and living_area,average_blood_sugar,BMI,smoker_status,TreatmentA,...,job_status_living_area,job_status,living_area,job_status_corr,living_area_corr,sex_and_age,sex,age,sex_corr,age_corr
7563,83,"F, 64",0.0,0.0,1.0,business_owner?Remote,206.81,.,non-smoker,,...,"(business_owner, Remote)",business_owner,remote,business_owner,remote,"(F, 64)",f,64,f,64


In [47]:
stk.loc[stk["BMI"]==".", "BMI"] = 24.0

In [48]:
print(stk["BMI"].unique()[:5])
print(stk["BMI"].value_counts().iloc[:5])
print("NaN values: {}".format(stk[stk["BMI"].isnull()].shape[0]))

stk.dropna(axis=0, how="all", subset=["BMI"], inplace=True)

['33.4' '30.1' '36.5' '30.8' '30.7']
28.3    49
31.9    45
31.2    44
27.6    44
29.9    44
Name: BMI, dtype: int64
NaN values: 198


In [49]:
stk.shape

(5785, 23)

## Finalising the training data set

We consider the initial stages of data wrangling complete. Feature engineering continues in the next notebook, let's run the data through a few quick checks. 

In [50]:
stk.columns

Index(['id', 'sex and age', 'high_BP', 'heart_condition_detected_2017',
       'married', 'job_status and living_area', 'average_blood_sugar', 'BMI',
       'smoker_status', 'TreatmentA', 'TreatmentB', 'TreatmentC', 'TreatmentD',
       'job_status_living_area', 'job_status', 'living_area',
       'job_status_corr', 'living_area_corr', 'sex_and_age', 'sex', 'age',
       'sex_corr', 'age_corr'],
      dtype='object')

In [51]:
columns_keep = ["id", "high_BP", "heart_condition_detected_2017",
                "married", "smoker_status", 
                "average_blood_sugar", "BMI",
                "job_status_corr", "living_area_corr", 
                "sex_corr", "age_corr",
            ]

stk_clean = stk[columns_keep]

In [52]:
stk_clean.smoker_status.isnull().value_counts()

False    5785
Name: smoker_status, dtype: int64

In [53]:
stk_clean.dtypes

id                                 int64
high_BP                          float64
heart_condition_detected_2017    float64
married                          float64
smoker_status                     object
average_blood_sugar              float64
BMI                               object
job_status_corr                   object
living_area_corr                  object
sex_corr                          object
age_corr                           int64
dtype: object

In [54]:
print(stk_clean.shape)

(5785, 11)


In [55]:
stk_clean.loc[:, "average_blood_sugar"] = stk_clean.loc[:, "average_blood_sugar"].apply(lambda x: float(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [56]:
stk_clean.loc[:, "BMI"] = stk_clean.loc[:, "BMI"].apply(lambda x: float(x))

In [57]:
stk_clean["average_blood_sugar"].describe()

count    5785.000000
mean      110.865570
std        44.895935
min        59.620000
25%        82.410000
50%        96.750000
75%       118.500000
max       278.640000
Name: average_blood_sugar, dtype: float64

In [58]:
stk_clean["BMI"].describe()

count    5785.000000
mean       32.360847
std         7.199487
min        12.400000
25%        27.300000
50%        31.200000
75%        36.200000
max        94.300000
Name: BMI, dtype: float64

In [59]:
stk_clean.dtypes

id                                 int64
high_BP                          float64
heart_condition_detected_2017    float64
married                          float64
smoker_status                     object
average_blood_sugar              float64
BMI                              float64
job_status_corr                   object
living_area_corr                  object
sex_corr                          object
age_corr                           int64
dtype: object

In [60]:
break

SyntaxError: 'break' outside loop (<ipython-input-60-6aaf1f276005>, line 4)

In [61]:
stk_clean.to_csv("../data/test_processed.csv", index=False)

### Quick Plots

Quickly checking the distribution of the continuous variables

In [None]:
sns.distplot(stk_clean[stk_clean["stroke_in_2018"] == '1']["average_blood_sugar"], 
             kde = False, bins = 100, color = "black")
sns.distplot(stk_clean[stk_clean["stroke_in_2018"] == '0']["average_blood_sugar"], 
             kde = False, bins = 100, color = "purple")

In [None]:
sns.distplot(stk_clean[stk_clean["stroke_in_2018"] == '1']["BMI"], 
             kde = False, bins = 100, color = "black")
sns.distplot(stk_clean[stk_clean["stroke_in_2018"] == '0']["BMI"], 
             kde = False, bins = 100, color = "purple")