In [142]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [143]:
# Read the survey extract from the working directory and preview the first rows.
heart_df = pd.read_csv(filepath_or_buffer='cdc_1.csv')
heart_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,SEX,GENERAL_HEALTH,PHYSICAL_HEALTH,MENTAL_HEALTH_30,HEALTH_PLAN,ROUTIN_CHECKUP,EXERCISE,DEPRESSION,DIABETES,...,SMOKE_CIGARETT,DRINKING,BP,CHOLESTEROL,AGE,PREV_HEART_ATTACK,HEART_DISEASE_TARGET,HEART_STROKE,WORK_STATUS,MARIJUANA
0,3,2.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,...,4.0,2.0,2.0,2.0,62.0,2.0,2.0,2.0,7.0,0.0
1,6,1.0,1.0,3.0,1.0,1.0,1.0,2.0,1.0,3.0,...,4.0,1.0,2.0,1.0,63.0,2.0,2.0,2.0,8.0,0.0
2,7,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,...,3.0,1.0,2.0,2.0,62.0,1.0,1.0,2.0,2.0,0.0
3,19,2.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0,...,4.0,1.0,2.0,2.0,53.0,2.0,2.0,2.0,1.0,0.0
4,29,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,...,3.0,1.0,1.0,1.0,47.0,2.0,2.0,2.0,8.0,0.0


## Rename Values for Tableau

In [144]:
# List the column labels of the loaded frame so the columns to relabel can be picked out.
heart_df.columns

Index(['Unnamed: 0', 'SEX', 'GENERAL_HEALTH', 'PHYSICAL_HEALTH',
       'MENTAL_HEALTH_30', 'HEALTH_PLAN', 'ROUTIN_CHECKUP', 'EXERCISE',
       'DEPRESSION', 'DIABETES', 'MARITAL_STATUS', 'EDUCATION', 'LAST_ASTHMA',
       'BMI', 'CONCENTRATION_DIFF', 'SMOKE_CIGARETT', 'DRINKING', 'BP',
       'CHOLESTEROL', 'AGE', 'PREV_HEART_ATTACK', 'HEART_DISEASE_TARGET',
       'HEART_STROKE', 'WORK_STATUS', 'MARIJUANA'],
      dtype='object')

In [145]:
# Build the frame that will be relabelled for Tableau: same data as heart_df
# minus the 'Unnamed: 0' column, held as a separate copy so heart_df is untouched.
tableau_df = heart_df.drop(labels='Unnamed: 0', axis='columns').copy()

In [146]:
# Dump the value distribution of every column: a 40-star rule, the column name,
# its value counts, then a blank line.
for col in tableau_df.columns:
    print(
        '*'*40,
        f'   {col}     ',
        tableau_df[col].value_counts(),
        '',
        sep='\n',
    )

****************************************
   SEX     
2.0    91920
1.0    85875
Name: SEX, dtype: int64

****************************************
   GENERAL_HEALTH     
1.0    153344
2.0     24451
Name: GENERAL_HEALTH, dtype: int64

****************************************
   PHYSICAL_HEALTH     
1.0    118717
2.0     40229
3.0     18849
Name: PHYSICAL_HEALTH, dtype: int64

****************************************
   MENTAL_HEALTH_30     
1.0    99923
2.0    51828
3.0    26044
Name: MENTAL_HEALTH_30, dtype: int64

****************************************
   HEALTH_PLAN     
1.0    167317
2.0     10478
Name: HEALTH_PLAN, dtype: int64

****************************************
   ROUTIN_CHECKUP     
1.0    134139
2.0     24965
3.0     11193
4.0      7017
8.0       481
Name: ROUTIN_CHECKUP, dtype: int64

****************************************
   EXERCISE     
1.0    142135
2.0     35660
Name: EXERCISE, dtype: int64

****************************************
   DEPRESSION     
2.0    137253

In [147]:
# Swap the numeric SEX codes for display labels.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# tableau_df['SEX'] is chained assignment, which pandas warns about and which does
# not update the frame once Copy-on-Write is active.
tableau_df['SEX'] = tableau_df['SEX'].replace({1: 'Male', 2: 'Female'})
tableau_df['SEX'].value_counts()

Female    91920
Male      85875
Name: SEX, dtype: int64

In [148]:
# Swap the numeric GENERAL_HEALTH codes for display labels.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['GENERAL_HEALTH'] = tableau_df['GENERAL_HEALTH'].replace(
    {1: 'Good', 2: 'Fair/Poor'}
)
tableau_df['GENERAL_HEALTH'].value_counts()

Good         153344
Fair/Poor     24451
Name: GENERAL_HEALTH, dtype: int64

In [149]:
# Swap the numeric PHYSICAL_HEALTH codes for display labels.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['PHYSICAL_HEALTH'] = tableau_df['PHYSICAL_HEALTH'].replace(
    {1: 'Good', 2: 'Fair', 3: 'Not Good'}
)
tableau_df['PHYSICAL_HEALTH'].value_counts()

Good        118717
Fair         40229
Not Good     18849
Name: PHYSICAL_HEALTH, dtype: int64

In [150]:
# Swap the numeric MENTAL_HEALTH_30 codes for display labels.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['MENTAL_HEALTH_30'] = tableau_df['MENTAL_HEALTH_30'].replace(
    {1: 'Good', 2: 'Fair', 3: 'Not Good'}
)
tableau_df['MENTAL_HEALTH_30'].value_counts()

Good        99923
Fair        51828
Not Good    26044
Name: MENTAL_HEALTH_30, dtype: int64

In [151]:
# Swap the numeric HEALTH_PLAN codes for Yes/No labels.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['HEALTH_PLAN'] = tableau_df['HEALTH_PLAN'].replace({1: 'Yes', 2: 'No'})
tableau_df['HEALTH_PLAN'].value_counts()

Yes    167317
No      10478
Name: HEALTH_PLAN, dtype: int64

In [152]:
# Swap the numeric ROUTIN_CHECKUP codes for display labels (note the codes jump
# from 4 to 8).
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['ROUTIN_CHECKUP'] = tableau_df['ROUTIN_CHECKUP'].replace(
    {
        1: 'year<1',
        2: '1<year<2',
        3: '2<year<5',
        4: 'year>5',
        8: 'never',
    }
)
tableau_df['ROUTIN_CHECKUP'].value_counts()

year<1      134139
1<year<2     24965
2<year<5     11193
year>5        7017
never          481
Name: ROUTIN_CHECKUP, dtype: int64

In [153]:
# Swap the numeric EXERCISE codes for Yes/No labels.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['EXERCISE'] = tableau_df['EXERCISE'].replace({1: 'Yes', 2: 'No'})
tableau_df['EXERCISE'].value_counts()

Yes    142135
No      35660
Name: EXERCISE, dtype: int64

In [154]:
# Swap the numeric DEPRESSION codes for Yes/No labels.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['DEPRESSION'] = tableau_df['DEPRESSION'].replace({1: 'Yes', 2: 'No'})
tableau_df['DEPRESSION'].value_counts()

No     137253
Yes     40542
Name: DEPRESSION, dtype: int64

In [155]:
# Swap the numeric DIABETES codes for display labels.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['DIABETES'] = tableau_df['DIABETES'].replace(
    {1: 'Yes', 2: 'Gestational', 3: 'No', 4: 'Borderline'}
)
tableau_df['DIABETES'].value_counts()

No             153598
Yes             18587
Borderline       3600
Gestational      2010
Name: DIABETES, dtype: int64

In [156]:
# Swap the numeric MARITAL_STATUS codes for display labels; codes 5 and 6 are
# deliberately merged into the single label 'Not Married'.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['MARITAL_STATUS'] = tableau_df['MARITAL_STATUS'].replace(
    {
        1: 'Married',
        2: 'Divorced',
        3: 'Widowed',
        4: 'Separated',
        5: 'Not Married',
        6: 'Not Married',
    }
)
tableau_df['MARITAL_STATUS'].value_counts()

Married        102338
Not Married     43185
Divorced        23032
Widowed          5089
Separated        4151
Name: MARITAL_STATUS, dtype: int64

In [157]:
# Swap the numeric EDUCATION codes for display labels.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['EDUCATION'] = tableau_df['EDUCATION'].replace(
    {
        1: 'No High School Diploma',
        2: 'High School Diploma',
        3: 'Attended College',
        4: 'College Degree',
    }
)
tableau_df['EDUCATION'].value_counts()

College Degree            81817
Attended College          48540
High School Diploma       39248
No High School Diploma     8190
Name: EDUCATION, dtype: int64

In [158]:
# Swap the numeric LAST_ASTHMA codes for No/Yes labels.
# NOTE(review): 1 maps to 'No' here, the reverse of Yes/No columns such as EXERCISE;
# the mapping is kept exactly as written - confirm against the codebook.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['LAST_ASTHMA'] = tableau_df['LAST_ASTHMA'].replace({1: 'No', 2: 'Yes'})
tableau_df['LAST_ASTHMA'].value_counts()

No     150732
Yes     27063
Name: LAST_ASTHMA, dtype: int64

In [159]:
# Swap the numeric WORK_STATUS codes for display labels.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['WORK_STATUS'] = tableau_df['WORK_STATUS'].replace(
    {
        1: 'Employed',
        2: 'Self-employed',
        3: 'Out of work (more than a year)',
        4: 'Out of work (less than a year)',
        5: 'Homemaker',
        6: 'Student',
        7: 'Retired',
        8: 'Unable to work',
    }
)
tableau_df['WORK_STATUS'].value_counts()

Employed                          112610
Self-employed                      18069
Unable to work                     12677
Retired                            12047
Homemaker                           7577
Out of work (more than a year)      5628
Out of work (less than a year)      4986
Student                             4201
Name: WORK_STATUS, dtype: int64

In [160]:
# Swap the numeric BMI category codes for display labels.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['BMI'] = tableau_df['BMI'].replace(
    {1: 'Under Weight', 2: 'Normal', 3: 'Over Weight', 4: 'Obese'}
)
tableau_df['BMI'].value_counts()

Obese           67628
Over Weight     61391
Normal          46786
Under Weight     1990
Name: BMI, dtype: int64

In [161]:
# Swap the numeric SMOKE_CIGARETT codes for display labels.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['SMOKE_CIGARETT'] = tableau_df['SMOKE_CIGARETT'].replace(
    {1: 'Everyday', 2: 'Somedays', 3: 'Past', 4: 'Never'}
)
tableau_df['SMOKE_CIGARETT'].value_counts()

Never       110589
Past         41199
Everyday     18861
Somedays      7146
Name: SMOKE_CIGARETT, dtype: int64

In [162]:
# Swap the numeric CONCENTRATION_DIFF codes for Yes/No labels.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['CONCENTRATION_DIFF'] = tableau_df['CONCENTRATION_DIFF'].replace(
    {1: 'Yes', 2: 'No'}
)
tableau_df['CONCENTRATION_DIFF'].value_counts()

No     158076
Yes     19719
Name: CONCENTRATION_DIFF, dtype: int64

In [163]:
# Swap the numeric DRINKING codes for No/Yes labels.
# NOTE(review): 1 maps to 'No' here, the reverse of Yes/No columns such as EXERCISE;
# the mapping is kept exactly as written - confirm against the codebook.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['DRINKING'] = tableau_df['DRINKING'].replace({1: 'No', 2: 'Yes'})
tableau_df['DRINKING'].value_counts()

No     145808
Yes     31987
Name: DRINKING, dtype: int64

In [164]:
# Swap the numeric BP codes for No/Yes labels.
# NOTE(review): 1 maps to 'No' here, the reverse of Yes/No columns such as EXERCISE;
# the mapping is kept exactly as written - confirm against the codebook.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['BP'] = tableau_df['BP'].replace({1: 'No', 2: 'Yes'})
tableau_df['BP'].value_counts()

No     121463
Yes     56332
Name: BP, dtype: int64

In [165]:
# Swap the numeric CHOLESTEROL codes for No/Yes labels.
# NOTE(review): 1 maps to 'No' here, the reverse of Yes/No columns such as EXERCISE;
# the mapping is kept exactly as written - confirm against the codebook.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['CHOLESTEROL'] = tableau_df['CHOLESTEROL'].replace({1: 'No', 2: 'Yes'})
tableau_df['CHOLESTEROL'].value_counts()

No     120500
Yes     57295
Name: CHOLESTEROL, dtype: int64

In [166]:
# Inspect the AGE distribution; no relabelling is applied to this column in this cell.
tableau_df['AGE'].value_counts()

62.0    6844
60.0    6795
64.0    6709
63.0    6525
58.0    5964
61.0    5922
59.0    5868
57.0    5560
55.0    5529
50.0    5443
56.0    5296
52.0    5136
54.0    4882
53.0    4807
51.0    4641
40.0    4494
49.0    4219
45.0    4161
42.0    4153
48.0    4083
47.0    4049
43.0    3888
46.0    3851
38.0    3806
41.0    3723
44.0    3721
39.0    3676
37.0    3520
36.0    3395
35.0    3389
34.0    3119
32.0    2988
33.0    2918
30.0    2809
31.0    2601
29.0    2317
28.0    2257
27.0    2052
25.0    1939
26.0    1850
24.0    1683
23.0    1614
22.0    1447
21.0    1309
20.0    1116
19.0     920
18.0     807
Name: AGE, dtype: int64

In [167]:
# Swap the numeric PREV_HEART_ATTACK codes for Yes/No labels.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['PREV_HEART_ATTACK'] = tableau_df['PREV_HEART_ATTACK'].replace(
    {1: 'Yes', 2: 'No'}
)
tableau_df['PREV_HEART_ATTACK'].value_counts()

No     172368
Yes      5427
Name: PREV_HEART_ATTACK, dtype: int64

In [168]:
# Swap the numeric HEART_DISEASE_TARGET codes for Yes/No labels.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['HEART_DISEASE_TARGET'] = tableau_df['HEART_DISEASE_TARGET'].replace(
    {1: 'Yes', 2: 'No'}
)
tableau_df['HEART_DISEASE_TARGET'].value_counts()

No     169861
Yes      7934
Name: HEART_DISEASE_TARGET, dtype: int64

In [169]:
# Swap the numeric HEART_STROKE codes for Yes/No labels.
# The result is assigned back to the column: replace(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which does not
# update the frame once Copy-on-Write is active.
tableau_df['HEART_STROKE'] = tableau_df['HEART_STROKE'].replace({1: 'Yes', 2: 'No'})
tableau_df['HEART_STROKE'].value_counts()

No     173651
Yes      4144
Name: HEART_STROKE, dtype: int64

In [170]:
# Re-print every column's value counts after relabelling, in the same layout as
# before: a 40-star rule, the column name, its counts, then a blank line.
for col in tableau_df.columns:
    print(
        '*'*40,
        f'   {col}     ',
        tableau_df[col].value_counts(),
        '',
        sep='\n',
    )

****************************************
   SEX     
Female    91920
Male      85875
Name: SEX, dtype: int64

****************************************
   GENERAL_HEALTH     
Good         153344
Fair/Poor     24451
Name: GENERAL_HEALTH, dtype: int64

****************************************
   PHYSICAL_HEALTH     
Good        118717
Fair         40229
Not Good     18849
Name: PHYSICAL_HEALTH, dtype: int64

****************************************
   MENTAL_HEALTH_30     
Good        99923
Fair        51828
Not Good    26044
Name: MENTAL_HEALTH_30, dtype: int64

****************************************
   HEALTH_PLAN     
Yes    167317
No      10478
Name: HEALTH_PLAN, dtype: int64

****************************************
   ROUTIN_CHECKUP     
year<1      134139
1<year<2     24965
2<year<5     11193
year>5        7017
never          481
Name: ROUTIN_CHECKUP, dtype: int64

****************************************
   EXERCISE     
Yes    142135
No      35660
Name: EXERCISE, dtype: int64



In [171]:
# Write the relabelled frame to disk for Tableau.
# index=False keeps the positional row index out of the file; written with the
# default it lands as an extra unnamed first column, which pandas then reads back
# as a column named like 'Unnamed: 0'.
tableau_df.to_csv('tableau_data.csv', index=False)
print('Completed..')

Completed..
