In [50]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [81]:
# Read the survey extract from the working directory and preview the first rows.
heart_df = pd.read_csv(filepath_or_buffer='cdc_1.csv')
heart_df.head()

Unnamed: 0.1,Unnamed: 0,SEX,GENERAL_HEALTH,PHYSICAL_HEALTH,MENTAL_HEALTH_30,HEALTH_PLAN,ROUTIN_CHECKUP,EXERCISE,DEPRESSION,DIABETES,...,CONCENTRATION_DIFF,SMOKE_CIGARETT,DRINKING,BP,CHOLESTEROL,AGE,PREV_HEART_ATTACK,HEART_DISEASE_TARGET,HEART_STROKE,MARIJUANA
0,0,2.0,2.0,3.0,2.0,9.0,2.0,2.0,2.0,3.0,...,2.0,3.0,1.0,1.0,2.0,70.0,2.0,2.0,2.0,0.0
1,2,2.0,1.0,1.0,1.0,9.0,1.0,2.0,2.0,1.0,...,2.0,4.0,1.0,2.0,1.0,72.0,2.0,1.0,2.0,0.0
2,3,2.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,...,2.0,4.0,2.0,2.0,2.0,62.0,2.0,2.0,2.0,0.0
3,4,1.0,2.0,3.0,1.0,9.0,1.0,1.0,2.0,1.0,...,2.0,4.0,1.0,1.0,2.0,76.0,1.0,1.0,1.0,0.0
4,5,1.0,1.0,1.0,1.0,9.0,1.0,2.0,2.0,3.0,...,2.0,3.0,1.0,1.0,1.0,80.0,2.0,2.0,2.0,0.0


## Rename Values for Tableau

In [82]:
# List the column labels so the columns that need relabeling can be identified.
heart_df.columns

Index(['Unnamed: 0', 'SEX', 'GENERAL_HEALTH', 'PHYSICAL_HEALTH',
       'MENTAL_HEALTH_30', 'HEALTH_PLAN', 'ROUTIN_CHECKUP', 'EXERCISE',
       'DEPRESSION', 'DIABETES', 'MARITAL_STATUS', 'EDUCATION', 'LAST_ASTHMA',
       'WORK_STATUS', 'BMI', 'CONCENTRATION_DIFF', 'SMOKE_CIGARETT',
       'DRINKING', 'BP', 'CHOLESTEROL', 'AGE', 'PREV_HEART_ATTACK',
       'HEART_DISEASE_TARGET', 'HEART_STROKE', 'MARIJUANA'],
      dtype='object')

In [83]:
# Build an independent working copy for Tableau without leftover CSV index columns.
# Every column whose label starts with 'Unnamed' is removed, not just 'Unnamed: 0':
# the recorded preview of the loaded file also shows an 'Unnamed: 0.1' column, and
# selecting by mask does not raise when no such column is present.
unnamed_mask = heart_df.columns.str.startswith('Unnamed')
tableau_df = heart_df.loc[:, ~unnamed_mask].copy()

In [84]:
# Print a banner, the column name, and the value counts for every column,
# followed by a blank line, to review the raw codes before relabeling.
banner = '*' * 40
for column_name in tableau_df:
    counts = tableau_df[column_name].value_counts()
    print(banner, f'   {column_name}     ', counts, '', sep='\n')

****************************************
   SEX     
2.0    174823
1.0    158223
Name: SEX, dtype: int64

****************************************
   GENERAL_HEALTH     
1.0    276735
2.0     55675
9.0       636
Name: GENERAL_HEALTH, dtype: int64

****************************************
   PHYSICAL_HEALTH     
1.0    217501
2.0     69608
3.0     39784
9.0      6153
Name: PHYSICAL_HEALTH, dtype: int64

****************************************
   MENTAL_HEALTH_30     
1.0    209389
2.0     79160
3.0     39710
9.0      4787
Name: MENTAL_HEALTH_30, dtype: int64

****************************************
   HEALTH_PLAN     
1.0    183660
9.0    137298
2.0     12088
Name: HEALTH_PLAN, dtype: int64

****************************************
   ROUTIN_CHECKUP     
1.0    268607
2.0     36979
3.0     14979
4.0      9704
7.0      1877
8.0       749
9.0       151
Name: ROUTIN_CHECKUP, dtype: int64

****************************************
   EXERCISE     
1.0    254031
2.0     78512
9.0       503


In [85]:
# Label SEX codes for Tableau: 1 -> 'Male', 2 -> 'Female'.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['SEX'] = tableau_df['SEX'].replace({1: 'Male', 2: 'Female'})
tableau_df['SEX'].value_counts()

Female    174823
Male      158223
Name: SEX, dtype: int64

In [86]:
# Label GENERAL_HEALTH codes for Tableau: 1 -> 'Good', 2 -> 'Fair/Poor'.
# Codes not listed in the mapping (9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['GENERAL_HEALTH'] = tableau_df['GENERAL_HEALTH'].replace(
    {1: 'Good', 2: 'Fair/Poor'}
)
tableau_df['GENERAL_HEALTH'].value_counts()

Good         276735
Fair/Poor     55675
9.0             636
Name: GENERAL_HEALTH, dtype: int64

In [87]:
# Label PHYSICAL_HEALTH codes for Tableau: 1 -> 'Good', 2 -> 'Fair', 3 -> 'Not Good'.
# Codes not listed in the mapping (9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['PHYSICAL_HEALTH'] = tableau_df['PHYSICAL_HEALTH'].replace(
    {1: 'Good', 2: 'Fair', 3: 'Not Good'}
)
tableau_df['PHYSICAL_HEALTH'].value_counts()

Good        217501
Fair         69608
Not Good     39784
9.0           6153
Name: PHYSICAL_HEALTH, dtype: int64

In [88]:
# Label MENTAL_HEALTH_30 codes for Tableau: 1 -> 'Good', 2 -> 'Fair', 3 -> 'Not Good'.
# Codes not listed in the mapping (9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['MENTAL_HEALTH_30'] = tableau_df['MENTAL_HEALTH_30'].replace(
    {1: 'Good', 2: 'Fair', 3: 'Not Good'}
)
tableau_df['MENTAL_HEALTH_30'].value_counts()

Good        209389
Fair         79160
Not Good     39710
9.0           4787
Name: MENTAL_HEALTH_30, dtype: int64

In [89]:
# Label HEALTH_PLAN codes for Tableau: 1 -> 'Yes', 2 -> 'No'.
# Codes not listed in the mapping (9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['HEALTH_PLAN'] = tableau_df['HEALTH_PLAN'].replace({1: 'Yes', 2: 'No'})
tableau_df['HEALTH_PLAN'].value_counts()

Yes    183660
9.0    137298
No      12088
Name: HEALTH_PLAN, dtype: int64

In [90]:
# Label ROUTIN_CHECKUP codes for Tableau (time since last checkup; 8 -> 'never').
# Codes not listed in the mapping (7.0 and 9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['ROUTIN_CHECKUP'] = tableau_df['ROUTIN_CHECKUP'].replace(
    {1: 'year<1', 2: '1<year<2', 3: '2<year<5', 4: 'year>5', 8: 'never'}
)
tableau_df['ROUTIN_CHECKUP'].value_counts()

year<1      268607
1<year<2     36979
2<year<5     14979
year>5        9704
7.0           1877
never          749
9.0            151
Name: ROUTIN_CHECKUP, dtype: int64

In [91]:
# Label EXERCISE codes for Tableau: 1 -> 'Yes', 2 -> 'No'.
# Codes not listed in the mapping (9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['EXERCISE'] = tableau_df['EXERCISE'].replace({1: 'Yes', 2: 'No'})
tableau_df['EXERCISE'].value_counts()

Yes    254031
No      78512
9.0       503
Name: EXERCISE, dtype: int64

In [92]:
# Label DEPRESSION codes for Tableau: 1 -> 'Yes', 2 -> 'No'.
# Codes not listed in the mapping (7.0 and 9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['DEPRESSION'] = tableau_df['DEPRESSION'].replace({1: 'Yes', 2: 'No'})
tableau_df['DEPRESSION'].value_counts()

No     266185
Yes     65543
7.0      1070
9.0       248
Name: DEPRESSION, dtype: int64

In [93]:
# Label DIABETES codes for Tableau:
# 1 -> 'Yes', 2 -> 'Gestational', 3 -> 'No', 4 -> 'Borderline'.
# Codes not listed in the mapping (7.0 and 9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['DIABETES'] = tableau_df['DIABETES'].replace(
    {1: 'Yes', 2: 'Gestational', 3: 'No', 4: 'Borderline'}
)
tableau_df['DIABETES'].value_counts()

No             273588
Yes             48222
Borderline       8142
Gestational      2732
7.0               321
9.0                41
Name: DIABETES, dtype: int64

In [94]:
# Label MARITAL_STATUS codes for Tableau. Codes 5 and 6 are intentionally merged
# into the single label 'Not Married', as in the original mapping.
# Codes not listed in the mapping (9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['MARITAL_STATUS'] = tableau_df['MARITAL_STATUS'].replace(
    {
        1: 'Married',
        2: 'Divorced',
        3: 'Widowed',
        4: 'Separated',
        5: 'Not Married',
        6: 'Not Married',
    }
)
tableau_df['MARITAL_STATUS'].value_counts()

Married        182521
Not Married     59699
Divorced        44557
Widowed         38203
Separated        6090
9.0              1976
Name: MARITAL_STATUS, dtype: int64

In [95]:
# Label EDUCATION codes for Tableau (1-4, lowest to highest attainment label).
# Codes not listed in the mapping (9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['EDUCATION'] = tableau_df['EDUCATION'].replace(
    {
        1: 'No High School Diploma',
        2: 'High School Diploma',
        3: 'Attended College',
        4: 'College Degree',
    }
)
tableau_df['EDUCATION'].value_counts()

College Degree            143783
Attended College           91450
High School Diploma        80078
No High School Diploma     17001
9.0                          734
Name: EDUCATION, dtype: int64

In [96]:
# Label LAST_ASTHMA codes for Tableau: 1 -> 'No', 2 -> 'Yes'
# (note the order is the reverse of the 1 -> 'Yes' columns).
# Codes not listed in the mapping (9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['LAST_ASTHMA'] = tableau_df['LAST_ASTHMA'].replace({1: 'No', 2: 'Yes'})
tableau_df['LAST_ASTHMA'].value_counts()

No     285916
Yes     46228
9.0       902
Name: LAST_ASTHMA, dtype: int64

In [97]:
# Label WORK_STATUS codes 1-8 for Tableau.
# Codes not listed in the mapping (9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['WORK_STATUS'] = tableau_df['WORK_STATUS'].replace(
    {
        1: 'Employed',
        2: 'Self-employed',
        3: 'Out of work (more than a year)',
        4: 'Out of work (less than a year)',
        5: 'Homemaker',
        6: 'Student',
        7: 'Retired',
        8: 'Unable to work',
    }
)
tableau_df['WORK_STATUS'].value_counts()

Employed                          139219
Retired                           110921
Self-employed                      28731
Unable to work                     19788
Homemaker                          12546
Out of work (more than a year)      8107
Out of work (less than a year)      6517
Student                             5256
9.0                                 1961
Name: WORK_STATUS, dtype: int64

In [98]:
# Label BMI category codes for Tableau:
# 1 -> 'Under Weight', 2 -> 'Normal', 3 -> 'Over Weight', 4 -> 'Obese'.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['BMI'] = tableau_df['BMI'].replace(
    {1: 'Under Weight', 2: 'Normal', 3: 'Over Weight', 4: 'Obese'}
)
tableau_df['BMI'].value_counts()

Over Weight     119738
Obese           115794
Normal           92956
Under Weight      4558
Name: BMI, dtype: int64

In [99]:
# Label SMOKE_CIGARETT codes for Tableau:
# 1 -> 'Everyday', 2 -> 'Somedays', 3 -> 'Past', 4 -> 'Never'.
# Codes not listed in the mapping (9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['SMOKE_CIGARETT'] = tableau_df['SMOKE_CIGARETT'].replace(
    {1: 'Everyday', 2: 'Somedays', 3: 'Past', 4: 'Never'}
)
tableau_df['SMOKE_CIGARETT'].value_counts()

Never       192578
Past         94520
Everyday     29449
Somedays     11342
9.0           5157
Name: SMOKE_CIGARETT, dtype: int64

In [100]:
# Label CONCENTRATION_DIFF codes for Tableau: 1 -> 'Yes', 2 -> 'No'.
# Codes not listed in the mapping (7.0 and 9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['CONCENTRATION_DIFF'] = tableau_df['CONCENTRATION_DIFF'].replace(
    {1: 'Yes', 2: 'No'}
)
tableau_df['CONCENTRATION_DIFF'].value_counts()

No     296893
Yes     34380
7.0      1441
9.0       332
Name: CONCENTRATION_DIFF, dtype: int64

In [101]:
# Label DRINKING codes for Tableau: 1 -> 'No', 2 -> 'Yes'
# (note the order is the reverse of the 1 -> 'Yes' columns).
# Codes not listed in the mapping (9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['DRINKING'] = tableau_df['DRINKING'].replace({1: 'No', 2: 'Yes'})
tableau_df['DRINKING'].value_counts()

No     280203
Yes     40813
9.0     12030
Name: DRINKING, dtype: int64

In [102]:
# Label BP codes for Tableau: 1 -> 'No', 2 -> 'Yes'
# (note the order is the reverse of the 1 -> 'Yes' columns).
# Codes not listed in the mapping (9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['BP'] = tableau_df['BP'].replace({1: 'No', 2: 'Yes'})
tableau_df['BP'].value_counts()

No     190252
Yes    141950
9.0       844
Name: BP, dtype: int64

In [103]:
# Label CHOLESTEROL codes for Tableau: 1 -> 'No', 2 -> 'Yes'
# (note the order is the reverse of the 1 -> 'Yes' columns).
# Codes not listed in the mapping (9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['CHOLESTEROL'] = tableau_df['CHOLESTEROL'].replace({1: 'No', 2: 'Yes'})
tableau_df['CHOLESTEROL'].value_counts()

No     197054
Yes    133563
9.0      2429
Name: CHOLESTEROL, dtype: int64

In [104]:
# AGE is left unlabeled; show how often each value occurs.
# NOTE(review): 80.0 is by far the most frequent value in the recorded output,
# possibly a top-coded "80 or older" bucket. Confirm against the data codebook.
tableau_df['AGE'].value_counts()

80.0    27982
65.0     8438
70.0     8132
67.0     7926
68.0     7760
        ...  
22.0     1713
21.0     1647
20.0     1416
19.0     1238
18.0     1196
Name: AGE, Length: 63, dtype: int64

In [105]:
# Label PREV_HEART_ATTACK codes for Tableau: 1 -> 'Yes', 2 -> 'No'.
# Codes not listed in the mapping (7.0 and 9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['PREV_HEART_ATTACK'] = tableau_df['PREV_HEART_ATTACK'].replace(
    {1: 'Yes', 2: 'No'}
)
tableau_df['PREV_HEART_ATTACK'].value_counts()

No     313027
Yes     19705
7.0       306
9.0         8
Name: PREV_HEART_ATTACK, dtype: int64

In [106]:
# Label HEART_DISEASE_TARGET codes for Tableau: 1 -> 'Yes', 2 -> 'No'.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['HEART_DISEASE_TARGET'] = tableau_df['HEART_DISEASE_TARGET'].replace(
    {1: 'Yes', 2: 'No'}
)
tableau_df['HEART_DISEASE_TARGET'].value_counts()

No     302506
Yes     30540
Name: HEART_DISEASE_TARGET, dtype: int64

In [107]:
# Label HEART_STROKE codes for Tableau: 1 -> 'Yes', 2 -> 'No'.
# Codes not listed in the mapping (7.0 and 9.0 in the recorded output) are left as-is.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call does not update tableau_df when
# pandas Copy-on-Write is active.
tableau_df['HEART_STROKE'] = tableau_df['HEART_STROKE'].replace({1: 'Yes', 2: 'No'})
tableau_df['HEART_STROKE'].value_counts()

No     318266
Yes     14054
7.0       699
9.0        27
Name: HEART_STROKE, dtype: int64

In [108]:
# Re-print the per-column value counts to verify the labels applied above:
# banner, column name, counts, then a blank line for each column.
separator = '*' * 40
for label in tableau_df:
    label_counts = tableau_df[label].value_counts()
    print(separator, f'   {label}     ', label_counts, '', sep='\n')

****************************************
   SEX     
Female    174823
Male      158223
Name: SEX, dtype: int64

****************************************
   GENERAL_HEALTH     
Good         276735
Fair/Poor     55675
9.0             636
Name: GENERAL_HEALTH, dtype: int64

****************************************
   PHYSICAL_HEALTH     
Good        217501
Fair         69608
Not Good     39784
9.0           6153
Name: PHYSICAL_HEALTH, dtype: int64

****************************************
   MENTAL_HEALTH_30     
Good        209389
Fair         79160
Not Good     39710
9.0           4787
Name: MENTAL_HEALTH_30, dtype: int64

****************************************
   HEALTH_PLAN     
Yes    183660
9.0    137298
No      12088
Name: HEALTH_PLAN, dtype: int64

****************************************
   ROUTIN_CHECKUP     
year<1      268607
1<year<2     36979
2<year<5     14979
year>5        9704
7.0           1877
never          749
9.0            151
Name: ROUTIN_CHECKUP, dtype: int64

*

In [109]:
# Export the relabeled frame for Tableau.
# index=False keeps the row index out of the file; otherwise it is written as an
# unnamed first column, which shows up as 'Unnamed: 0' when the CSV is read back.
tableau_df.to_csv('tableau_data.csv', index=False)
print('Completed..')

Completed..
