In [86]:
import pandas as pd

In [87]:
pd.set_option('display.max_columns', None)

In [95]:
df = pd.read_csv('data_wrangling_medical_2022_u4587426.csv', dtype=str)

In [96]:
df.columns

Index(['rec_id', 'ssn', 'first_name', 'middle_name', 'last_name', 'gender',
       'age_at_consultation', 'birth_date', 'medicare_number',
       'street_address', 'suburb', 'postcode', 'state', 'phone', 'email',
       'marital_status', 'height', 'weight', 'bmi', 'blood_pressure',
       'cholesterol_level', 'smoking_status', 'clinical_notes',
       'consultation_timestamp'],
      dtype='object')

In [97]:
def hasVal(row, val):
    if pd.isnull(row[val]):
        return '0'
    else:
        return '1'

In [98]:
df['hasEmail'] = df.apply(lambda row: hasVal(row, 'email'), axis=1)

In [99]:
df['hasPhone'] = df.apply(lambda row: hasVal(row, 'phone'), axis=1)

In [100]:
df['hasPostcode'] = df.apply(lambda row: hasVal(row, 'postcode'), axis=1)

In [103]:
df['missingCat'] = df.hasEmail + df.hasPhone + df.hasPostcode

### Count Missing Values by Category

In [104]:
df1 = df.groupby(['missingCat']).count()['rec_id']

In [105]:
display(df1)

missingCat
000     470
001    1888
010     731
011    2915
100    1158
101    4468
110    1647
111    6723
Name: rec_id, dtype: int64

### Correlation BMI and Age at Consultation

In [12]:
c1 = pd.to_numeric(df['bmi']).corr(pd.to_numeric(df['age_at_consultation']))

In [13]:
c1

0.24680562420845412

### Correlation BMI and Height

In [14]:
c2 = pd.to_numeric(df['bmi']).corr(pd.to_numeric(df['height']))

In [15]:
c2

0.1295330393066611

### Correlation State and Marital Status

In [None]:
df = pd.read_csv('data_wrangling_medical_2022_u4587426.csv', dtype=str, na_filter=False)

In [16]:
ct = pd.crosstab(df['state'], df['marital_status'], dropna=False)

In [17]:
display(ct)

marital_status,divorced,married,married-de-facto,n/a,not-married,separated,widowed
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
act,49,380,146,132,255,44,18
nsw,250,2136,918,760,1457,274,106
nt,88,728,306,266,509,83,45
qld,180,1526,605,526,1018,175,67
sa,37,372,158,121,240,59,19
tas,41,380,147,142,237,43,16
vic,169,1477,602,510,1024,168,67
wa,35,340,133,117,234,42,23


In [18]:
import scipy
from scipy.stats import chi2_contingency

In [19]:
chi2_contingency(ct)

(22.402175652438224,
 0.9943206163681827,
 42,
 array([[  43.4688 ,  375.7568 ,  154.368  ,  131.7888 ,  254.6688 ,
           45.4656 ,   18.4832 ],
        [ 250.49745, 2165.37195,  889.57575,  759.4587 , 1467.5787 ,
          262.0044 ,  106.51305],
        [  85.96125,  743.07375,  305.26875,  260.6175 ,  503.6175 ,
           89.91   ,   36.55125],
        [ 173.91765, 1503.39415,  617.62275,  527.2839 , 1018.9239 ,
          181.9068 ,   73.95085],
        [  42.7047 ,  369.1517 ,  151.6545 ,  129.4722 ,  250.1922 ,
           44.6664 ,   18.1583 ],
        [  42.7047 ,  369.1517 ,  151.6545 ,  129.4722 ,  250.1922 ,
           44.6664 ,   18.1583 ],
        [ 170.52165, 1474.03815,  605.56275,  516.9879 ,  999.0279 ,
          178.3548 ,   72.50685],
        [  39.2238 ,  339.0618 ,  139.293  ,  118.9188 ,  229.7988 ,
           41.0256 ,   16.6782 ]]))

### Completeness: Postcode, Phone, Middle Name

In [20]:
df = pd.read_csv('data_wrangling_medical_2022_u4587426.csv', dtype=str)

In [21]:
df.isnull().sum()

rec_id                       0
ssn                          0
first_name                   2
middle_name               2000
last_name                    1
gender                       0
age_at_consultation          0
birth_date                   0
medicare_number              0
street_address               0
suburb                       0
postcode                  4006
state                        0
phone                     7984
email                     6004
marital_status            2574
height                       0
weight                       0
bmi                          0
blood_pressure               0
cholesterol_level            0
smoking_status               0
clinical_notes               0
consultation_timestamp       0
dtype: int64

In [22]:
df.shape

(20000, 24)

#### Postcode

In [23]:
pc_null = df.postcode.isnull().sum()

In [24]:
pc_complete = str((1 - (pc_null / df.shape[0])) * 100) + ' %'

In [25]:
print(pc_complete)

79.97 %


#### Phone

In [26]:
ph_null = df.phone.isnull().sum()

In [27]:
ph_complete = str((1 - (ph_null / df.shape[0])) * 100) + ' %'

In [28]:
print(ph_complete)

60.08 %


#### Middle Name

In [29]:
mn_null = df.middle_name.isnull().sum()

In [30]:
mn_complete = str((1 - (mn_null / df.shape[0])) * 100) + ' %'

In [31]:
print(mn_complete)

90.0 %


### Validity: Weight and Email

#### Weight

In [32]:
df1 = df.groupby(pd.to_numeric(df['weight'])).count()['rec_id']

In [33]:
w_invalid = len(df[(pd.to_numeric(df['weight'])<=0)])

In [34]:
w_valid = str((1 - (w_invalid / df.shape[0])) * 100) + ' %'

In [35]:
print(w_valid)

90.18 %


#### Email

In [36]:
# Should have @ symbol and at least one dot after @ symbol

In [37]:
# Used code from this website: https://stackoverflow.com/questions/742451/what-is-the-simplest-regular-expression-to-validate-emails-to-not-accept-them-bl

In [38]:
email_valid = df.email.str.count(r'^[^@\s]+@[^@\s]+\.[^@\s]+$').sum()

In [39]:
e_valid = str((email_valid / df.shape[0]) * 100) + ' %'

In [40]:
print(e_valid)

60.285 %


### Consistency: Age at Consultation and Date of Birth

In [41]:
# Add column to show inferred age at consultation based on the date of birth and consultation timestamp

In [42]:
# Convert timestamp to standard date formart

In [43]:
from datetime import datetime

In [44]:
df['consult_dt'] = [datetime.strptime(x.split('t')[0], '%Y-%m-%d') for x in df['consultation_timestamp']]

In [45]:
df['dob_dt'] = [datetime.strptime(x, '%d/%m/%Y') for x in df['birth_date']]

In [52]:
def yr_diff(row):
    d1 = row['consult_dt']
    d2 = row['dob_dt']
    diff = round(abs((d1-d2).days)/365, 0)
    return diff

In [53]:
df['der_age'] = df.apply(lambda row: yr_diff(row), axis=1)

In [56]:
df['age_gap'] = pd.to_numeric(df.age_at_consultation) - df.der_age

In [68]:
df1 = df.groupby(['age_gap']).count()['rec_id']

In [72]:
df1 = df1.reset_index()

In [73]:
display(df1)

Unnamed: 0,age_gap,rec_id
0,-4.0,80
1,-3.0,563
2,-2.0,650
3,-1.0,3011
4,0.0,13585
5,1.0,2111


In [80]:
age_valid = df1.loc[df1['age_gap'] == 0.0, 'rec_id'].item()

In [81]:
a_valid = str((age_valid / df.shape[0]) * 100) + ' %'

In [82]:
print(a_valid)

67.925 %


### Modify Medicare Number to be numeric for Benfords analysis

In [83]:
df['medicare_mod'] = pd.to_numeric(df.medicare_number.str.replace(' ', ''))

In [84]:
display(df)

Unnamed: 0,rec_id,ssn,first_name,middle_name,last_name,gender,age_at_consultation,birth_date,medicare_number,street_address,suburb,postcode,state,phone,email,marital_status,height,weight,bmi,blood_pressure,cholesterol_level,smoking_status,clinical_notes,consultation_timestamp,consult_dt,dob_dt,der_age,age_gap,medicare_mod
0,rec-72052,i116534747,donald,lydell,manning,f,13,4/6/2004,1671 32219 1 2,1 gisborne road goodmans,bacchus marsh,3340,vic,03 8224 2324,donald.manning@hotmail.com,,139,67,34,71,134,0,basal-cell-carcinoma---skin-leasion-to-left-as...,2017-11-15t10:10+00:00,2017-11-15,2004-06-04,13.0,0.0,16713221912
1,rec-66635,g196248364,george,allen,morrison,m,10,12/3/2010,9823 28679 1 2,1 leura lane watersley cottage,hamilton,3300,vic,,spfefrcmeq@mail.com,,120,50,34,69,97,0,depressed-mood,2020-07-20t10:22+00:00,2020-07-20,2010-03-12,10.0,0.0,98232867912
2,rec-73551,e111796351,stephon,hill,warren,f,28,25/12/1989,7824 89557 1 1,39 mcnabb loop collier park vlg,como,6152,wa,08 7124 5514,,married,167,141,50,74,129,0,low-energy,2017-02-23t11:13+00:00,2017-02-23,1989-12-25,27.0,1.0,78248955711
3,rec-52311,f113787195,matthew,e,heuer,m,18,3/11/1997,6509 52553 2 3,5 harrys road rivendell,arthurs creek,3099,vic,03 0697 4454,heuer21@mail.com,not-married,172,66,22,73,128,1,atrial-fibrillation,2015-04-04t14:39+00:00,2015-04-04,1997-11-03,17.0,1.0,65095255323
4,rec-49632,g166722356,melisa,,kellogg,f,59,12/11/1957,8926 29925 1 2,9 hallett close riverside garden,douglas,4814,qld,07 3290 2798,,married,185,122,35,73,119,0,low-back-pain,2016-06-13t19:26+00:00,2016-06-13,1957-11-12,59.0,0.0,89262992512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,rec-15872,i161581251,gracie,little,ratcliffe,f,6,12/8/2008,9039 89959 1 2,501 little collins street aprt 701,melbourne,3000,vic,03 0379 4953,gracie.ratcliffe@aol.com,,123,25,16,70,131,0,depression,2014-04-07t23:47+00:00,2014-04-07,2008-08-12,6.0,0.0,90398995912
19996,rec-53302,g152500166,allan,landon,hurdle,m,49,3/8/1968,4148 98301 2 2,5 barrett drive floreat village,alice springs,0870,nt,08 6004 7509,,married,180,138,42,80,152,0,ringing-ears,2017-02-09t20:48+00:00,2017-02-09,1968-08-03,49.0,0.0,41489830122
19997,rec-90066,g178482871,joshua,michael,munn,f,7,22/4/2008,7737 88451 1 2,49 ellis road whispering pines,alstonville,2477,nsw,02 5963 3249,joshua.munn@mail.com.au,,112,26,20,76,187,0,cataract-surgery---right-eye,2015-07-06t07:17+00:00,2015-07-06,2008-04-22,7.0,0.0,77378845112
19998,rec-51517,h169776084,carol,lucille,harms,m,59,1/12/1956,9535 28015 2 1,360 eusdale road woodlands,yetholme,2795,nsw,,harms.carol@mail.com,married,193,123,33,74,138,0,bronchitis,2015-09-19t05:22+00:00,2015-09-19,1956-12-01,59.0,0.0,95352801521


### Export File to CSV

In [85]:
df.to_csv('assign2_clean.csv', index=False)

<hr>