In [1]:
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib as plt
from sklearn.preprocessing import OrdinalEncoder

# Load the raw medical data set; the literal string 'NA' is parsed as NaN.
med_raw = pd.read_csv(
    '/Users/herlihpj/Desktop/Data Analytics/D206/ASSESSMENT/Medical Data/medical_raw_data.csv',
    na_values='NA',
)

# --- Describe the data and the extent of missingness ---
print('First 10 Rows: ', med_raw.head(10), sep='\n')

# DataFrame.info() writes its report straight to stdout, so it is not wrapped in print().
print('Info: ')
med_raw.info()

print('Describe: ', med_raw.describe(), sep='\n')

# Number of missing values per column.
print('Summary of Null: ', med_raw.isna().sum(), sep='\n')

# --- Fully duplicated rows ---
duplicates = med_raw.duplicated()
print(f'Duplicates: {duplicates.sum()}')

# --- Visualize missingness ---
# Matrix plot of the rows ordered by Age.
med_raw_sorted = med_raw.sort_values('Age')
msno.matrix(med_raw_sorted)
# Heatmap of the raw frame.
msno.heatmap(med_raw)

# Work on a separate frame so med_raw keeps the data exactly as loaded.
# The unnamed leading column is dropped by position.
leading_column = med_raw.columns[0]
med_clean = med_raw.copy().drop(columns=leading_column)

# Descriptive titles for the eight survey columns Item1..Item8, in order.
survey_titles = [
    'Timely admission',
    'Timely treatment',
    'Timely visits',
    'Reliability',
    'Options',
    'Hours of treatment',
    'Courteous staff',
    'Evidence of active listening',
]
item_renames = {
    f'Item{number}': title
    for number, title in enumerate(survey_titles, start=1)
}
med_clean = med_clean.rename(columns=item_renames)

# ---- Impute missing values ----
# Every fill below is assigned back to its column. Calling
# fillna(..., inplace=True) on med_clean[col] acts on an intermediate Series:
# pandas warns about this chained pattern and, with Copy-on-Write enabled,
# med_clean itself is never updated.

#Clean the Children with the median value
med_clean['Children'] = med_clean['Children'].fillna(med_clean['Children'].median())

#Age: filled with the column mean truncated to a whole number
med_clean['Age'] = med_clean['Age'].fillna(int(med_clean['Age'].mean()))

#Impute Income Column
#computes avg income of Full Time employment
full_time = med_clean[med_clean['Employment'] == 'Full Time']
full_avg = full_time['Income'].mean()
#computes avg income of part time employment
part_time = med_clean[med_clean['Employment'] == 'Part Time']
part_avg = part_time['Income'].mean()
#employment statuses whose missing income is filled with 0
zero_income = ['Retired', 'Student', 'Unemployed']

#Fill value per employment status. Rows whose Employment is missing or not
#listed here map to NaN, so their missing Income is left missing.
income_by_employment = {'Full Time': full_avg, 'Part Time': part_avg}
income_by_employment.update({status: 0.0 for status in zero_income})
income_fill = med_clean['Employment'].map(income_by_employment)
#Only rows with a missing Income take the fill value; existing values are kept.
med_clean['Income'] = med_clean['Income'].fillna(income_fill)

#Initial Days: filled with the column mean truncated to a whole number
med_clean['Initial_days'] = med_clean['Initial_days'].fillna(int(med_clean['Initial_days'].mean()))

#Soft drink, Overweight, Anxiety: filled with the most frequent value.
#mode() can return several values on a tie; the first one is used.
for mode_col in ['Soft_drink', 'Overweight', 'Anxiety']:
    most_frequent = med_clean[mode_col].mode().iloc[0]
    med_clean[mode_col] = med_clean[mode_col].fillna(most_frequent)

#Ordinal Encoding to convert to numeric 0:No, 1:Yes
#The fitted encoder for each column is kept in oe_dict, keyed by column name.
#NOTE(review): the 0:No / 1:Yes mapping relies on OrdinalEncoder's default
#sorted category order - confirm each column only holds 'No' and 'Yes'.
oe_dict = {}
#list of columns to convert to numerical
convert_cols = ['Soft_drink', 'HighBlood', 'Stroke', 'Arthritis', 'Diabetes', 'Hyperlipidemia', 'BackPain',
                'Allergic_rhinitis', 'Reflux_esophagitis', 'Asthma']

for col_name in convert_cols:
    print(col_name + ' pre: ' + str(med_clean[col_name].unique()))
    #Creates column ordinal encoder
    encoder = OrdinalEncoder()
    oe_dict[col_name] = encoder
    col = med_clean[col_name]
    #Only non-null values are encoded; nulls stay NaN in the result
    observed = col.notna().to_numpy()
    #OrdinalEncoder expects a 2-D array of shape (n_samples, 1)
    reshaped_vals = col.to_numpy()[observed].reshape(-1, 1)
    encoded_vals = encoder.fit_transform(reshaped_vals)
    #Build a new float column and replace the old one as a whole. Writing the
    #codes into the existing text column with .loc would leave its dtype as
    #object, so the column would not really be numeric.
    numeric_vals = np.full(len(col), np.nan)
    #ravel() always gives a 1-D array, even when only one row was encoded
    numeric_vals[observed] = encoded_vals.ravel()
    med_clean[col_name] = numeric_vals
    print(col_name + ' post: ' + str(med_clean[col_name].unique()))

#Check if the data still contains missing values
print('Cleaned Summary')
print(med_clean.isna().sum())

#Export Cleaned data to a CSV file
#index=False keeps the row index out of the file. Without it, to_csv writes the
#index as an extra unnamed first column - the same kind of column that was
#dropped from the raw data during cleaning.
med_clean.to_csv(
    '/Users/herlihpj/Desktop/Data Analytics/D206/ASSESSMENT/Medical Data/medical_clean_data.csv',
    index=False,
)
#csv conversion https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html

ModuleNotFoundError: No module named 'missingno'