In [40]:
import matplotlib.pyplot as plt
import pylab as py
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.stats as sm_stats
import statsmodels.stats.api as sms
import scipy.stats as stats
from sklearn import preprocessing
from numpy.random import seed
from numpy.random import rand
from numpy.random import randn
from numpy import mean
from numpy import var
from math import sqrt
import re
import json

In [41]:
# Read both training CSV files; the paths are relative to the notebook's
# working directory.
personal_data, other_data = (
    pd.read_csv(csv_path)
    for csv_path in ("Dataset/personal_train.csv", "Dataset/other_train.csv")
)

In [42]:
# Remove the 'Unnamed: 0' column from each frame when it exists
# (presumably a leftover index column from how the CSVs were written -- confirm).
# errors='ignore' turns the drop into a no-op for a frame without that column.
personal_data = personal_data.drop(columns='Unnamed: 0', errors='ignore')
other_data = other_data.drop(columns='Unnamed: 0', errors='ignore')

In [43]:
# Print a structural summary of personal_data: index range, column names,
# non-null counts, dtypes and memory usage.
personal_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3933 entries, 0 to 3932
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           3933 non-null   object
 1   address        3933 non-null   object
 2   age            3933 non-null   int64 
 3   sex            3933 non-null   object
 4   date_of_birth  3933 non-null   object
dtypes: int64(1), object(4)
memory usage: 153.8+ KB


In [44]:
# Print a structural summary of other_data; the non-null counts show which
# columns contain missing values.
other_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3983 entries, 0 to 3982
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             3983 non-null   object 
 1   address          3983 non-null   object 
 2   kurtosis_oxygen  3973 non-null   float64
 3   occupation       3963 non-null   object 
 4   marital-status   3967 non-null   object 
 5   pregnant         3973 non-null   object 
 6   education-num    3572 non-null   float64
 7   relationship     3970 non-null   object 
 8   std_oxygen       3966 non-null   float64
 9   capital-gain     3970 non-null   float64
 10  skewness_oxygen  3963 non-null   float64
 11  education        3963 non-null   object 
 12  fnlwgt           3974 non-null   float64
 13  class            3966 non-null   float64
 14  income           3966 non-null   object 
 15  medical_info     3970 non-null   object 
 16  native-country   3973 non-null   object 
 17  capital-loss  

In [45]:
# Keep only rows that carry a 'medical_info' value, then reduce to one row per
# distinct 'medical_info' and finally one row per distinct 'name'.
# drop_duplicates keeps the first occurrence in both passes.
unique_medical_name_dataset = (
    other_data
    .dropna(subset=['medical_info'])
    .drop_duplicates(subset='medical_info')
    .drop_duplicates(subset='name')
)
unique_medical_name_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3933 entries, 0 to 3982
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             3933 non-null   object 
 1   address          3933 non-null   object 
 2   kurtosis_oxygen  3929 non-null   float64
 3   occupation       3926 non-null   object 
 4   marital-status   3925 non-null   object 
 5   pregnant         3929 non-null   object 
 6   education-num    3535 non-null   float64
 7   relationship     3926 non-null   object 
 8   std_oxygen       3925 non-null   float64
 9   capital-gain     3928 non-null   float64
 10  skewness_oxygen  3925 non-null   float64
 11  education        3926 non-null   object 
 12  fnlwgt           3928 non-null   float64
 13  class            3923 non-null   float64
 14  income           3927 non-null   object 
 15  medical_info     3933 non-null   object 
 16  native-country   3931 non-null   object 
 17  capital-loss  

In [46]:
def _medical_info_to_record(raw_info, person_name):
    """Convert one raw 'medical_info' string into a dict tagged with 'name'.

    The text is rewritten into JSON before parsing: every single quote becomes
    a double quote, then the double quote right after a colon and the one
    right before a comma or closing brace are removed, so the values reach
    ``json.loads`` as bare JSON literals instead of quoted strings.
    """
    json_text = raw_info.replace("'", '"')
    for quoted, bare in ((':"', ':'), ('",', ','), ('"}', '}')):
        json_text = json_text.replace(quoted, bare)
    parsed = json.loads(json_text)
    parsed['name'] = person_name
    return parsed


# Build a dataset from the 'medical_info' attribute, one row per source row.
# Entries that are floats are skipped (presumably NaN placeholders for missing
# values -- they cannot be parsed as text).
medical_data_objects = [
    _medical_info_to_record(raw_info, person_name)
    for person_name, raw_info in zip(
        unique_medical_name_dataset['name'],
        unique_medical_name_dataset['medical_info'],
    )
    if not isinstance(raw_info, float)
]
medical_info_dataset = pd.DataFrame(medical_data_objects)
medical_info_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3933 entries, 0 to 3932
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   mean_glucose      3933 non-null   float64
 1   std_glucose       3933 non-null   float64
 2   kurtosis_glucose  3933 non-null   float64
 3   skewness_glucose  3933 non-null   float64
 4   name              3933 non-null   object 
dtypes: float64(4), object(1)
memory usage: 153.8+ KB


In [47]:
# Merge the datasets into a single wide dataset of useful data so that graphs
# and analysis are easier to produce.
# Both joins are meant to pair each key with exactly one row on each side.
# validate='one_to_one' makes pandas raise MergeError if a key repeats, instead
# of silently multiplying rows in the result.
merged_medical_info_dataset = (
    unique_medical_name_dataset
    .merge(medical_info_dataset, on=['name'], how='outer', validate='one_to_one')
    # The parsed fields now live in their own columns, so the raw text is dropped.
    .drop('medical_info', axis='columns')
)
usefull_dataset = personal_data.merge(
    merged_medical_info_dataset,
    on=['name', 'address'],
    how='outer',
    validate='one_to_one',
)
usefull_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3933 entries, 0 to 3932
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              3933 non-null   object 
 1   address           3933 non-null   object 
 2   age               3933 non-null   int64  
 3   sex               3933 non-null   object 
 4   date_of_birth     3933 non-null   object 
 5   kurtosis_oxygen   3929 non-null   float64
 6   occupation        3926 non-null   object 
 7   marital-status    3925 non-null   object 
 8   pregnant          3929 non-null   object 
 9   education-num     3535 non-null   float64
 10  relationship      3926 non-null   object 
 11  std_oxygen        3925 non-null   float64
 12  capital-gain      3928 non-null   float64
 13  skewness_oxygen   3925 non-null   float64
 14  education         3926 non-null   object 
 15  fnlwgt            3928 non-null   float64
 16  class             3923 non-null   float64


In [48]:
# Show the distinct raw values of 'occupation' before any cleaning.
usefull_dataset['occupation'].unique()

array([' Sales', ' Handlers-cleaners', ' Craft-repair', ' Adm-clerical',
       ' Exec-managerial', ' Machine-op-inspct', ' Farming-fishing',
       ' Other-service', ' Prof-specialty', ' Prof_specialty',
       ' Protective-serv', ' Transport-moving', ' Machine_op_inspct',
       ' Other_service', ' Adm_clerical', ' ?', ' Tech_support',
       ' Exec_managerial', ' Priv-house-serv', ' Craft_repair',
       ' Protective_serv', ' Tech-support', ' Handlers_cleaners', nan,
       ' Transport_moving', ' Farming_fishing', ' Armed-Forces',
       ' Priv_house_serv'], dtype=object)

In [49]:
def _normalize_occupation(value):
    """Return a canonical occupation label for one raw cell value.

    Missing values (NaN, None, pd.NA) become 'unknown'. For strings, every
    space is removed, underscores become hyphens and '?' becomes 'unknown',
    so variants such as ' Prof_specialty' and ' Prof-specialty' collapse
    into a single category.
    """
    # pd.isna covers every missing-value marker; an exact `type(...) is float`
    # check would let None / pd.NA / numpy floats through to .replace and crash.
    if pd.isna(value):
        return 'unknown'
    return value.replace(' ', '').replace('_', '-').replace('?', 'unknown')


usefull_dataset['occupation'] = usefull_dataset['occupation'].map(_normalize_occupation)
usefull_dataset['occupation'].unique()

array(['Sales', 'Handlers-cleaners', 'Craft-repair', 'Adm-clerical',
       'Exec-managerial', 'Machine-op-inspct', 'Farming-fishing',
       'Other-service', 'Prof-specialty', 'Protective-serv',
       'Transport-moving', 'unknown', 'Tech-support', 'Priv-house-serv',
       'Armed-Forces'], dtype=object)

In [51]:
# Store the distinct values of the 'occupation' column in occupation_categories.
occupation_categories = usefull_dataset['occupation'].unique()

['Sales' 'Handlers-cleaners' 'Craft-repair' 'Adm-clerical'
 'Exec-managerial' 'Machine-op-inspct' 'Farming-fishing' 'Other-service'
 'Prof-specialty' 'Protective-serv' 'Transport-moving' 'unknown'
 'Tech-support' 'Priv-house-serv' 'Armed-Forces']
