In [966]:
import matplotlib.pyplot as plt
import pylab as py
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.stats as sm_stats
import statsmodels.stats.api as sms
import scipy.stats as stats
from sklearn import preprocessing
from numpy.random import seed
from numpy.random import rand
from numpy.random import randn
from numpy import mean
from numpy import var
from math import sqrt
import re
import json
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [967]:
personal_data = pd.read_csv("Dataset/personal_train.csv")
other_data = pd.read_csv("Dataset/other_train.csv")

In [968]:
if 'Unnamed: 0' in personal_data:
    del personal_data['Unnamed: 0']
if 'Unnamed: 0' in other_data:
    del other_data['Unnamed: 0']

In [969]:
unique_medical_name_dataset = other_data.dropna(subset=['medical_info']).drop_duplicates('name')
unique_medical_name_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3933 entries, 0 to 3982
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             3933 non-null   object 
 1   address          3933 non-null   object 
 2   kurtosis_oxygen  3929 non-null   float64
 3   occupation       3926 non-null   object 
 4   marital-status   3925 non-null   object 
 5   pregnant         3929 non-null   object 
 6   education-num    3535 non-null   float64
 7   relationship     3926 non-null   object 
 8   std_oxygen       3925 non-null   float64
 9   capital-gain     3928 non-null   float64
 10  skewness_oxygen  3925 non-null   float64
 11  education        3926 non-null   object 
 12  fnlwgt           3928 non-null   float64
 13  class            3923 non-null   float64
 14  income           3927 non-null   object 
 15  medical_info     3933 non-null   object 
 16  native-country   3931 non-null   object 
 17  capital-loss  

In [970]:
# create a dataset from 'medical_info' attribute
medical_data_objects = []
for index, record in unique_medical_name_dataset.iterrows():
    if isinstance(record['medical_info'], float):
        continue
    medical_object = json.loads(record['medical_info'].replace("\'", '\"').replace(':\"',':').replace('\",',',').replace('\"}','}'))
    medical_object['name'] = record['name']
    medical_data_objects.append(medical_object)
medical_info_dataset = pd.DataFrame(medical_data_objects)
medical_info_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3933 entries, 0 to 3932
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   mean_glucose      3933 non-null   float64
 1   std_glucose       3933 non-null   float64
 2   kurtosis_glucose  3933 non-null   float64
 3   skewness_glucose  3933 non-null   float64
 4   name              3933 non-null   object 
dtypes: float64(4), object(1)
memory usage: 153.8+ KB


In [971]:
# merge datasets to create single large dataset with usefull data so it's easier to create graphs and analysis
merged_medical_info_dataset = unique_medical_name_dataset.merge(medical_info_dataset, on=['name'], how='outer').drop('medical_info', axis='columns')
usefull_dataset = personal_data.merge(merged_medical_info_dataset, on=['name', 'address'], how='outer')
usefull_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3933 entries, 0 to 3932
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              3933 non-null   object 
 1   address           3933 non-null   object 
 2   age               3933 non-null   int64  
 3   sex               3933 non-null   object 
 4   date_of_birth     3933 non-null   object 
 5   kurtosis_oxygen   3929 non-null   float64
 6   occupation        3926 non-null   object 
 7   marital-status    3925 non-null   object 
 8   pregnant          3929 non-null   object 
 9   education-num     3535 non-null   float64
 10  relationship      3926 non-null   object 
 11  std_oxygen        3925 non-null   float64
 12  capital-gain      3928 non-null   float64
 13  skewness_oxygen   3925 non-null   float64
 14  education         3926 non-null   object 
 15  fnlwgt            3928 non-null   float64
 16  class             3923 non-null   float64


Remove NaN values


In [972]:
# test = usefull_dataset.drop(['name','address','date_of_birth','occupation','marital-status','education-num','relationship','capital-gain','education','income','capital-loss','hours-per-week','workclass'],axis=1)

test = usefull_dataset
test = test[test['class'].isnull()==False]
test.replace('?',np.NaN,inplace=True)

In [973]:
X = test.drop('class', axis=1)
y = test['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print(X_train.info(),X_test.info())
print(y_train.count(),y_test.count())


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3138 entries, 2111 to 2551
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              3138 non-null   object 
 1   address           3138 non-null   object 
 2   age               3138 non-null   int64  
 3   sex               3138 non-null   object 
 4   date_of_birth     3138 non-null   object 
 5   kurtosis_oxygen   3136 non-null   float64
 6   occupation        3132 non-null   object 
 7   marital-status    3133 non-null   object 
 8   pregnant          3135 non-null   object 
 9   education-num     2818 non-null   float64
 10  relationship      3135 non-null   object 
 11  std_oxygen        3134 non-null   float64
 12  capital-gain      3134 non-null   float64
 13  skewness_oxygen   3133 non-null   float64
 14  education         3133 non-null   object 
 15  fnlwgt            3134 non-null   float64
 16  income            3133 non-null   objec

In [974]:
numeric_features = test.select_dtypes(include=['int64', 'float64']).columns
categorical_features = test.select_dtypes(include=['object']).columns

print(numeric_features.tolist())
print(categorical_features.tolist())

['age', 'kurtosis_oxygen', 'education-num', 'std_oxygen', 'capital-gain', 'skewness_oxygen', 'fnlwgt', 'class', 'capital-loss', 'mean_oxygen', 'hours-per-week', 'mean_glucose', 'std_glucose', 'kurtosis_glucose', 'skewness_glucose']
['name', 'address', 'sex', 'date_of_birth', 'occupation', 'marital-status', 'pregnant', 'relationship', 'education', 'income', 'native-country', 'race', 'workclass']


In [975]:
num = Pipeline(steps=[('imputer', SimpleImputer(missing_values=np.nan,strategy='median'))])
cat = Pipeline(steps=[('imputer', SimpleImputer(missing_values=np.nan,strategy='most_frequent'))])
full = ColumnTransformer(
    transformers=[
        ('num',  num, numeric_features),
        ('cat', cat, categorical_features)])

columns = numeric_features.tolist() + categorical_features.tolist()
dtype = {}
for column in columns:
    dtype[column]=(test.dtypes.to_dict()[column])

test2 = pd.DataFrame(full.fit_transform(test), columns=columns,index=test.index).astype(dtype)
test2.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3923 entries, 0 to 3932
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               3923 non-null   int64  
 1   kurtosis_oxygen   3923 non-null   float64
 2   education-num     3923 non-null   float64
 3   std_oxygen        3923 non-null   float64
 4   capital-gain      3923 non-null   float64
 5   skewness_oxygen   3923 non-null   float64
 6   fnlwgt            3923 non-null   float64
 7   class             3923 non-null   float64
 8   capital-loss      3923 non-null   float64
 9   mean_oxygen       3923 non-null   float64
 10  hours-per-week    3923 non-null   float64
 11  mean_glucose      3923 non-null   float64
 12  std_glucose       3923 non-null   float64
 13  kurtosis_glucose  3923 non-null   float64
 14  skewness_glucose  3923 non-null   float64
 15  name              3923 non-null   object 
 16  address           3923 non-null   object 


# Feature reduction

In [976]:
columns_to_remove = ['name', 'education-num', 'capital-gain', 'capital-loss', 'address', 'date_of_birth', 'occupation', 'marital-status','relationship', 'education', 'native-country', 'workclass', 'income', 'race']

test2['pregnant'] = test2['pregnant'].apply(lambda value: 0 if re.search('T', value, re.IGNORECASE) else 1)
# manualne odstranenie stlpcov ktore podla nas nemaju vplyv na vyslednu hodnotu (vacsinou kategoricke atributy)
dataset_for_UST = test2.drop(columns_to_remove, axis=1, errors='ignore')
columns_to_remove.append('pregnant')
categorical_features = categorical_features.drop(columns_to_remove, errors='ignore')

dataset_for_UST = pd.get_dummies(dataset_for_UST, columns=categorical_features)
dataset_for_UST.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3923 entries, 0 to 3932
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               3923 non-null   int64  
 1   kurtosis_oxygen   3923 non-null   float64
 2   std_oxygen        3923 non-null   float64
 3   skewness_oxygen   3923 non-null   float64
 4   fnlwgt            3923 non-null   float64
 5   class             3923 non-null   float64
 6   mean_oxygen       3923 non-null   float64
 7   hours-per-week    3923 non-null   float64
 8   mean_glucose      3923 non-null   float64
 9   std_glucose       3923 non-null   float64
 10  kurtosis_glucose  3923 non-null   float64
 11  skewness_glucose  3923 non-null   float64
 12  pregnant          3923 non-null   int64  
 13  sex_ Female       3923 non-null   uint8  
 14  sex_ Male         3923 non-null   uint8  
dtypes: float64(11), int64(2), uint8(2)
memory usage: 436.7 KB


In [977]:
# Feature Selection with Univariate Statistical Tests
# pouzijeme statisticke tety aby sme vybrali K najlepsich atributov ktore najviac vplyvaju na vysledok
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

UST_X = dataset_for_UST.drop('class', axis=1)
UST_y = dataset_for_UST['class']

# feature extraction
test = SelectKBest(score_func=f_classif, k=10)
fit = test.fit(UST_X, UST_y)
# summarize scores
#vidime ze niektore atributy maju miniaturny vplyv na vysledok priam az zanedbatelny tak ich vyhodime aby sme zrychlili vypocet modelu
set_printoptions(precision=3)
print(fit.scores_)
features_columns = {}
for i in range(len(fit.scores_)):
    features_columns[UST_X.columns[i]] = fit.scores_[i]
columns_by_rank = {k: features_columns[k] for k in sorted(features_columns, key=features_columns.get, reverse=True)}
columns_to_remove = list(columns_by_rank.keys())[-4:]
reduced_dataset = dataset_for_UST.drop(columns_to_remove, axis=1, errors='ignore')
reduced_dataset.info()

[6.021e+02 1.762e+03 2.829e+03 7.112e+02 1.369e+00 1.428e+03 2.048e+00
 1.382e+00 1.318e+03 6.465e+03 3.406e+03 3.278e-01 5.135e-02 5.135e-02]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3923 entries, 0 to 3932
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               3923 non-null   int64  
 1   kurtosis_oxygen   3923 non-null   float64
 2   std_oxygen        3923 non-null   float64
 3   skewness_oxygen   3923 non-null   float64
 4   class             3923 non-null   float64
 5   mean_oxygen       3923 non-null   float64
 6   hours-per-week    3923 non-null   float64
 7   mean_glucose      3923 non-null   float64
 8   std_glucose       3923 non-null   float64
 9   kurtosis_glucose  3923 non-null   float64
 10  skewness_glucose  3923 non-null   float64
dtypes: float64(10), int64(1)
memory usage: 367.8 KB


In [978]:
reduced_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3923 entries, 0 to 3932
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               3923 non-null   int64  
 1   kurtosis_oxygen   3923 non-null   float64
 2   std_oxygen        3923 non-null   float64
 3   skewness_oxygen   3923 non-null   float64
 4   class             3923 non-null   float64
 5   mean_oxygen       3923 non-null   float64
 6   hours-per-week    3923 non-null   float64
 7   mean_glucose      3923 non-null   float64
 8   std_glucose       3923 non-null   float64
 9   kurtosis_glucose  3923 non-null   float64
 10  skewness_glucose  3923 non-null   float64
dtypes: float64(10), int64(1)
memory usage: 367.8 KB


# Odstranenie outlierov
zvolili sme metodu IRQ na detekciu outlierov a hodnoty ktore su outlieri nahradime krajnymi hodnotami

In [979]:
import math
for column in reduced_dataset.columns:
    Q1 = reduced_dataset[column].quantile(0.25)
    Q3 = reduced_dataset[column].quantile(0.75)

    IRQ = Q3 - Q1

    lower_bound = Q1 - (1.5*IRQ)
    upper_bound = Q3 + (1.5*IRQ)
    if(reduced_dataset[column].dtype == 'int64'):
        reduced_dataset[column] = reduced_dataset[column].apply(lambda val: math.floor(lower_bound) if val < lower_bound else math.floor(upper_bound) if val > upper_bound else val)
    else:
        reduced_dataset[column] = reduced_dataset[column].apply(lambda val: lower_bound if val < lower_bound else upper_bound if val > upper_bound else val)

In [980]:
#reduced_dataset.info()
backup_age_class = reduced_dataset[['age', 'class']]
backup_age_class.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3923 entries, 0 to 3932
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     3923 non-null   int64  
 1   class   3923 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 91.9 KB


# Data transofrmation

In [981]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaled = scaler.fit_transform(reduced_dataset)
reduced_dataset = pd.DataFrame(scaled, columns=reduced_dataset.columns)
reduced_dataset.head()

Unnamed: 0,age,kurtosis_oxygen,std_oxygen,skewness_oxygen,class,mean_oxygen,hours-per-week,mean_glucose,std_glucose,kurtosis_glucose,skewness_glucose
0,0.375,0.082247,-0.075426,0.089498,0.0,-0.041283,-1.0,-0.496101,-0.021007,0.130655,0.088959
1,0.0,-0.663888,0.997205,-0.510628,1.0,1.085448,-1.0,-1.263178,-1.489166,2.162239,2.275891
2,0.75,-0.53012,0.487227,-0.444128,0.0,0.492008,1.0,0.012966,-0.183808,0.194317,0.15898
3,0.875,-0.021277,0.085073,-0.049392,0.0,0.038442,0.0,0.098656,1.202835,-0.059191,-0.425245
4,0.75,-1.13287,1.948401,-0.608387,1.0,2.389726,-1.0,1.858358,-0.22169,2.162239,2.275891


In [982]:
for column in reduced_dataset.columns:
    skewness = stats.skew(reduced_dataset[column])
    tab = 20 - len(column)
    if skewness <= 0.5 and skewness >= -0.5:
        print(column,' ' * tab, 'symmetrical    ', skewness)
    elif skewness < -0.5:
        print(column,' ' * tab, 'negative skew  ', skewness)
    else:
        print(column,' ' * tab, 'positive skew  ', skewness)

for column in reduced_dataset.columns:
    kurtosis = stats.kurtosis(reduced_dataset[column])
    tab = 20 - len(column)
    if kurtosis <= 3.5 and kurtosis >= 2.5:
        print(column,' ' * tab, 'symmetrical    ', skewness)
    elif skewness < 2.5:
        print(column,' ' * tab, 'in middle      ', skewness)
    else:
        print(column,' ' * tab, 'on outer       ', skewness)

age                   symmetrical     -0.11261830459291686
kurtosis_oxygen       symmetrical     0.26435737046762103
std_oxygen            positive skew   1.1391044303230098
skewness_oxygen       positive skew   1.0411017300995582
class                 positive skew   1.1201604135483203
mean_oxygen           positive skew   1.1759869645362848
hours-per-week        symmetrical     0.44295735333985503
mean_glucose          symmetrical     -0.3820634557028439
std_glucose           symmetrical     -0.21922145316305633
kurtosis_glucose      positive skew   0.9002970201047747
skewness_glucose      positive skew   1.1098057557232073
age                   in middle       1.1098057557232073
kurtosis_oxygen       in middle       1.1098057557232073
std_oxygen            in middle       1.1098057557232073
skewness_oxygen       in middle       1.1098057557232073
class                 in middle       1.1098057557232073
mean_oxygen           in middle       1.1098057557232073
hours-per-week        in

In [984]:
from sklearn.preprocessing import PowerTransformer

power = PowerTransformer(method='yeo-johnson', standardize=True) 
data_trans = power.fit_transform(reduced_dataset)
reduced_dataset = pd.DataFrame(data_trans, columns=reduced_dataset.columns)
reduced_dataset['age'] = backup_age_class['age'].values
reduced_dataset['class'] = backup_age_class['class'].values
reduced_dataset.head()

Unnamed: 0,age,kurtosis_oxygen,std_oxygen,skewness_oxygen,class,mean_oxygen,hours-per-week,mean_glucose,std_glucose,kurtosis_glucose,skewness_glucose
0,58,0.250632,-0.343399,0.253157,0.0,-0.388419,-1.03652,-0.373083,0.016463,0.047842,0.04649
1,52,-0.857126,1.111366,-1.087999,1.0,1.159388,-1.03652,-1.074306,-1.821688,1.540945,1.471494
2,64,-0.651324,0.722082,-0.895583,0.0,0.806858,0.69707,0.143306,-0.205758,0.124239,0.169382
3,66,0.103456,0.104006,0.018476,0.0,-0.046635,-0.063748,0.236123,1.879615,-0.206186,-1.554265
4,64,-1.598549,1.461677,-1.391921,1.0,1.405724,-1.03652,2.452943,-0.256553,1.540945,1.471494
