In [None]:
import matplotlib.pyplot as plt
import pylab as py
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.stats as sm_stats
import statsmodels.stats.api as sms
import scipy.stats as stats
from sklearn import preprocessing
from numpy.random import seed
from numpy.random import rand
from numpy.random import randn
from numpy import mean
from numpy import var
from math import sqrt
import re
import json
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import math

In [None]:
# Read the two raw CSV exports: the personal records and the table whose
# 'medical_info' column is expanded later by get_aggregated_data.
personal_data, other_data = (
    pd.read_csv(csv_path)
    for csv_path in ("Dataset/personal_valid.csv", "Dataset/other_valid.csv")
)

In [None]:
def prepare_data(personal_data, other_data):
    """Run the whole preprocessing pipeline and return the preprocessed frame.

    Steps, in order:
      1. drop the stray 'Unnamed: 0' column from both inputs (if present),
      2. expand the 'medical_info' records of ``other_data``
         (``get_aggregated_data``),
      3. outer-merge them with ``personal_data`` on 'name' and 'address',
      4. impute missing values (``remove_nans``),
      5. select the features (``feature_reduction``),
      6. limit outliers (``outlier_detection``),
      7. scale and power-transform (``transform``).

    'age' and 'class' are put back with the values they had after step 5, so
    steps 6 and 7 do not affect them. The result is also written to
    'Dataset/preprocessed_dataset.csv' (without the index).

    Parameters
    ----------
    personal_data : pandas.DataFrame
        Personal records; must contain 'name' and 'address'.
    other_data : pandas.DataFrame
        Records with the 'name', 'address' and 'medical_info' columns.

    Returns
    -------
    pandas.DataFrame
        The preprocessed dataset. The input frames are left unmodified.
    """
    # Non-mutating drop: the caller's frames stay intact, so the cell can be
    # re-run on the same inputs and always sees the same data.
    personal_data = personal_data.drop(columns='Unnamed: 0', errors='ignore')
    other_data = other_data.drop(columns='Unnamed: 0', errors='ignore')

    # Merge both sources into a single frame so that plotting and analysis
    # work on one dataset.
    merged_medical_info_dataset = get_aggregated_data(other_data)
    usefull_dataset = personal_data.merge(merged_medical_info_dataset, on=['name', 'address'], how='outer')
    usefull_dataset = remove_nans(usefull_dataset)
    usefull_dataset = feature_reduction(usefull_dataset)

    # Keep the untouched 'age' and 'class' values; they are restored below.
    backup_age_class = usefull_dataset[['age', 'class']].copy()
    usefull_dataset = outlier_detection(usefull_dataset)
    usefull_dataset = transform(usefull_dataset)

    # transform() returns a frame with a fresh default index, so the backup is
    # written back by position (.values) instead of by index label.
    usefull_dataset['age'] = backup_age_class['age'].values
    usefull_dataset['class'] = backup_age_class['class'].values

    usefull_dataset.to_csv('Dataset/preprocessed_dataset.csv', index=False)
    return usefull_dataset


In [None]:
def get_aggregated_data(dataset):
    """Expand the serialized 'medical_info' column into regular columns.

    Rows without 'medical_info' are dropped and only the first row per 'name'
    is kept. Each remaining 'medical_info' text is parsed into a dict whose
    keys become new columns; the original 'medical_info' column is removed
    from the result.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with at least the 'name' and 'medical_info' columns.

    Returns
    -------
    pandas.DataFrame
        One row per name with the parsed measurements as columns.
    """
    with_medical_info = dataset.dropna(subset=['medical_info'])
    one_row_per_name = with_medical_info.drop_duplicates('name')

    def parse_medical_info(raw_text):
        # The text is a dict literal with single quotes and quoted values.
        # Switch to double quotes, then strip the quotes around the values so
        # that json.loads reads them as numbers instead of strings.
        json_text = raw_text.replace("\'", '\"')
        json_text = json_text.replace(':\"', ':')
        json_text = json_text.replace('\",', ',')
        json_text = json_text.replace('\"}', '}')
        return json.loads(json_text)

    parsed_records = []
    for person_name, raw_info in zip(one_row_per_name['name'], one_row_per_name['medical_info']):
        # A float here cannot be parsed as text, so the row is skipped.
        if isinstance(raw_info, float):
            continue
        parsed = parse_medical_info(raw_info)
        parsed['name'] = person_name
        parsed_records.append(parsed)

    expanded = pd.DataFrame(parsed_records)
    combined = one_row_per_name.merge(expanded, on=['name'], how='outer')
    return combined.drop('medical_info', axis='columns')

Remove NaN values


In [None]:
def remove_nans(dataset):
    """Drop rows without a class label and impute the remaining missing values.

    '?' is treated as a missing value. Numeric columns (int64/float64) are
    imputed with the column median, text (object) columns with the most
    frequent value. Every column is cast back to the dtype it had before the
    imputation.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a 'class' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Imputed frame holding the numeric columns first, followed by the text
        columns; columns of any other dtype are not part of the result. The
        index of the kept rows is preserved.
    """
    # Explicit copy: the filtered frame is an independent object, so the
    # replacement below can never write into (a slice of) the caller's frame.
    labelled = dataset[dataset['class'].notnull()].copy()
    # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
    labelled = labelled.replace('?', np.nan)

    numeric_features = labelled.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = labelled.select_dtypes(include=['object']).columns

    num = Pipeline(steps=[('imputer', SimpleImputer(missing_values=np.nan, strategy='median'))])
    cat = Pipeline(steps=[('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent'))])
    full = ColumnTransformer(
        transformers=[
            ('num', num, numeric_features),
            ('cat', cat, categorical_features)])

    # The transformer emits its outputs in the order of its transformers list
    # (numeric block, then categorical block), hence this column order.
    columns = numeric_features.tolist() + categorical_features.tolist()
    # The transformer returns a plain array; remember the dtypes to restore.
    dtype = {column: labelled[column].dtype for column in columns}

    imputed = full.fit_transform(labelled)
    return pd.DataFrame(imputed, columns=columns, index=labelled.index).astype(dtype)

# Feature reduction

In [None]:
def feature_reduction(dataset, k=13):
    """Keep the ``k`` features that score best in a univariate ANOVA F-test.

    The function
      1. drops rows without a 'class' label,
      2. encodes 'pregnant' as 0/1,
      3. removes a fixed list of columns judged to have no influence on the
         target (mostly categorical attributes),
      4. one-hot encodes the remaining text columns,
      5. scores every feature against 'class' with ``f_classif``, prints the
         scores and drops everything ranked below the best ``k``.

    Missing values are expected to be handled before this function is called
    (see ``remove_nans``).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with 'class' and 'pregnant' columns. It is not modified.
    k : int, default 13
        Number of best-scoring features to keep.

    Returns
    -------
    pandas.DataFrame
        'class' plus at most ``k`` feature columns.
    """
    # Rows without a label cannot be scored. The copy keeps the caller's frame
    # untouched, so the function can be called repeatedly on the same input.
    labelled = dataset[dataset['class'].notnull()].copy()

    columns_to_remove = ['name', 'education-num', 'capital-gain', 'capital-loss', 'address', 'date_of_birth', 'occupation', 'marital-status','relationship', 'education', 'native-country', 'workclass', 'income', 'race']

    # NOTE(review): values containing 't'/'T' are encoded as 0 and everything
    # else as 1, which looks inverted for a "pregnant" flag. Kept as is so the
    # produced data does not change; confirm the intended polarity.
    labelled['pregnant'] = labelled['pregnant'].apply(lambda value: 0 if re.search('T', value, re.IGNORECASE) else 1)

    # Manually remove the columns that we believe have no influence on the
    # target value (mostly categorical attributes).
    dataset_for_UST = labelled.drop(columns_to_remove, axis=1, errors='ignore')

    # One-hot encode whatever text columns are left; the target is never encoded.
    categorical_features = [
        column
        for column in dataset_for_UST.select_dtypes(include=['object']).columns
        if column != 'class'
    ]
    dataset_for_UST = pd.get_dummies(dataset_for_UST, columns=categorical_features)

    # Feature selection with univariate statistical tests: score each feature
    # to find the K attributes that influence the result the most.
    UST_X = dataset_for_UST.drop('class', axis=1)
    UST_y = dataset_for_UST['class']
    # Only the scores are used below; k is capped so that frames with fewer
    # than k features are handled as well.
    selector = SelectKBest(score_func=f_classif, k=min(k, UST_X.shape[1]))
    fit = selector.fit(UST_X, UST_y)
    # Summarize the scores: some attributes have a negligible influence on the
    # result, so they are dropped to speed up model fitting.
    print(fit.scores_)

    # NaN scores would make the ordering unreliable; rank them last.
    sortable_scores = np.where(np.isnan(fit.scores_), -np.inf, fit.scores_)
    score_by_column = dict(zip(UST_X.columns, sortable_scores))
    columns_by_rank = sorted(score_by_column, key=score_by_column.get, reverse=True)

    # Everything after the k best columns is removed; with k or fewer features
    # this slice is empty and nothing is dropped.
    columns_below_cut = columns_by_rank[k:]
    return dataset_for_UST.drop(columns_below_cut, axis=1, errors='ignore')

# Odstranenie outlierov
Zvolili sme metodu IQR na detekciu outlierov a hodnoty, ktore su outliermi, nahradime krajnymi hodnotami.

In [None]:
def outlier_detection(dataset):
    """Limit the outliers of every numeric column to the 1.5 * IQR fences.

    For each numeric column the fences ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR`` are computed and values outside of them are replaced
    by the nearest fence. For integer columns the fences are floored so the
    column keeps its integer dtype.

    Columns are left unchanged when they are non-numeric or boolean (no
    meaningful quantiles) or when their IQR is zero or undefined (clipping
    would collapse the whole column to a single value).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to process. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataset`` with the outliers limited.
    """
    result = dataset.copy()
    for column in result.columns:
        values = result[column]
        if pd.api.types.is_bool_dtype(values.dtype) or not pd.api.types.is_numeric_dtype(values.dtype):
            continue

        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        # Also false for NaN, i.e. for empty or all-missing columns.
        if not IQR > 0:
            continue

        lower_bound = Q1 - (1.5 * IQR)
        upper_bound = Q3 + (1.5 * IQR)
        if pd.api.types.is_integer_dtype(values.dtype):
            # Integer fences keep the integer dtype of the column.
            lower_bound = math.floor(lower_bound)
            upper_bound = math.floor(upper_bound)

        result[column] = values.clip(lower=lower_bound, upper=upper_bound)

    # Return only after every column has been processed.
    return result

# Data transformation

In [None]:
def transform(dataset):
    """Robust-scale all columns, then apply a standardized Yeo-Johnson transform.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to scale and transform.

    Returns
    -------
    pandas.DataFrame
        New frame with the same column labels as ``dataset``. It is built
        without an explicit index, so the rows carry a default index.
    """
    column_labels = dataset.columns

    # Step 1: scaling with RobustScaler.
    robust_scaled = pd.DataFrame(RobustScaler().fit_transform(dataset), columns=column_labels)

    # Step 2: power transform of the scaled values.
    yeo_johnson = PowerTransformer(method='yeo-johnson', standardize=True)
    return pd.DataFrame(yeo_johnson.fit_transform(robust_scaled), columns=column_labels)

In [None]:
# Run the whole preprocessing pipeline on the loaded frames. prepare_data also
# writes its result to 'Dataset/preprocessed_dataset.csv'.
dataset = prepare_data(personal_data, other_data)
dataset.info()

In [None]:
# Classify every column by its skewness: values within [-0.5, 0.5] count as
# symmetrical, anything below or above as negatively or positively skewed.
for column in dataset.columns:
    skewness = stats.skew(dataset[column])
    tab = 20 - len(column)
    if -0.5 <= skewness <= 0.5:
        print(column,' ' * tab, 'symmetrical    ', skewness)
    elif skewness < -0.5:
        print(column,' ' * tab, 'negative skew  ', skewness)
    else:
        print(column,' ' * tab, 'positive skew  ', skewness)

# Classify every column by its kurtosis. The thresholds are centred on 3, the
# Pearson kurtosis of a normal distribution, so fisher=False is required
# (the scipy default reports excess kurtosis, where a normal distribution is 0).
for column in dataset.columns:
    kurtosis = stats.kurtosis(dataset[column], fisher=False)
    tab = 20 - len(column)
    if 2.5 <= kurtosis <= 3.5:
        print(column,' ' * tab, 'normal-like    ', kurtosis)
    elif kurtosis < 2.5:
        print(column,' ' * tab, 'in middle      ', kurtosis)
    else:
        print(column,' ' * tab, 'on outer       ', kurtosis)

# Prieskumna analyza
Dva datasety sme spojili do jedneho a odstranili sme duplicitne hodnoty a zaznamy s nulovou (chybajucou) hodnotou atributu class, ktory je pre nas klucovy a nemozeme ho doplnit.

Dalej sme zredukovali pocet atributov na take, ktore najviac kontribuuju k vysledku, a vyhodili take, ktore pravdepodobne vobec nekontribuuju.
Data sme taktiez transformovali aby sa viac podobali normalnej distribucii a pochadzali z podobnych rozsahov hodnot.

Tieto transformacie zmenili vyzor dat a v nasledujucej sekcii sa pozrieme na to ako sa zmenili distribucie a dalsie statisticke atributy dat.

In [None]:
# Column overview of the preprocessed frame.
dataset.info()

In [None]:
# Summary statistics of 'age' (restored by prepare_data after the transform step).
dataset['age'].describe()

In [None]:
# Summary statistics of 'kurtosis_oxygen'.
dataset['kurtosis_oxygen'].describe()

In [None]:
# Summary statistics of 'std_oxygen'.
dataset['std_oxygen'].describe()

In [None]:
# Summary statistics of 'skewness_oxygen'.
dataset['skewness_oxygen'].describe()

In [None]:
# Summary statistics of 'mean_oxygen'.
dataset['mean_oxygen'].describe()

In [None]:
# Summary statistics of 'std_glucose'.
dataset['std_glucose'].describe()

In [None]:
# Summary statistics of 'mean_glucose'.
dataset['mean_glucose'].describe()

In [None]:
# Summary statistics of 'kurtosis_glucose'.
dataset['kurtosis_glucose'].describe()

In [None]:
# Summary statistics of 'skewness_glucose'.
dataset['skewness_glucose'].describe()

Mozeme si vsimnut, ze vsetky atributy maju velmi podobne rozsahy, ale velmi rozdielne priemery, avsak podobne standardne odchylky. Toto je vysledok transformacii.

In [None]:
# Pairwise plots of all columns of the preprocessed frame, coloured by 'class'.
sns.pairplot(dataset, hue='class')

### Parova analyza a zistovanie zavislosti veku (age) a sikmosti glukozy (skewness_glucose)

In [None]:
# Q-Q plot of 'age' with a reference line (line='s').
sm.qqplot(dataset['age'], line='s')

In [None]:
# Q-Q plot of 'skewness_glucose' with a reference line (line='s').
sm.qqplot(dataset['skewness_glucose'], line='s')

In [None]:
# Scatter plot of 'skewness_glucose' against 'age' with a fitted regression line.
sns.regplot(x=dataset['age'], y=dataset['skewness_glucose'])

In [None]:
# Pairwise correlation of all columns.
dataset.corr()

In [None]:
# Pairwise covariance of all columns.
dataset.cov()

In [None]:
# Pearson correlation coefficient of the pair; the second returned value is not used.
corr, _ = stats.pearsonr(dataset['age'], dataset['skewness_glucose']) 
print('Pearsons correlation: %.3f' % corr)

In [None]:
# np.cov returns the 2x2 covariance matrix; element [0, 1] is the covariance of the pair.
covariance = np.cov(dataset['age'], dataset['skewness_glucose'])[0, 1]
print(covariance)

Pre atributy age a skewness_glucose sa nam korelacia takmer nezmenila, no kovariancia je mensia.

In [None]:
# Kruskal-Wallis H-test on 'age' and 'skewness_glucose'. scipy.stats is already
# imported as `stats` in the import cell, so no import is needed in this cell.
stat, p = stats.kruskal(dataset['age'], dataset['skewness_glucose'])

print('Statistics=%.3f, p=%.3f' % (stat, p))

# Interpret the result: H0 is kept when the p-value exceeds alpha.
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')

In [None]:
# Shapiro-Wilk test of 'age'; H0 is kept when the p-value exceeds alpha.
alpha = 0.05
shapiro_test = stats.shapiro(dataset['age'])
print(shapiro_test)

print(
    'Normal distribution (fail to reject H0)'
    if shapiro_test.pvalue > alpha
    else 'Another distributions (reject H0)'
)

In [None]:
# Shapiro-Wilk test of 'skewness_glucose'; H0 is kept when the p-value exceeds alpha.
alpha = 0.05
shapiro_test = stats.shapiro(dataset['skewness_glucose'])
print(shapiro_test)

print(
    'Normal distribution (fail to reject H0)'
    if shapiro_test.pvalue > alpha
    else 'Another distributions (reject H0)'
)

Parova analyza mean_glucose a mean_oxygen

In [None]:
# Pearson correlation coefficient of the pair; the second returned value is not used.
corr, _ = stats.pearsonr(dataset['mean_oxygen'], dataset['mean_glucose']) 
print('Pearsons correlation: %.3f' % corr)

In [None]:
# np.cov returns the 2x2 covariance matrix; element [0, 1] is the covariance of the pair.
covariance = np.cov(dataset['mean_oxygen'], dataset['mean_glucose'])[0, 1]
print(covariance)

In [None]:
# Kruskal-Wallis H-test on 'mean_oxygen' and 'mean_glucose'. scipy.stats is
# already imported as `stats` in the import cell, so no import is needed here.
stat, p = stats.kruskal(dataset['mean_oxygen'], dataset['mean_glucose'])

print('Statistics=%.3f, p=%.3f' % (stat, p))

# Interpret the result: H0 is kept when the p-value exceeds alpha.
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')

In [None]:
# Shapiro-Wilk test of 'mean_oxygen'; H0 is kept when the p-value exceeds alpha.
alpha = 0.05
shapiro_test = stats.shapiro(dataset['mean_oxygen'])
print(shapiro_test)

print(
    'Normal distribution (fail to reject H0)'
    if shapiro_test.pvalue > alpha
    else 'Another distributions (reject H0)'
)

In [None]:
# Shapiro-Wilk test of 'mean_glucose'; H0 is kept when the p-value exceeds alpha.
alpha = 0.05
shapiro_test = stats.shapiro(dataset['mean_glucose'])
print(shapiro_test)

print(
    'Normal distribution (fail to reject H0)'
    if shapiro_test.pvalue > alpha
    else 'Another distributions (reject H0)'
)