In [None]:
import matplotlib.pyplot as plt
import pylab as py
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.stats as sm_stats
import statsmodels.stats.api as sms
import scipy.stats as stats
from sklearn import preprocessing
from numpy.random import seed
from numpy.random import rand
from numpy.random import randn
from numpy import mean
from numpy import var
from math import sqrt
import re
import json

from pandasgui import show


In [None]:
# Load the "personal" and the "other" training tables from the Dataset/ directory.
personal_data, other_data = (
    pd.read_csv(csv_path)
    for csv_path in ("Dataset/personal_train.csv", "Dataset/other_train.csv")
)

## Základné informácie o datasetoch
Dataset personal_data:
* Veľkosť: 3933 záznamov
* Počet stĺpcov: 6
* Typy stĺpcov sú uvedené nižšie (získané pomocou dataset.info())
* **Tento dataset nemá žiadne duplicitné či chýbajúce dáta**
* Dôležité štatistické atribúty:
    * Vek (age)
    * Pohlavie (sex)

Vlastnosti štatistických atribútov sú uvedené nižšie (získané pomocou dataset\['attribute'\].describe())

Dataset other_data:
* Veľkosť: 3983 záznamov
* Počet stĺpcov: 23
* Typy stĺpcov sú uvedené nižšie (získané pomocou dataset.info())
* **Tento dataset obsahuje značné množstvo duplicitných či chýbajúcich dát**
* Dôležité štatistické atribúty:
    * Krajina pôvodu (native-country)
    * Rasa (race)
    * Vzťahy (relationship)
    * Priemerné O2 (mean_oxygen)
    * Tehotenstvo (pregnant)

Vlastnosti štatistických atribútov sú uvedené nižšie (získané pomocou dataset\['attribute'\].describe())

Obsahom datasetu other_data je aj atribút medical_info, ktorý obsahuje reťazec podobný formátu JSON s ďalšími atribútmi, ktoré môžu byť štatisticky dôležité. Preto bol tento reťazec extrahovaný a v spojení s atribútom name z datasetu other_data pridaný do samostatného datasetu medical_info_dataset.

*Poznámka: Boli pridávané iba záznamy pre jedinečné hodnoty atribútu name a s nechýbajúcim atribútom medical_info.*

Dataset medical_info_dataset:
* Veľkosť: 3927 záznamov
* Počet stĺpcov: 5
* Typy stĺpcov sú uvedené nižšie (získané pomocou dataset.info())

Všetky atribúty tohto datasetu sú štatisticky dôležité a ich vlastnosti sú uvedené nižšie (získané pomocou dataset\['attribute'\].describe())

Všetky vyššie spomenuté datasety sme spojili do jedného datasetu, aby sa dalo jednoduchšie pracovať so všetkými dôležitými dátami.

Dataset usefull_dataset:
* Veľkosť: 3933 záznamov
* Počet stĺpcov: 29
* Typy stĺpcov sú uvedené nižšie (získané pomocou dataset.info())

In [None]:
# Give the first CSV column the name "Id".
# "Id" deliberately stays an ordinary column (it is NOT made the index): the later merge
# relies on both tables having an "Id" column and drops the resulting 'Id_y'.
# A bare `personal_data.set_index(...)` has no effect here because set_index returns a
# new frame, so that call is omitted; rename() is assigned back instead of using inplace.
personal_data = personal_data.rename(columns={personal_data.columns[0]: "Id"})

In [None]:
# Preview the first rows of personal_data.
personal_data.head()

In [None]:
# Print the column names, non-null counts and dtypes of personal_data.
personal_data.info()

In [None]:
# Summary statistics of the 'age' column.
personal_data['age'].describe()

In [None]:
# Summary statistics of the 'sex' column.
personal_data['sex'].describe()

In [None]:
# Give the first CSV column the name "Id".
# "Id" deliberately stays an ordinary column (it is NOT made the index): the later merge
# relies on both tables having an "Id" column and drops the resulting 'Id_y'.
# A bare `other_data.set_index(...)` has no effect here because set_index returns a
# new frame, so that call is omitted; rename() is assigned back instead of using inplace.
other_data = other_data.rename(columns={other_data.columns[0]: "Id"})

In [None]:
# Preview the first rows of other_data.
other_data.head()

In [None]:
# Print the column names, non-null counts and dtypes of other_data.
other_data.info()

In [None]:
# Summary statistics of the 'native-country' column.
other_data['native-country'].describe()

In [None]:
# Summary statistics of the 'race' column.
other_data['race'].describe()

In [None]:
# Summary statistics of the 'relationship' column.
other_data['relationship'].describe()

In [None]:
# Summary statistics of the 'mean_oxygen' column.
other_data['mean_oxygen'].describe()

In [None]:
# Summary statistics of the 'pregnant' column.
other_data['pregnant'].describe()

In [None]:
# Keep one row per 'name' (the first occurrence wins) and preview the result.
unique_names_dataset = other_data.drop_duplicates(subset=['name'], keep='first')
unique_names_dataset.head()

In [None]:
# Build medical_info_dataset: one row per entry of unique_names_dataset that has a
# parseable 'medical_info' value, holding the parsed fields plus the record's 'name'.
medical_data_objects = []
name_and_info = zip(unique_names_dataset['name'], unique_names_dataset['medical_info'])
for patient_name, raw_info in name_and_info:
    # Float entries are skipped (pandas represents a missing cell in a text column as float NaN).
    if isinstance(raw_info, float):
        continue
    # Make the dict-like text valid JSON: single quotes become double quotes, then the
    # quotes wrapping each value (right after ':' and right before ',' or '}') are removed
    # so the values are parsed unquoted. The order of the replacements matters.
    json_text = (
        raw_info
        .replace("\'", '\"')
        .replace(':\"', ':')
        .replace('\",', ',')
        .replace('\"}', '}')
    )
    parsed_info = json.loads(json_text)
    parsed_info['name'] = patient_name
    medical_data_objects.append(parsed_info)

medical_info_dataset = pd.DataFrame(medical_data_objects)
medical_info_dataset.describe()


In [None]:
# Print the column names, non-null counts and dtypes of medical_info_dataset.
medical_info_dataset.info()

In [None]:
# Distribution of mean_glucose, restricted to values strictly between 0 and 300.
mean_glucose = medical_info_dataset['mean_glucose']
glucose_in_range = (mean_glucose > 0) & (mean_glucose < 300)
sns.distplot(mean_glucose[glucose_in_range])

In [None]:
# Count how many names from personal_data also occur in unique_names_dataset
# (True = present, False = absent). This is a membership check only: isin() does not
# compare row positions, so it does not show that names match on equal positions.
personal_data['name'].isin(unique_names_dataset['name']).value_counts()

In [None]:
# Combine all tables into a single one (usefull_dataset) so plots and analyses can work
# from one place.
# Step 1: attach the parsed medical fields to the de-duplicated rows via 'name', then
#         remove the raw 'medical_info' text column.
other_with_medical = unique_names_dataset.merge(medical_info_dataset, how='outer', on='name')
merged_medical_info_dataset = other_with_medical.drop(columns='medical_info')
# Step 2: join with personal_data on name + address. 'Id' exists on both sides, so pandas
#         suffixes it; the right-hand copy ('Id_y') is removed.
personal_with_other = personal_data.merge(
    merged_medical_info_dataset,
    how='outer',
    on=['name', 'address'],
)
usefull_dataset = personal_with_other.drop(columns='Id_y')
usefull_dataset.info()

### Distribúcia veku podľa pohlavia

In [None]:
# Split the rows that have a positive age into a male and a female subset.
# The 'sex' labels are compared with their leading space (" Male", " Female").
has_positive_age = usefull_dataset['age'] > 0
male_age = usefull_dataset[(usefull_dataset['sex'] == " Male") & has_positive_age]
female_age = usefull_dataset[(usefull_dataset['sex'] == " Female") & has_positive_age]

In [None]:
# Compare the age distribution of the male and female subsets on one set of axes.
# Each distribution gets a label and a legend is drawn, so the two groups can be
# identified in the figure.
sns.distplot(male_age['age'], label='Male')
sns.distplot(female_age['age'], label='Female')
plt.legend()
# Per-sex summary statistics of age (positive ages only); shown as the cell output.
usefull_dataset[usefull_dataset['age'] > 0].groupby(['sex'])['age'].describe()

In [None]:
# Levene's test on age: H0 = the male and female samples have equal variances.
age_sex_levene_test = stats.levene(male_age['age'], female_age['age'])
print(age_sex_levene_test)

# Interpret the p-value at the 5% significance level.
alpha = 0.05
print(
    'Equal variances (fail to reject H0)'
    if age_sex_levene_test.pvalue > alpha
    else 'Another variances (reject H0)'
)

In [None]:
# Independent two-sample t-test on age, male vs. female (H0: the group means are equal).
# NOTE(review): ttest_ind defaults to equal_var=True and the Levene result computed for
# these samples is not passed in - confirm that the pooled-variance test is intended.
age_ttest_result = stats.ttest_ind(male_age['age'], female_age['age'])
age_sex_student_ttest = age_ttest_result.statistic
p = age_ttest_result.pvalue
print('Statistics=%.3f, p=%.3f' % (age_sex_student_ttest, p))

# Interpret the p-value at the 5% significance level.
alpha = 0.05
print(
    'Same distributions (fail to reject H0)'
    if p > alpha
    else 'Different distributions (reject H0)'
)

### Distribúcia týždenných hodín podľa pohlavia

In [None]:
# Rows with positive weekly hours, split by sex (the 'sex' labels are compared with
# their leading space).
female_hours = usefull_dataset[(usefull_dataset['sex'] == " Female") & (usefull_dataset['hours-per-week'] > 0)]
male_hours = usefull_dataset[(usefull_dataset['sex'] == " Male") & (usefull_dataset['hours-per-week'] > 0)]
# Overlay both distributions on one set of axes; labels plus a legend make the two
# groups identifiable in the figure.
sns.distplot(male_hours['hours-per-week'], label='Male')
sns.distplot(female_hours['hours-per-week'], label='Female')
plt.legend()
# Per-sex summary statistics of weekly hours (positive values only); shown as the cell output.
usefull_dataset[usefull_dataset['hours-per-week'] > 0].groupby(['sex'])['hours-per-week'].describe()

In [None]:
# Levene's test on weekly hours: H0 = the male and female samples have equal variances.
hours_sex_levene_test = stats.levene(male_hours['hours-per-week'], female_hours['hours-per-week'])
print(hours_sex_levene_test)

# Interpret the p-value at the 5% significance level.
alpha = 0.05
print(
    'Equal variances (fail to reject H0)'
    if hours_sex_levene_test.pvalue > alpha
    else 'Another variances (reject H0)'
)

In [None]:
# Independent two-sample t-test on weekly hours, male vs. female (H0: the group means
# are equal). Both samples must come from the positive-hours subsets (male_hours and
# female_hours); the age-filtered subsets are not filtered on 'hours-per-week' and
# would feed the wrong rows into the test.
hours_sex_student_ttest, p = stats.ttest_ind(male_hours['hours-per-week'], female_hours['hours-per-week'])
print('Statistics=%.3f, p=%.3f' % (hours_sex_student_ttest, p))

# Interpret the p-value at the 5% significance level.
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')