In [None]:
import matplotlib.pyplot as plt
import pylab as py
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.stats as sm_stats
import statsmodels.stats.api as sms
import scipy.stats as stats
from sklearn import preprocessing
from numpy.random import seed
from numpy.random import rand
from numpy.random import randn
from numpy import mean
from numpy import var
from math import sqrt
import re
import json

In [None]:
personal_data = pd.read_csv("Dataset/personal_train.csv")
other_data = pd.read_csv("Dataset/other_train.csv")

In [None]:
if 'Unnamed: 0' in personal_data:
    del personal_data['Unnamed: 0']
if 'Unnamed: 0' in other_data:
    del other_data['Unnamed: 0']

In [None]:
personal_data.info()

In [None]:
other_data.info()

In [None]:
unique_medical_name_dataset = other_data.dropna(subset=['medical_info']).drop_duplicates('name')
unique_medical_name_dataset.info()

In [None]:
# create a dataset from 'medical_info' attribute
medical_data_objects = []
for index, record in unique_medical_name_dataset.iterrows():
    if isinstance(record['medical_info'], float):
        continue
    medical_object = json.loads(record['medical_info'].replace("\'", '\"').replace(':\"',':').replace('\",',',').replace('\"}','}'))
    medical_object['name'] = record['name']
    medical_data_objects.append(medical_object)
medical_info_dataset = pd.DataFrame(medical_data_objects)
medical_info_dataset.info()

In [None]:
# merge datasets to create single large dataset with usefull data so it's easier to create graphs and analysis
merged_medical_info_dataset = unique_medical_name_dataset.merge(medical_info_dataset, on=['name'], how='outer').drop('medical_info', axis='columns')
usefull_dataset = personal_data.merge(merged_medical_info_dataset, on=['name', 'address'], how='outer')
usefull_dataset.info()

In [None]:
usefull_dataset['occupation'].unique()

In [None]:
usefull_dataset['occupation'] = usefull_dataset['occupation'].map(lambda value: 'unknown' if type(value) is float else value.replace(' ','').replace('_', '-').replace('?', 'unknown'))
usefull_dataset['occupation'].unique()

In [None]:
occupation_categories = usefull_dataset['occupation'].unique()

NAN to values

In [578]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

In [579]:
# test = usefull_dataset.drop(['name','address','date_of_birth','occupation','marital-status','education-num','relationship','capital-gain','education','income','capital-loss','hours-per-week','workclass'],axis=1)

test = usefull_dataset
test = test[test['class'].isnull()==False]
test.replace('?',np.NaN,inplace=True)


In [580]:
X = test.drop('class', axis=1)
y = test['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print(X_train.info(),X_test.info())
print(y_train.count(),y_test.count())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3138 entries, 3020 to 1457
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              3138 non-null   object 
 1   address           3138 non-null   object 
 2   age               3138 non-null   int64  
 3   sex               3138 non-null   object 
 4   date_of_birth     3138 non-null   object 
 5   kurtosis_oxygen   3137 non-null   float64
 6   occupation        3138 non-null   object 
 7   marital-status    3133 non-null   object 
 8   pregnant          3135 non-null   object 
 9   education-num     2806 non-null   float64
 10  relationship      3135 non-null   object 
 11  std_oxygen        3132 non-null   float64
 12  capital-gain      3134 non-null   float64
 13  skewness_oxygen   3133 non-null   float64
 14  education         3134 non-null   object 
 15  fnlwgt            3135 non-null   float64
 16  income            3134 non-null   objec

In [581]:
# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())])

# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [582]:
numeric_features = test.select_dtypes(include=['int64', 'float64']).columns
categorical_features = test.select_dtypes(include=['object']).columns

print(numeric_features.tolist())
print(categorical_features.tolist())

['age', 'kurtosis_oxygen', 'education-num', 'std_oxygen', 'capital-gain', 'skewness_oxygen', 'fnlwgt', 'class', 'capital-loss', 'mean_oxygen', 'hours-per-week', 'mean_glucose', 'std_glucose', 'kurtosis_glucose', 'skewness_glucose']
['name', 'address', 'sex', 'date_of_birth', 'occupation', 'marital-status', 'pregnant', 'relationship', 'education', 'income', 'native-country', 'race', 'workclass']


In [583]:
num = Pipeline(steps=[('imputer', SimpleImputer(missing_values=np.nan,strategy='median'))])
cat = Pipeline(steps=[('imputer', SimpleImputer(missing_values=np.nan,strategy='most_frequent'))])
full = ColumnTransformer(
    transformers=[
        ('num',  num, numeric_features),
        ('cat', cat, categorical_features)])

# print(pd.DataFrame(full.fit_transform(test), columns=numeric_features.tolist() + categorical_features.tolist(),index=test.index))
test2 = pd.DataFrame(full.fit_transform(test), columns=numeric_features.tolist() + categorical_features.tolist(),index=test.index)
test2[test2.isnull().any(axis=1)]
test2

Unnamed: 0,age,kurtosis_oxygen,education-num,std_oxygen,capital-gain,skewness_oxygen,fnlwgt,class,capital-loss,mean_oxygen,...,date_of_birth,occupation,marital-status,pregnant,relationship,education,income,native-country,race,workclass
0,58,8.32891,10,18.4415,0,78.6751,109570,0,0,2.79515,...,18/01/1961,Sales,Separated,f,Unmarried,Some-college,<=50K,United-States,White,Private
2,52,3.37664,9,48.6771,0,10.2908,188064,1,0,14.7299,...,1967-04-18,Craft-repair,Married-civ-spouse,f,Husband,HS-grad,<=50K,Canada,White,Private
3,64,4.26449,10,34.3017,0,17.8685,111128,0,0,8.44398,...,55-09-04,Craft-repair,Married-civ-spouse,f,Husband,Some-college,<=50K,United-States,White,Private
4,66,7.6418,10,22.9657,0,62.8487,177277,0,0,3.63963,...,1952-12-05,Craft-repair,Never-married,f,Not-in-family,Some-college,<=50K,United-States,White,Self-emp-not-inc
5,64,0.263897,9,75.4896,0,-0.848741,191389,1,0,111.107,...,1955-07-05,Adm-clerical,Divorced,f,Unmarried,HS-grad,<=50K,United-States,White,Local-gov
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3928,45,7.81027,10,20.8528,0,67.0404,116338,0,653,3.06187,...,1974-04-16,Prof-specialty,Separated,f,Unmarried,HS-grad,<=50K,United-States,White,Private
3929,48,12.5741,10,12.5457,0,68.4768,179580,1,1762,1.67809,...,1971-10-18,unknown,Never-married,f,Husband,Some-college,<=50K,United-States,White,Private
3930,83,2.04191,9,62.0302,4386,2.66409,133219,1,0,27.6706,...,36-04-15,Other-service,Married-civ-spouse,f,Husband,HS-grad,>50K,United-States,Black,Private
3931,49,10.2313,10,17.9531,0,114.954,160893,0,0,2.30686,...,1970-05-06,Other-service,Married-civ-spouse,f,Wife,Some-college,<=50K,United-States,White,Private


In [584]:
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num',  numeric_transformer, numeric_features),
#         ('cat', categorical_transformer, categorical_features)])

In [585]:
# from sklearn.ensemble import RandomForestClassifier
# pipeline = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('classifier', RandomForestClassifier())])

In [586]:
# pipeline.fit(X_train,y_train)

In [587]:
# y_pred = pipeline.predict(test)
# print(y_pred)

In [588]:
# from sklearn.metrics import accuracy_score, log_loss
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC, LinearSVC, NuSVC
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# classifiers = [
#     KNeighborsClassifier(3),
#     SVC(kernel="rbf", C=0.025, probability=True),
#     NuSVC(probability=True),
#     DecisionTreeClassifier(),
#     RandomForestClassifier(),
#     AdaBoostClassifier(),
#     GradientBoostingClassifier()
#     ]
# for classifier in classifiers:
#     pipe = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('classifier', classifier)])
#     pipe.fit(X_train, y_train)   
#     print(classifier)
#     print("model score: %.3f" % pipe.score(X_test, y_test))

In [589]:
# param_grid = { 
#     'classifier__n_estimators': [200, 500],
#     'classifier__max_features': ['auto', 'sqrt', 'log2'],
#     'classifier__max_depth' : [4,5,6,7,8],
#     'classifier__criterion' :['gini', 'entropy']}
# from sklearn.model_selection import GridSearchCV
# CV = GridSearchCV(rf, param_grid, n_jobs= 1)
                  
# CV.fit(X_train, y_train)  
# print(CV.best_params_)    
# print(CV.best_score_)