In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os

# For data preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# For classification model
# 'lr', 'rf', 'lightgbm', 'gbc', 'xgboost'
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

# For deep learning model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import AUC

# For evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve, auc

In [27]:
# Set display options to show all columns
pd.set_option('display.max_columns', None)
# Ignore warnings
warnings.filterwarnings('ignore')

In [28]:
data = pd.read_csv("final_cleaned.csv")
country_data = pd.read_csv("country_eu.csv")

In [29]:
country_data = country_data.rename(columns={'country': 'country_eu'})

In [30]:
data = pd.concat([country_data, data], axis=1)
data

Unnamed: 0,country_eu,mergeid,age,country,gender,edu_lv,marital,migration,citizen,religion,region,residence_type,network_size,integration,closeness,network_satisfaction,children,grandchildren,look_after_grand,home_own,mortgage,health_self,chronic,vision_d,vision_n,hearing,level_of_pain,glasses,hearing_aid,cane,walker,manual_wheelchair,eletric_wheelchair,buggy,utensils,personal_alarm,bar,raised_toilet,incontinence,heart,hypertension,cholesterol,stroke,diabetes,lung_disease,cancer,ulcer,parkinsons,cataracts,fracture,dementia,psycho,arthritis,kidney,depression,pessimism,suicidality,guilt,sleep,interest,irritability,appetite,fatigue,concentration,enjoyment,tearfulness,eurod_categ,limit_activity,limit_work,adls,iadls,mobility,uppermo,lowermo,lgmuscle,grossmo,finemo,fall,date_day,memory,imword_recall,deword_recall,serial_7,verbal,inactivity,drinking,smoking,dairy,legeggs,meat,fritsveg,outpatient,vaccination,eye_exam,mammogram,colon,dental,inpatient,other_facility,forgo_cost,forgo_available,health_literacy,medicine,polypharmacy,satisfied_insurance,supplementary_insurance,home_care,personal_care_hour,domestic_task_hour,meal_week,nursing_home,nursing_home_week,receive_help_freq,given_help_freq,activity_help,help_meet_need,income_work,income_self_work,hh_income,economic_status,income_house,long_term,bond_stock_funds,bank_account,value_house,value_business,value_car,oldage_pension,occupational_pension,disability_pension,unemployment_benefits,social_assistance,sickness_benefit,private_pensions,private_transfer,owe,given_gift_250,given_gift_5000,receive_gift_250,receive_gift_5000,rent_expenditure,hh_cosumption,unafford_food,make_ends_meet,hh_real_asset,hh_gross_asset,hh_net_asset,hh_net_worth,job,paidwork,other_job,employment,hours_work,number_social,voluntary,course,club,organization,read,games,card,social_freq,companionship,leftout,isolated,lonely,satisfied_life,casp_12,extraversion,aggreeableness,conscientiousness,neuroticism,openess,bmi,bmi_cate,overweight
0,11,AT-000787-01,58,5,0,2,1,0,1,0,0,1,7,4,4.00,10,0,0,non_applicable,1,0.0,2,0,2,1,2,0,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,2,5,5,5,27,0,2,2,1,3,5,1,3,1,0,non_applicable,1,1,0,0,0,0,5,0,0,1,1,0,0,0,0,0,0,0,1,non_applicable,non_applicable,60000.00000,25000.0,108000.00000,1,0.0,94341.5742,339774.1648,6000.0,1400000.0,406763.9454,10000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0.0,9600.0,0,1,1816763.976,440115.739,440115.739,2256879.715,2,non_applicable,1,2,60,2,0,1,0,0,1,0,0,2.500000,3,3,3,3,10,44,2.0,3.0,4.5,1.5,5.0,24.15,2,0
1,11,AT-001492-01,70,5,1,1,1,0,1,1,0,2,4,4,4.00,10,6,13,1,2,0.0,3,0,3,3,3,2,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,4,2,7,5,5,26,0,1,2,1,3,4,1,7,1,1,1,0,1,0,0,0,0,5,1,0,3,0,0,0,0,0,0,0,0,0,non_applicable,non_applicable,0.00000,0.0,27600.00000,1,0.0,28000.0000,0.0000,14000.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,10680.0,6600.0,0,0,0.000,42000.000,42000.000,42000.000,5,0,non_applicable,non_applicable,0,4,1,0,1,0,1,0,1,2.250000,3,3,3,3,8,39,5.0,3.5,3.5,4.5,3.5,21.05,2,0
2,11,AT-001492-02,71,5,0,1,1,0,1,1,0,2,4,4,3.25,9,6,13,1,non_applicable,0.0,1,1,3,2,3,2,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,4,3,6,6,5,22,0,1,3,1,3,2,2,5,1,1,non_applicable,0,0,0,0,0,0,5,1,1,2,0,0,0,0,0,0,0,0,0,0,non_applicable,0.00000,0.0,27600.00000,1,0.0,28000.0000,0.0000,14000.0,0.0,0.0000,0.0,32200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,10680.0,6600.0,non_applicable,0,0.000,42000.000,42000.000,42000.000,1,0,non_applicable,non_applicable,0,4,1,0,1,0,1,0,1,2.750000,3,3,3,3,8,46,3.5,5.0,5.0,1.0,3.0,22.69,2,0
3,11,AT-001719-01,60,5,0,1,1,1,0,1,0,1,4,4,3.75,9,6,9,1,2,0.0,5,1,2,2,2,3,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,4,3,4,3,5,12,0,1,3,1,2,5,1,12,0,0,non_applicable,0,1,0,0,0,0,3,1,1,2,0,0,0,0,0,0,0,0,0,non_applicable,non_applicable,11181.92532,0.0,22957.80642,1,0.0,0.0000,0.0000,7000.0,0.0,0.0000,12000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3000.0,0,0,0,0,10800.0,3600.0,0,0,12000.000,7000.000,4000.000,16000.000,2,non_applicable,0,1,25,1,0,0,0,0,0,0,1,3.000000,3,3,3,3,7,39,4.5,4.0,3.0,1.5,1.5,38.27,4,1
4,11,AT-001881-02,98,5,0,0,1,0,1,1,1,1,3,2,4.00,10,2,5,0,4,0.0,3,1,4,3,4,0,1,1,1,0,0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,non_applicable,1,1,1,1,1,1,1,1,0,4,3,4,3,1,17,1,1,1,1,4,2,2,10,1,0,non_applicable,0,0,0,0,0,0,3,1,1,1,0,1,0,0,48,0,0,0,0,1,1,0.00000,0.0,15600.00000,0,0.0,0.0000,0.0000,5000.0,0.0,0.0000,0.0,14240.0,0.0,0.0,0.0,3660.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,4200.0,non_applicable,1,0.000,5000.000,5000.000,5000.000,1,0,non_applicable,non_applicable,0,2,0,0,0,0,1,1,0,4.000000,3,3,3,3,10,37,3.0,4.0,3.5,2.0,2.5,33.96,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51715,14,SK-992332-01,76,5,0,1,1,0,1,1,1,1,1,2,3.00,8,2,4,0,1,0.0,3,0,3,3,3,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,non_applicable,0,0,0,0,0,0,0,0,0,4,4,3,3,3,13,0,1,1,5,5,1,2,12,1,0,non_applicable,0,1,0,0,0,0,5,1,0,2,0,0,0,0,0,0,0,0,0,non_applicable,non_applicable,0.00000,0.0,10680.00000,0,0.0,0.0000,0.0000,0.0,150000.0,0.0000,1000.0,6720.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,2400.0,non_applicable,1,151000.000,0.000,0.000,151000.000,1,0,non_applicable,non_applicable,0,0,0,0,0,0,0,0,0,0.000000,3,3,3,3,8,40,3.5,3.5,4.0,3.0,3.5,27.78,3,1
51716,14,SK-992332-02,73,5,1,1,1,0,1,1,1,1,1,2,3.00,7,2,4,0,1,0.0,3,0,3,3,3,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,3,7,4,3,18,0,1,1,2,3,2,1,4,1,0,0,0,1,0,0,0,0,5,1,0,1,0,0,0,0,0,0,0,0,0,non_applicable,non_applicable,0.00000,0.0,10680.00000,0,0.0,0.0000,0.0000,0.0,150000.0,0.0000,1000.0,3960.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,2400.0,non_applicable,1,151000.000,0.000,0.000,151000.000,1,0,non_applicable,non_applicable,0,0,0,0,0,0,0,0,0,0.000000,3,3,3,3,9,42,4.0,4.0,4.0,3.0,3.0,25.10,3,1
51717,14,SK-995042-01,63,5,1,1,1,0,1,1,0,2,4,4,4.00,10,3,4,0,1,0.0,5,1,5,5,1,0,1,1,1,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,3,3,4,3,4,23,0,1,1,1,2,2,2,12,0,1,1,1,1,0,0,0,0,2,1,1,2,0,0,0,0,0,0,0,0,0,1,2,0.00000,0.0,10200.00000,0,0.0,0.0000,0.0000,500.0,70000.0,0.0000,1000.0,4200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0.0,1800.0,non_applicable,0,36000.000,500.000,500.000,36500.000,1,0,non_applicable,non_applicable,0,0,0,0,0,0,0,0,0,0.000000,2,2,2,2,7,25,2.0,5.0,5.0,5.0,5.0,17.47,1,0
51718,14,SK-995042-02,66,5,0,1,1,0,1,0,0,2,4,4,4.00,9,3,4,1,1,0.0,3,0,3,4,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0,4,3,6,5,4,46,0,3,3,1,1,1,1,3,0,0,non_applicable,1,0,0,0,0,0,4,1,0,2,0,0,0,0,0,0,0,0,1,0,non_applicable,0.00000,0.0,10200.00000,0,0.0,0.0000,0.0000,500.0,70000.0,0.0000,1000.0,6240.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0.0,1800.0,non_applicable,0,36000.000,500.000,500.000,36500.000,1,0,non_applicable,non_applicable,0,3,0,0,0,0,1,1,1,3.666667,2,2,2,2,7,29,4.0,3.0,5.0,3.0,5.0,23.67,2,0


In [31]:
data[data['gender']==0]

Unnamed: 0,country_eu,mergeid,age,country,gender,edu_lv,marital,migration,citizen,religion,region,residence_type,network_size,integration,closeness,network_satisfaction,children,grandchildren,look_after_grand,home_own,mortgage,health_self,chronic,vision_d,vision_n,hearing,level_of_pain,glasses,hearing_aid,cane,walker,manual_wheelchair,eletric_wheelchair,buggy,utensils,personal_alarm,bar,raised_toilet,incontinence,heart,hypertension,cholesterol,stroke,diabetes,lung_disease,cancer,ulcer,parkinsons,cataracts,fracture,dementia,psycho,arthritis,kidney,depression,pessimism,suicidality,guilt,sleep,interest,irritability,appetite,fatigue,concentration,enjoyment,tearfulness,eurod_categ,limit_activity,limit_work,adls,iadls,mobility,uppermo,lowermo,lgmuscle,grossmo,finemo,fall,date_day,memory,imword_recall,deword_recall,serial_7,verbal,inactivity,drinking,smoking,dairy,legeggs,meat,fritsveg,outpatient,vaccination,eye_exam,mammogram,colon,dental,inpatient,other_facility,forgo_cost,forgo_available,health_literacy,medicine,polypharmacy,satisfied_insurance,supplementary_insurance,home_care,personal_care_hour,domestic_task_hour,meal_week,nursing_home,nursing_home_week,receive_help_freq,given_help_freq,activity_help,help_meet_need,income_work,income_self_work,hh_income,economic_status,income_house,long_term,bond_stock_funds,bank_account,value_house,value_business,value_car,oldage_pension,occupational_pension,disability_pension,unemployment_benefits,social_assistance,sickness_benefit,private_pensions,private_transfer,owe,given_gift_250,given_gift_5000,receive_gift_250,receive_gift_5000,rent_expenditure,hh_cosumption,unafford_food,make_ends_meet,hh_real_asset,hh_gross_asset,hh_net_asset,hh_net_worth,job,paidwork,other_job,employment,hours_work,number_social,voluntary,course,club,organization,read,games,card,social_freq,companionship,leftout,isolated,lonely,satisfied_life,casp_12,extraversion,aggreeableness,conscientiousness,neuroticism,openess,bmi,bmi_cate,overweight
0,11,AT-000787-01,58,5,0,2,1,0,1,0,0,1,7,4,4.00,10,0,0,non_applicable,1,0.0,2,0,2,1,2,0,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,2,5,5,5,27,0,2,2,1,3,5,1,3,1,0,non_applicable,1,1,0,0,0,0,5,0,0,1,1,0,0,0,0,0,0,0,1,non_applicable,non_applicable,60000.00000,25000.0,108000.00000,1,0.0,94341.57420,339774.1648,6000.0,1400000.0,406763.9454,10000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0.0,9600.0,0,1,1816763.976,440115.73900,440115.73900,2256879.715,2,non_applicable,1,2,60,2,0,1,0,0,1,0,0,2.500000,3,3,3,3,10,44,2.0,3.0,4.5,1.5,5.0,24.15,2,0
2,11,AT-001492-02,71,5,0,1,1,0,1,1,0,2,4,4,3.25,9,6,13,1,non_applicable,0.0,1,1,3,2,3,2,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,4,3,6,6,5,22,0,1,3,1,3,2,2,5,1,1,non_applicable,0,0,0,0,0,0,5,1,1,2,0,0,0,0,0,0,0,0,0,0,non_applicable,0.00000,0.0,27600.00000,1,0.0,28000.00000,0.0000,14000.0,0.0,0.0000,0.0,32200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,10680.0,6600.0,non_applicable,0,0.000,42000.00000,42000.00000,42000.000,1,0,non_applicable,non_applicable,0,4,1,0,1,0,1,0,1,2.750000,3,3,3,3,8,46,3.5,5.0,5.0,1.0,3.0,22.69,2,0
3,11,AT-001719-01,60,5,0,1,1,1,0,1,0,1,4,4,3.75,9,6,9,1,2,0.0,5,1,2,2,2,3,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,4,3,4,3,5,12,0,1,3,1,2,5,1,12,0,0,non_applicable,0,1,0,0,0,0,3,1,1,2,0,0,0,0,0,0,0,0,0,non_applicable,non_applicable,11181.92532,0.0,22957.80642,1,0.0,0.00000,0.0000,7000.0,0.0,0.0000,12000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3000.0,0,0,0,0,10800.0,3600.0,0,0,12000.000,7000.00000,4000.00000,16000.000,2,non_applicable,0,1,25,1,0,0,0,0,0,0,1,3.000000,3,3,3,3,7,39,4.5,4.0,3.0,1.5,1.5,38.27,4,1
4,11,AT-001881-02,98,5,0,0,1,0,1,1,1,1,3,2,4.00,10,2,5,0,4,0.0,3,1,4,3,4,0,1,1,1,0,0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,non_applicable,1,1,1,1,1,1,1,1,0,4,3,4,3,1,17,1,1,1,1,4,2,2,10,1,0,non_applicable,0,0,0,0,0,0,3,1,1,1,0,1,0,0,48,0,0,0,0,1,1,0.00000,0.0,15600.00000,0,0.0,0.00000,0.0000,5000.0,0.0,0.0000,0.0,14240.0,0.0,0.0,0.0,3660.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,4200.0,non_applicable,1,0.000,5000.00000,5000.00000,5000.000,1,0,non_applicable,non_applicable,0,2,0,0,0,0,1,1,0,4.000000,3,3,3,3,10,37,3.0,4.0,3.5,2.0,2.5,33.96,4,1
6,11,AT-002242-01,55,5,0,1,1,0,1,0,1,1,1,2,4.00,10,1,0,non_applicable,1,0.0,2,0,2,2,3,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,3,7,7,5,25,0,1,1,1,1,2,1,2,0,1,non_applicable,0,1,1,0,0,0,5,1,0,3,1,0,0,0,0,0,0,0,1,non_applicable,non_applicable,0.00000,0.0,32422.50310,1,0.0,28291.22363,0.0000,4000.0,350000.0,1000000.0000,30000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15000.0,0,0,0,0,0.0,4800.0,non_applicable,1,1380000.076,32291.22363,17291.22363,1397291.300,2,non_applicable,0,3,50,2,0,0,0,0,1,1,0,3.000000,3,3,3,3,10,47,4.0,3.0,5.0,1.0,3.5,30.64,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51707,14,SK-983687-01,63,5,0,1,1,0,1,1,0,2,1,2,4.00,10,0,0,non_applicable,1,0.0,3,0,2,2,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,4,3,6,6,5,40,0,2,2,2,2,1,2,6,1,0,non_applicable,0,1,0,0,0,0,5,1,0,2,0,0,0,0,0,0,0,0,0,0,non_applicable,0.00000,6000.0,13080.00000,0,0.0,0.00000,0.0000,5000.0,130000.0,0.0000,10000.0,2520.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,3600.0,non_applicable,0,75000.000,5000.00000,5000.00000,80000.000,1,1,non_applicable,3,0,1,0,0,0,0,1,0,0,4.000000,3,3,3,3,9,43,3.0,3.5,5.0,2.5,3.5,29.39,3,1
51710,14,SK-984724-02,55,5,0,1,1,0,1,0,1,2,1,2,4.00,10,2,0,non_applicable,1,0.0,2,0,2,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,2,7,5,4,23,0,1,1,2,3,2,2,0,0,0,non_applicable,0,0,0,0,0,0,5,0,0,1,0,0,0,0,0,0,0,0,0,non_applicable,non_applicable,11000.00000,0.0,204000.00000,1,0.0,0.00000,0.0000,2000.0,40000.0,0.0000,4000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,4200.0,non_applicable,1,44000.000,2000.00000,2000.00000,46000.000,2,non_applicable,0,1,40,0,0,0,0,0,0,0,0,0.000000,3,3,3,3,10,47,5.0,4.5,4.0,1.5,3.5,24.49,2,0
51712,14,SK-988505-01,67,5,0,1,1,0,1,1,0,2,4,4,4.00,10,3,2,0,1,0.0,1,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1,5,5,5,24,0,1,3,3,4,2,1,0,0,0,non_applicable,0,1,0,0,0,0,5,0,0,1,0,0,0,0,0,0,0,0,0,non_applicable,non_applicable,0.00000,0.0,13200.00000,0,0.0,0.00000,0.0000,4000.0,100000.0,0.0000,0.0,7200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,3840.0,non_applicable,1,100000.000,4000.00000,4000.00000,104000.000,1,0,non_applicable,non_applicable,0,2,0,0,0,0,0,1,1,3.000000,3,3,3,3,10,48,2.0,4.5,5.0,2.5,4.5,21.61,2,0
51715,14,SK-992332-01,76,5,0,1,1,0,1,1,1,1,1,2,3.00,8,2,4,0,1,0.0,3,0,3,3,3,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,non_applicable,0,0,0,0,0,0,0,0,0,4,4,3,3,3,13,0,1,1,5,5,1,2,12,1,0,non_applicable,0,1,0,0,0,0,5,1,0,2,0,0,0,0,0,0,0,0,0,non_applicable,non_applicable,0.00000,0.0,10680.00000,0,0.0,0.00000,0.0000,0.0,150000.0,0.0000,1000.0,6720.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,2400.0,non_applicable,1,151000.000,0.00000,0.00000,151000.000,1,0,non_applicable,non_applicable,0,0,0,0,0,0,0,0,0,0.000000,3,3,3,3,8,40,3.5,3.5,4.0,3.0,3.5,27.78,3,1


In [34]:
# Drop the target variables of other papers
used_data = data.copy()
#used_data2 = data.copy()
#used_data = used_data.drop(['mergeid','country','citizen','network_size','integration','closeness','network_satisfaction','grandchildren','look_after_grand','vision_d','hearing','level_of_pain','glasses','walker','buggy','utensils','personal_alarm','bar','raised_toilet','incontinence','depression','pessimism','suicidality','guilt','sleep','interest','irritability','appetite','fatigue','concentration','enjoyment','tearfulness','adls','mobility','uppermo','lowermo','lgmuscle','grossmo','finemo','fall','date_day','memory','imword_recall','deword_recall','serial_7','verbal',
#                  'dairy','legeggs','meat','hours_work','personal_care_hour','domestic_task_hour','meal_week','nursing_home_week','receive_help_freq','given_help_freq','activity_help','help_meet_need','long_term','bond_stock_funds','bank_account','value_house','value_business','value_car','oldage_pension','occupational_pension','disability_pension','unemployment_benefits','social_assistance','sickness_benefit','private_pensions','private_transfer','given_gift_250','given_gift_5000','receive_gift_250','receive_gift_5000','rent_expenditure','hh_cosumption','unafford_food',
#                  'hh_real_asset','hh_gross_asset','hh_net_asset','hh_net_worth','employment','course','club','organization','read','games','card','companionship','leftout','isolated','lonely','extraversion','aggreeableness','conscientiousness','neuroticism','openess','bmi','bmi_cate','overweight'], axis=1)
#used_data2 = used_data2.drop(['mergeid','country','citizen','network_size','integration','closeness','network_satisfaction','grandchildren','look_after_grand','vision_d','hearing','level_of_pain','glasses','walker','buggy','utensils','personal_alarm','bar','raised_toilet','incontinence','depression','pessimism','suicidality','guilt','sleep','interest','irritability','appetite','fatigue','concentration','enjoyment','tearfulness','adls','mobility','uppermo','lowermo','lgmuscle','grossmo','finemo','fall','date_day','memory','imword_recall','deword_recall','serial_7','verbal',
#                  'dairy','legeggs','meat','hours_work','personal_care_hour','domestic_task_hour','meal_week','nursing_home_week','receive_help_freq','given_help_freq','activity_help','help_meet_need','long_term','bond_stock_funds','bank_account','value_house','value_business','value_car','oldage_pension','occupational_pension','disability_pension','unemployment_benefits','social_assistance','sickness_benefit','private_pensions','private_transfer','given_gift_250','given_gift_5000','receive_gift_250','receive_gift_5000','rent_expenditure','hh_cosumption','unafford_food',
#                  'hh_real_asset','hh_gross_asset','hh_net_asset','hh_net_worth','employment','course','club','organization','read','games','card','companionship','leftout','isolated','lonely','extraversion','aggreeableness','conscientiousness','neuroticism','openess','bmi','bmi_cate','overweight'], axis=1)

#used_data2 = used_data.copy()
used_data.shape

(51720, 176)

In [35]:
# 假設 DataFrame 為 used_data
# used_data = used_data.loc[used_data['mammogram'] != 'non_applicable']
#used_data = used_data[used_data['mammogram'] != 'non_applicable']
#used_data

In [36]:
# definition of mammogram, colon

# mammogram
#used_data['mammogram'] = (
#    (used_data['gender'] == 1) &  
#    (used_data['age'] >= 45) &   
#    (used_data['age'] <= 74)
#).astype(int)  # 將布林值轉為整數 (1 或 0)


# colon
# used_data2['colon'] = (
#     (used_data2['age'] >= 50) &  
#     (used_data2['age'] <= 60)   
# ).astype(int)  # 將布林值轉為整數 (1 或 0)
#used_data2 = used_data2[(used_data2['age'] >= 50) & (used_data2['age'] <= 60)]

In [37]:
#used_data2#['mammogram']

In [38]:
#print(used_data['mammogram'].value_counts())

In [39]:
#print(used_data2['colon'].value_counts())

In [40]:
#used_data['mammogram']

In [41]:
#used_data2['colon']

In [42]:
#derived_preventive_care (exclude 'mammogram')

vaccination = used_data['vaccination']
eye_exam = used_data['eye_exam']
colon = used_data['colon']
dental = used_data['dental']

# 定義條件：只要有一種檢查未做（值為0）就是 unmet (1)，全部都做了才是 met (0)
conditions = [
    (vaccination == 0) | (eye_exam == 0) | (colon == 0) | (dental == 0),  # 未完成的情況
    (vaccination == 1) & (eye_exam == 1) & (colon == 1) & (dental == 1)  # 全部完成的情況
]
choices = [1, 0]  # 1 表示 unmet，0 表示 met

# 根據條件新增或更新 'preventive_care' 欄位
used_data['preventive_care'] = np.select(conditions, choices, default=np.nan)

In [44]:
# derived_income
used_data['income'] = used_data['income_work']+ used_data['income_self_work'] + used_data['income_house']
# 刪除原本的變數
used_data.drop(columns=['income_work', 'income_self_work', 'income_house'], inplace=True)

In [46]:
#derived_wheelchair

manual_wheelchair = used_data['manual_wheelchair']
eletric_wheelchair = used_data['eletric_wheelchair']
conditions = [
    (manual_wheelchair == 1) | (eletric_wheelchair == 1),
    (manual_wheelchair == 0) & (eletric_wheelchair == 0)
]
choices = [1, 0]

#wheelchair: 若至少有一個條件符合設為1，完全不符合設為0
used_data['wheelchair'] = np.select(conditions, choices, default=np.nan)
# 刪除原本的變數
used_data.drop(columns=['manual_wheelchair', 'eletric_wheelchair'], inplace=True)

Descriptive summary

In [47]:
#Group Splitting
groups = {
    'Croatia': used_data['country_eu'] == 47,
    'Belgium': used_data['country_eu'] == 23,
    'Estonia': used_data['country_eu'] == 35,
    'Germany': used_data['country_eu'] == 12,
    'Italy': used_data['country_eu'] == 16,
    'Australia': used_data['country_eu'] == 11,
    'Czech Republic': used_data['country_eu'] == 28,
    'Greece': used_data['country_eu'] == 19,
    'France': used_data['country_eu'] == 17,
    'Denmark': used_data['country_eu'] == 18,
    'Spain': used_data['country_eu'] == 15,
    'Switzerland': used_data['country_eu'] == 20,
    'Hungary': used_data['country_eu'] == 32,
    'Finland': used_data['country_eu'] == 55,
    'Latvia': used_data['country_eu'] == 57,
    'Lithuania': used_data['country_eu'] == 48,
}

Sample Distribution (Cross-table)

In [50]:
import pandas as pd

# Assuming used_data2 is already defined
cross_table = pd.crosstab(used_data['country_eu'], used_data['vaccination'],  margins=True,)

# Display the cross-table
print(cross_table)
print("\n描述性統計：")
description = cross_table.describe()
print(description)

vaccination      0      1    All
country_eu                      
11            2570    835   3405
12            2026   2441   4467
14             184     40    224
15             759   1328   2087
16            2097   1526   3623
17            1869   1040   2909
18            1476    872   2348
19            1680   1432   3112
20             857    987   1844
23            2266   2231   4497
25             228    516    744
28            2021   1298   3319
31             612    198    810
32            1612    216   1828
35            1945   2541   4486
47            2807   1880   4687
48             777    637   1414
51             442    389    831
53             558    178    736
55            1295    463   1758
57            1352    364   1716
59             693    182    875
All          30126  21594  51720

描述性統計：
vaccination             0             1           All
count           23.000000     23.000000     23.000000
mean          2619.652174   1877.739130   4497.391304
std  

In [49]:
import pandas as pd
# Define categories and their subcategories
categories = {
    'age_group': ['50-54', '55-59', '60-64', '65-69', '70-74', '75+'],
    'gender': [0, 1],
    'edu_lv': [0, 1, 2],
}

# Create a new column for age group
bins = [50, 55, 60, 65, 70, 75, 100]
labels = ['50-54', '55-59', '60-64', '65-69', '70-74', '75+'] 
used_data2['age_group'] = pd.cut(used_data2['age'], bins=bins, labels=labels, right=False)

# Generate the data structure
table = {
    'Category': [],
    'Subcategory': [],
    'Group Size': [],
    'Group Proportion': [],
    'Overall Prediction': used_data2[target_var].sum(),
    'Prediction': [],
    'Prediction Proportion': []
}
for category, subcats in categories.items():
    grouped = used_data2.groupby(category)
    for subcat in subcats:
        subdata = grouped.get_group(subcat)
        table['Category'].append(category)
        table['Subcategory'].append(subcat)
        table['Group Size'].append(len(subdata))
        table['Group Proportion'].append(len(subdata) / len(data))
        table['Prediction'].append(subdata[target_var].sum())
        table['Prediction Proportion'].append(subdata[target_var].sum() / used_data2[target_var].sum())

table = pd.DataFrame(table)
table['Group Proportion'] = table['Group Proportion'].map('{:.2%}'.format)
table['Prediction Proportion'] = table['Prediction Proportion'].map('{:.2%}'.format)
# save the table
table.to_csv(f'{target_var}\\descriptive_table.csv', index=False)

NameError: name 'target_var' is not defined