In [15]:
# Import packages
import pandas as pd
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

# Set options
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

train_x_raw = pd.read_csv("../01-Data/X_train.csv", low_memory = True, index_col=0)
train_y_raw = pd.read_csv("../01-Data/y_train.csv", low_memory = True, index_col=0)
test_x_raw = pd.read_csv("../01-Data/X_test.csv", low_memory=True, index_col=0)

df_train = pd.DataFrame(train_x_raw)
df_test = pd.DataFrame(test_x_raw)
df_y = pd.DataFrame(train_y_raw)

## Variable 293 - 438 Preprocessing
### Imputation / String variable drop

In [None]:
columns_to_drop = ['v228b', 'v231b', 'v233b', 'v251b', 'f252_edulvlb_CH' 'v275b_N1', 'v275b_N2', 'v275c_N2', 'v281a']
df_train.drop(columns=columns_to_drop, inplace=True)
df_test.drop(columns=columns_to_drop, inplace=True)

## removed the column having 'GB'
df_train.drop(list(df_train.filter(regex='DE')), axis=1, inplace=True)
df_test.drop(list(df_test.filter(regex='DE')), axis=1, inplace=True)

## removed the column having 'GB'
df_train.drop(list(df_train.filter(regex='GB')), axis=1, inplace=True)
df_test.drop(list(df_test.filter(regex='GB')), axis=1, inplace=True)

# Imputation 
df_train.fillna({'v231b_r': -3}, inplace=True)
df_test.fillna({'v231b_r': -3}, inplace=True)

df_train.fillna({'v233b_r': -3}, inplace=True)
df_test.fillna({'v233b_r': -3}, inplace=True)

df_train.fillna({'v251b_r': -3}, inplace=True)
df_test.fillna({'v251b_r': -3}, inplace=True)

df_train.fillna({'v228b_r': -3}, inplace=True)
df_test.fillna({'v228b_r': -3}, inplace=True)

### Age-related variables processing

In [ ]:
# v226 year of birth respondent (Q64)
# age age:respondent
# age_r age recorded (6 intervals)
# age_r2 age recoded (3 intervals)
# age_r3 age recoded (7 intervals)

ages = ['v226', 'age', 'age_r', 'age_r2', 'age_r3']
#df_train.drop(columns=ages_to_drop, inplace=True)
#df_test.drop(columns=ages_to_drop, inplace=True)
# DECIDE WHICH ONE TO KEEP AFTER EVALUATING 

### Education level-related variables drop

In [None]:
def find_colname(data, target):
    temp = []
    for varname in data.columns:
        if varname.startswith(target):
            temp.append(varname)
    return(temp)

# v243*: educational level respondent: ... with variations

print(find_colname(df_train, 'v243'))

In [14]:
# keep v243_ISCED_3: educational level respondent: ISCED-code three digit  
v243_to_drop = ['v243_edulvlb', 'v243_edulvlb_2', 'v243_edulvlb_1', 'v243_ISCED_2', 'v243_ISCED_2b','v243_ISCED_1', 'v243_EISCED', 'v243_ISCED97', 'v243_8cat', 'v243_r', 'v243_cs', 'v243_cs_DE1', 'v243_cs_DE2', 'v243_cs_DE3', 'v243_cs_GB1', 'v243_cs_GB2']

df_train.drop(columns=v243_to_drop, inplace=True)
df_test.drop(columns=v243_to_drop, inplace=True)

[]


KeyError: "['v243_edulvlb', 'v243_edulvlb_2', 'v243_edulvlb_1v243_ISCED_3', 'v243_ISCED_2', 'v243_ISCED_2b', 'v243_ISCED_1', 'v243_EISCED', 'v243_ISCED97', 'v243_8cat', 'v243_r', 'v243_cs', 'v243_cs_DE1', 'v243_cs_DE2', 'v243_cs_DE3', 'v243_cs_GB1', 'v243_cs_GB2'] not found in axis"

### Job kinds-related variables drop

In [17]:
print(find_colname(df_train, 'v246'))

['v246_ISCO_2', 'v246_SIOPS', 'v246_ISEI', 'v246_ESeC', 'v246_egp']


In [ ]:
# keep v246_ESeC : kind of job respondent - ESEC08 code  
v246_to_drop = ['v246_ISCO_2', 'v246_SIOPS', 'v246_ISEI', 'v246_egp']

df_train.drop(columns=v246_to_drop, inplace=True)
df_test.drop(columns=v246_to_drop, inplace=True)

### Partner Education Level variables drop

In [18]:
print(find_colname(df_train, 'v252'))

['v252_edulvlb', 'v252_edulvlb_2', 'v252_edulvlb_1', 'v252_ISCED_3', 'v252_ISCED_2', 'v252_ISCED_2b', 'v252_ISCED_1', 'v252_EISCED', 'v252_ISCED97', 'v252_8cat', 'v252_r', 'v252_cs', 'v252_cs_DE1', 'v252_cs_DE2', 'v252_cs_DE3', 'v252_cs_GB1', 'v252_cs_GB2']


In [ ]:
# keep v252_edulvlb_2: educational level spouse/partner: ESS-edulvlb coding two digits 
v252_to_drop = ['v252_edulvlb', 'v252_edulvlb_1', 'v252_ISCED_3', 'v252_ISCED_2', 'v252_ISCED_2b', 'v252_ISCED_1', 'v252_EISCED', 'v252_ISCED97', 'v252_8cat', 'v252_r', 'v252_cs', 'v252_cs_DE1', 'v252_cs_DE2', 'v252_cs_DE3', 'v252_cs_GB1', 'v252_cs_GB2']

df_train.drop(columns=v252_to_drop, inplace=True)
df_test.drop(columns=v252_to_drop, inplace=True)

### Kind of job partner variables drop

In [19]:
print(find_colname(df_train, 'v255'))

['v255_ISCO_2', 'v255_SIOPS', 'v255_ISEI', 'v255_ESeC', 'v255_egp']


In [ ]:
# keep v255_ESeC: kind of job spouse/partner - ESEC08 code 
v255_to_drop = ['v255_ISCO_2', 'v255_SIOPS', 'v255_ISEI', 'v255_egp']

df_train.drop(columns=v255_to_drop, inplace=True)
df_test.drop(columns=v255_to_drop, inplace=True)

### Households income variables to drop

In [20]:
print(find_colname(df_train, 'v261'))

['v261', 'v261_ppp', 'v261_r']


In [ ]:
df_train.drop('v261_ppp', inplace=True)
df_test.drop('v261_ppp', inplace=True)

### education level father/mother variables drop

In [21]:
print(find_colname(df_train, 'v262'))

['v262_edulvlb', 'v262_edulvlb_2', 'v262_edulvlb_1', 'v262_ISCED_3', 'v262_ISCED_2', 'v262_ISCED_2b', 'v262_ISCED_1', 'v262_EISCED', 'v262_ISCED97', 'v262_8cat', 'v262_r', 'v262_cs', 'v262_cs_DE1', 'v262_cs_DE2', 'v262_cs_DE3', 'v262_cs_GB1', 'v262_cs_GB2']


In [ ]:
# keep v262_edulvlb_2: educational level father: ESS-edulvlb coding two digits 
v262_to_drop = ['v262_edulvlb', 'v262_edulvlb_1', 'v262_ISCED_3', 'v262_ISCED_2', 'v262_ISCED_2b', 'v262_ISCED_1', 'v262_EISCED', 'v262_ISCED97', 'v262_8cat', 'v262_r', 'v262_cs', 'v262_cs_DE1', 'v262_cs_DE2', 'v262_cs_DE3', 'v262_cs_GB1', 'v262_cs_GB2']

df_train.drop(columns=v262_to_drop, inplace=True)
df_test.drop(columns=v262_to_drop, inplace=True)

In [22]:
print(find_colname(df_train, 'v263'))

['v263_edulvlb', 'v263_edulvlb_2', 'v263_edulvlb_1', 'v263_ISCED_3', 'v263_ISCED_2', 'v263_ISCED_2b', 'v263_ISCED_1', 'v263_EISCED', 'v263_ISCED97', 'v263_8cat', 'v263_r', 'v263_cs', 'v263_cs_DE1', 'v263_cs_DE2', 'v263_cs_DE3', 'v263_cs_GB1', 'v263_cs_GB2']


In [ ]:
# keep v263_edulvlb_2:educational level mother: ESS-edulvlb coding two digits
v263_to_drop = ['v263_edulvlb', 'v263_edulvlb_2', 'v263_edulvlb_1', 'v263_ISCED_3', 'v263_ISCED_2', 'v263_ISCED_2b', 'v263_ISCED_1', 'v263_EISCED', 'v263_ISCED97', 'v263_8cat', 'v263_r', 'v263_cs', 'v263_cs_DE1', 'v263_cs_DE2', 'v263_cs_DE3', 'v263_cs_GB1', 'v263_cs_GB2']

df_train.drop(columns=v263_to_drop, inplace=True)
df_test.drop(columns=v263_to_drop, inplace=True)

### Interview dates variables drop

In [ ]:
# v277: date of interview 
# v278a: time of interview: start hour 
# v278b: time of interview: start minute 
# v278c_r: time of interview: start  
# v279a: time of interview: end hour 
# v279b: time of interview: end minute 
# v279c_r: time of interview: end 
# v279d_r: time of interview: duration in minutes 

times_to_drop = ['v277', 'v278b', 'v278c_r', 'v279a', 'v279b', 'v279c_r']

df_train.drop(columns=times_to_drop, inplace=True)
df_test.drop(columns=times_to_drop, inplace=True)

### Age related variables group into intervals

In [ ]:
# v241, v242
# more to add