In [57]:
# Import packages
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

# Set options
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

# Load the raw inputs; the first CSV column is used as the row index.
train_x_raw = pd.read_csv("../01-Data/X_train.csv", low_memory=True, index_col=0)
train_y_raw = pd.read_csv("../01-Data/y_train.csv", low_memory=True, index_col=0)
test_x_raw = pd.read_csv("../01-Data/X_test.csv", low_memory=True, index_col=0)

# Working frames must be independent copies: `pd.DataFrame(frame)` does not copy
# the underlying data, and the cells below modify the working frames in place
# while `*_raw` is still consulted (e.g. for column-name discovery).
df_train = train_x_raw.copy()
df_test = test_x_raw.copy()
df_y = train_y_raw.copy()

Add all of the preprocessing below

## Variable 1 - 146 Preprocessing

In [58]:
# Columns removed from both the training and the test features.
columns_to_drop = [
    'c_abrv', 'f46_IT',
    'v72_DE', 'v73_DE', 'v74_DE', 'v75_DE',
    'v76_DE', 'v77_DE', 'v78_DE', 'v79_DE',
]
for frame in (df_train, df_test):
    frame.drop(columns=columns_to_drop, inplace=True)

## Variable 147 - 292 Preprocessing 

In [59]:
### Function to find the targeted colname
def find_colname(data, target):
    """Return the column names of ``data`` that end with ``target``.

    Parameters
    ----------
    data : object exposing ``.columns`` (e.g. a pandas DataFrame)
        Source of the column names; names are expected to be strings.
    target : str
        Suffix to look for.

    Returns
    -------
    list
        Matching column names, in their original column order.
    """
    return [column for column in data.columns if column.endswith(target)]

# Names of the raw training columns that carry the '_11c' suffix.
merge_colname = find_colname(train_x_raw, '_11c')

# Show which raw training columns end in 'c' and in '_r'.
for suffix in ('c', '_r'):
    print(find_colname(train_x_raw, suffix))

def merge_columns(dat, colname, suffix="_11c", missing_code=-4):
    """Back-fill base columns from their suffixed counterparts, in place.

    For every name in ``colname`` that ends with ``suffix``, the base column is
    that name with the trailing ``suffix`` removed. Rows where the base column
    equals ``missing_code`` are overwritten with the value the suffixed column
    holds on the same row. Names that do not end with ``suffix`` are skipped.

    Parameters
    ----------
    dat : pandas.DataFrame
        Frame containing both the base and the suffixed columns; it is
        modified in place.
    colname : iterable of str
        Suffixed column names to merge from.
    suffix : str, default "_11c"
        Trailing marker that distinguishes a suffixed column from its base.
    missing_code : scalar, default -4
        Value in the base column that marks a row to be back-filled.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a suffixed column or its base column is not present in ``dat``.
    """
    for name in colname:
        if not suffix or not name.endswith(suffix):
            continue
        # Strip only the trailing suffix; str.replace would also remove any
        # earlier occurrence and point at the wrong base column.
        name_org = name[: -len(suffix)]
        # Compute the row mask once and reuse it on both sides.
        needs_fill = dat[name_org] == missing_code
        dat.loc[needs_fill, name_org] = dat.loc[needs_fill, name]

['v30c', 'v45c', 'v133_11c', 'v134_11c', 'v135_11c', 'v136_11c', 'v137_11c', 'v138_11c', 'v139_11c', 'v140_11c', 'v141_11c']
['age_r', 'v228b_r', 'v231b_r', 'v233b_r', 'v239_r', 'v242_r', 'v243_r', 'v251b_r', 'v252_r', 'v261_r', 'v262_r', 'v263_r', 'v276_r', 'v278c_r', 'v279c_r', 'v279d_r', 'v281a_r']


## Variable 293 - 438 Preprocessing

In [60]:
## Remove the string-typed columns (kept from the original note: "removed
## string type data") and fill missing values in each matching `<name>_r`
## column with -3.
recoded_string_cols = ['v228b', 'v231b', 'v233b', 'v251b']

## Name fragments whose columns are removed wholesale (original notes:
## "removed the column having 'DE'" / "'GB'").
country_tags = ['DE', 'GB']

## Remaining single columns, dropped after the country-specific ones.
trailing_drops = ['v281a', 'v275b_N2', 'v275b_N1']

for frame in (df_train, df_test):
    for raw_col in recoded_string_cols:
        frame.drop(columns=raw_col, inplace=True)
        frame.fillna({raw_col + '_r': -3}, inplace=True)

    frame.drop(columns='f252_edulvlb_CH', inplace=True)

    for tag in country_tags:
        # filter(regex=...) keeps every column whose name contains the tag
        # anywhere, not only as a suffix.
        matched = list(frame.filter(regex=tag))
        frame.drop(columns=matched, inplace=True)

    for col in trailing_drops:
        frame.drop(columns=col, inplace=True)

# Simple Model Run
## XGBoost

In [61]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

# Re-encode the target: -1 becomes 0, the values 1..4 stay as they are, so the
# labels are the integers 0..4 expected by a 5-class objective.
label_mapping = {-1: 0, 1: 1, 2: 2, 3: 3, 4: 4}
df_y = df_y.replace(label_mapping)

dtrain = xgb.DMatrix(data=df_train, label=df_y, enable_categorical=True)
dtest = xgb.DMatrix(data=df_test, enable_categorical=True)

num_boost_round = 500
params = dict(
    max_depth=6,
    eta=0.01,
    objective='multi:softprob',
    num_class=5,
    eval_metric='mlogloss',
)

# Fit on the full training set and predict class probabilities for the test set.
bst = xgb.train(params, dtrain, num_boost_round=num_boost_round)
y_test_probs = bst.predict(dtest)

# TODO: no hold-out split is made in this cell, so no validation multiclass
# log loss is computed (log_loss / train_test_split are imported but unused here).

class_order = list(range(5))
class_mapping = {class_label: f"Class_{class_label}" for class_label in class_order}

# One probability column per class, then relabelled with the answer texts in
# class order 0..4.
answer_labels = [
    'no answer',
    'very important',
    'quite important',
    'not important',
    'not at all important',
]
submission_df = pd.DataFrame(y_test_probs, columns=list(class_mapping.values()))
submission_df.columns = answer_labels
submission_df.insert(0, 'id', df_test.index)

# Save the submission file
submission_file = 'submission.csv'
submission_df.to_csv(submission_file, index=False)