In [26]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from tabulate import tabulate

pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

X_train = pd.read_csv("../01-Data/X_train.csv", index_col=0)
X_test = pd.read_csv("../01-Data/X_test.csv", index_col=0)
y_train = pd.read_csv("../01-Data/y_train.csv", index_col=0)

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
y_train = pd.DataFrame(y_train)

Add all of the preprocessing below

## Variable 1 - 146 Preprocessing

In [27]:
columns_to_drop = ['c_abrv', 'f46_IT', 'v72_DE', 'v73_DE', 'v74_DE', 'v75_DE', 'v76_DE', 'v77_DE', 'v78_DE', 'v79_DE']
X_train.drop(columns=columns_to_drop, inplace=True)
X_test.drop(columns=columns_to_drop, inplace=True)

## Variable 147 - 292 Preprocessing 

In [28]:
### Function to find the targeted colname
def find_colname(data, target):
    temp = []
    for varname in data.columns:
        if varname.endswith(target):
            temp.append(varname)
    return(temp)

merge_colname = find_colname(X_train, '_11c')
print(find_colname(X_train, 'c'))
print(find_colname(X_train, '_r'))

for name in merge_colname:
    name_org = name.replace("_11c", "")
    X_train.loc[X_train[name_org] == -4, name_org] = X_train.loc[X_train[name_org] == -4, name]
    
Answer_table = [[-10, "multiple answers Mail"], [-2, "no answer"], [-1, "dont know"], [1, "a great deal"], [2, "quite a lot"],
                [3, "not very much"], [4, "none at all"]]
print(tabulate(Answer_table, headers=["Numeric Value", "Survey Answer"], tablefmt='github'))

['v30c', 'v45c', 'v133_11c', 'v134_11c', 'v135_11c', 'v136_11c', 'v137_11c', 'v138_11c', 'v139_11c', 'v140_11c', 'v141_11c']
['age_r', 'v228b_r', 'v231b_r', 'v233b_r', 'v239_r', 'v242_r', 'v243_r', 'v251b_r', 'v252_r', 'v261_r', 'v262_r', 'v263_r', 'v276_r', 'v278c_r', 'v279c_r', 'v279d_r', 'v281a_r']
|   Numeric Value | Survey Answer         |
|-----------------|-----------------------|
|             -10 | multiple answers Mail |
|              -2 | no answer             |
|              -1 | dont know             |
|               1 | a great deal          |
|               2 | quite a lot           |
|               3 | not very much         |
|               4 | none at all           |


## Variable 293 - 438 Preprocessing

In [29]:
## removed string type data
X_train.drop('v228b', inplace=True, axis=1) 
X_test.drop('v228b', inplace=True, axis=1) 

X_train['v228b_r'].fillna(-3, inplace = True)
X_test['v228b_r'].fillna(-3, inplace = True)

X_train.drop('v231b', inplace=True, axis=1) 
X_test.drop('v231b', inplace=True, axis=1)

X_train['v231b_r'].fillna(-3, inplace = True)
X_test['v231b_r'].fillna(-3, inplace = True)

X_train.drop('v233b', inplace=True, axis=1)
X_test.drop('v233b', inplace=True, axis=1)

X_train['v233b_r'].fillna(-3, inplace = True)
X_test['v233b_r'].fillna(-3, inplace = True)

X_train.drop('v251b', inplace=True, axis=1)
X_test.drop('v251b', inplace=True, axis=1) 

X_train['v251b_r'].fillna(-3, inplace = True)
X_test['v251b_r'].fillna(-3, inplace = True)

X_train.drop('f252_edulvlb_CH', inplace=True, axis=1)
X_test.drop('f252_edulvlb_CH', inplace=True, axis=1)

## removed the column having 'DE'
X_train.drop(list(X_train.filter(regex='DE')), axis=1, inplace=True)
X_test.drop(list(X_test.filter(regex='DE')), axis=1, inplace=True)

## removed the column having 'GB'
X_train.drop(list(X_train.filter(regex='GB')), axis=1, inplace=True)
X_test.drop(list(X_test.filter(regex='GB')), axis=1, inplace=True)

X_train.drop('v281a', inplace=True, axis=1)
X_test.drop('v281a', inplace=True, axis=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train['v228b_r'].fillna(-3, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test['v228b_r'].fillna(-3, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values al

# Simple Model Run

In [30]:
# Prepare the DMatrix 
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

# Set XGBoost parameters 
params = {
    'max_depth': 6,
    'eta': 0.01,
    'objective': 'multi:softprob',
    'num_class': 5,  # We have 5 classes: -1, 1, 2, 3, 4
    'eval_metric': 'mlogloss',
}
num_boost_round = 400  # Number of boosting rounds

bst = xgb.train(params, dtrain, num_boost_round)
y_test_probs = bst.predict(dtest)

class_order = [-1, 1, 2, 3, 4]
class_mapping = {class_label: f"Class_{class_label}" for class_label in class_order}

submission_df = pd.DataFrame(y_test_probs, columns=class_mapping.values())
submission_df.columns = ['no answer', 'very important', 'quite important', 'not important', 'not at all important']
submission_df.insert(0, 'id', X_test.index)

# Save the submission file
submission_file = 'submission.csv'
submission_df.to_csv(submission_file, index=False)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:v275b_N2: object, v275b_N1: object