In [233]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings(action='ignore') 

In [234]:
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
data_info = pd.read_excel("./data/data_info.xlsx")

In [235]:
X = train[['COMPONENT_ARBITRARY', 'YEAR', "AG"]]
X

Unnamed: 0,COMPONENT_ARBITRARY,YEAR,AG
0,COMPONENT3,2011,0
1,COMPONENT2,2021,0
2,COMPONENT2,2015,0
3,COMPONENT3,2010,0
4,COMPONENT3,2015,0
...,...,...,...
14090,COMPONENT3,2014,0
14091,COMPONENT1,2013,0
14092,COMPONENT3,2008,0
14093,COMPONENT2,2009,0


In [236]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first', sparse=False), [0, 1])], remainder='passthrough')

In [237]:
X = ct.fit_transform(X)
print(X)

[[0. 1. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 1. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [238]:
X = pd.DataFrame(X)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14090,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14092,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14093,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [239]:
new_name = ct.get_feature_names()
new_name

['encoder__x0_COMPONENT2',
 'encoder__x0_COMPONENT3',
 'encoder__x0_COMPONENT4',
 'encoder__x1_2008',
 'encoder__x1_2009',
 'encoder__x1_2010',
 'encoder__x1_2011',
 'encoder__x1_2012',
 'encoder__x1_2013',
 'encoder__x1_2014',
 'encoder__x1_2015',
 'encoder__x1_2016',
 'encoder__x1_2017',
 'encoder__x1_2018',
 'encoder__x1_2019',
 'encoder__x1_2020',
 'encoder__x1_2021',
 'encoder__x1_2022',
 'AG']

In [240]:
X.columns = new_name
X

Unnamed: 0,encoder__x0_COMPONENT2,encoder__x0_COMPONENT3,encoder__x0_COMPONENT4,encoder__x1_2008,encoder__x1_2009,encoder__x1_2010,encoder__x1_2011,encoder__x1_2012,encoder__x1_2013,encoder__x1_2014,encoder__x1_2015,encoder__x1_2016,encoder__x1_2017,encoder__x1_2018,encoder__x1_2019,encoder__x1_2020,encoder__x1_2021,encoder__x1_2022,AG
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14090,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14092,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14093,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [241]:
new_train = pd.concat([train, X], axis=1)
new_train

Unnamed: 0,ID,COMPONENT_ARBITRARY,ANONYMOUS_1,YEAR,SAMPLE_TRANSFER_DAY,ANONYMOUS_2,AG,AL,B,BA,...,encoder__x1_2014,encoder__x1_2015,encoder__x1_2016,encoder__x1_2017,encoder__x1_2018,encoder__x1_2019,encoder__x1_2020,encoder__x1_2021,encoder__x1_2022,AG.1
0,TRAIN_00000,COMPONENT3,1486,2011,7,200,0,3,93,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TRAIN_00001,COMPONENT2,1350,2021,51,375,0,2,19,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,TRAIN_00002,COMPONENT2,2415,2015,2,200,0,110,1,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TRAIN_00003,COMPONENT3,7389,2010,2,200,0,8,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TRAIN_00004,COMPONENT3,3954,2015,4,200,0,1,157,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14090,TRAIN_14090,COMPONENT3,1616,2014,8,200,0,2,201,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14091,TRAIN_14091,COMPONENT1,2784,2013,2,200,0,3,85,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14092,TRAIN_14092,COMPONENT3,1788,2008,9,550,0,6,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14093,TRAIN_14093,COMPONENT2,2498,2009,19,550,0,2,4,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
