# Library Load

In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

# enable_iterative_imputer has to be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SVMSMOTE

warnings.filterwarnings('ignore')

# Data load

In [10]:
# Load the raw train and test tables from the local data folder.
train, test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

In [11]:
# Write the unmodified Y_LABEL column to raw_label.csv (row index omitted).
train.Y_LABEL.to_csv('data/raw_label.csv', index=False)

In [14]:
# "total" feature set: every training column except Y_LABEL and ID.
total_train = train.drop(columns=['Y_LABEL', 'ID'])

# "part" feature set: the total set minus every column whose share of
# missing values is 20% or more.
missing_ratio = total_train.isna().sum() / len(total_train)
sparse_columns = missing_ratio.index[missing_ratio >= 0.2]
part_train = total_train.drop(columns=sparse_columns)

In [18]:
# One-hot encode the test set: COMPONENT_ARBITRARY and YEAR are replaced by
# their indicator columns, and ID is dropped.
te1 = pd.get_dummies(test['COMPONENT_ARBITRARY'])
te2 = pd.get_dummies(test['YEAR'])
one_hot_test = pd.concat(
    [test.drop(columns=['COMPONENT_ARBITRARY', 'YEAR', 'ID']), te1, te2],
    axis=1,
)
one_hot_test.head()

Unnamed: 0,ANONYMOUS_1,ANONYMOUS_2,AG,CO,CR,CU,FE,H2O,MN,MO,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,2192,200,0,0,0,1,12,0.0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,2794,200,0,0,2,1,278,0.0,3,0,...,0,0,0,0,0,0,0,0,0,0
2,1982,200,0,0,0,16,5,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1404,200,0,0,3,4,163,0.0,4,3,...,0,0,0,0,0,0,0,0,0,0
4,8225,200,0,0,0,6,13,0.0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [19]:
# Persist the encoded test features without the row index.
one_hot_test.to_csv('data/one_hot_test.csv', index=False)

# Total

## One-Hot Encoding

In [21]:
# One-hot encode the full training feature set: COMPONENT_ARBITRARY and YEAR
# are replaced by their indicator columns.
ttr1 = pd.get_dummies(total_train['COMPONENT_ARBITRARY'])
ttr2 = pd.get_dummies(total_train['YEAR'])
total_one_train = pd.concat(
    [total_train.drop(columns=['COMPONENT_ARBITRARY', 'YEAR']), ttr1, ttr2],
    axis=1,
)
total_one_train.head()

Unnamed: 0,ANONYMOUS_1,SAMPLE_TRANSFER_DAY,ANONYMOUS_2,AG,AL,B,BA,BE,CA,CD,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,1486,7,200,0,3,93,0,0,3059,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1350,51,375,0,2,19,0,0,2978,0.0,...,0,0,0,0,0,0,0,0,1,0
2,2415,2,200,0,110,1,1,0,17,0.0,...,0,0,1,0,0,0,0,0,0,0
3,7389,2,200,0,8,3,0,0,1960,0.0,...,0,0,0,0,0,0,0,0,0,0
4,3954,4,200,0,1,157,0,0,71,0.0,...,0,0,1,0,0,0,0,0,0,0


In [22]:
# Save the one-hot encoded full training features.
# DataFrame.to_csv does not create missing folders, so make sure the nested
# output directory exists first; otherwise a fresh checkout fails here.
total_one_train_path = 'data/total/one-hot_encoding/total_one_train.csv'
os.makedirs(os.path.dirname(total_one_train_path), exist_ok=True)
total_one_train.to_csv(total_one_train_path, index=False)

## IterativeImputer

In [24]:
# Fill missing values in the full one-hot feature set with IterativeImputer
# (fixed random_state for reproducibility) and rebuild a labelled DataFrame.
#
# The YEAR dummy columns may carry non-string (e.g. integer) labels next to
# the string-named features. Recent scikit-learn releases reject DataFrames
# whose column names mix types, so every label is cast to str first. The CSV
# header written from this frame is unaffected by the cast.
total_one_features = total_one_train.rename(columns=str)
total_one_iter_train = pd.DataFrame(
    IterativeImputer(random_state=17).fit_transform(total_one_features),
    columns=total_one_features.columns,
)

In [25]:
# Save the imputed full training features.
# The target folder is created up front because DataFrame.to_csv raises if
# the parent directory is missing.
total_one_iter_path = 'data/total/one-hot_encoding/iterative/total_one_iter_train.csv'
os.makedirs(os.path.dirname(total_one_iter_path), exist_ok=True)
total_one_iter_train.to_csv(total_one_iter_path, index=False)

## Oversampling

In [28]:
# Oversample the imputed full feature set against train.Y_LABEL with two
# strategies and print the resulting (features, labels) shapes.
#
# fit_resample is the supported imbalanced-learn entry point; the older
# fit_sample alias used previously was removed from the library.
# NOTE(review): newer imbalanced-learn releases deprecate n_jobs on SVMSMOTE;
# confirm against the installed version.
svm = SVMSMOTE(random_state=17, n_jobs=-1)
total_one_iter_svm_train, total_one_iter_svm_label = svm.fit_resample(
    total_one_iter_train, train.Y_LABEL
)
print(total_one_iter_svm_train.shape, total_one_iter_svm_label.shape)

tomek = SMOTETomek(random_state=17, n_jobs=-1)
total_one_iter_tomek_train, total_one_iter_tomek_label = tomek.fit_resample(
    total_one_iter_train, train.Y_LABEL
)
print(total_one_iter_tomek_train.shape, total_one_iter_tomek_label.shape)

(25784, 70) (25784,)
(24264, 70) (24264,)


In [29]:
# Save the oversampled full feature sets and their labels.
# Each target folder is created before writing because to_csv does not
# create missing directories (the nested "oversampling/label" folders would
# otherwise have to be made by hand before a fresh run).
for frame, path in (
    # resampled features
    (total_one_iter_svm_train,
     'data/total/one-hot_encoding/iterative/oversampling/total_one_iter_svm_train.csv'),
    (total_one_iter_tomek_train,
     'data/total/one-hot_encoding/iterative/oversampling/total_one_iter_tomek_train.csv'),
    # resampled labels
    (total_one_iter_svm_label,
     'data/total/one-hot_encoding/iterative/oversampling/label/total_one_iter_svm_label.csv'),
    (total_one_iter_tomek_label,
     'data/total/one-hot_encoding/iterative/oversampling/label/total_one_iter_tomek_label.csv'),
):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    frame.to_csv(path, index=False)

# Part

## One-Hot Encoding

In [30]:
# One-hot encode the reduced training feature set: COMPONENT_ARBITRARY and
# YEAR are replaced by their indicator columns.
ptr1 = pd.get_dummies(part_train['COMPONENT_ARBITRARY'])
ptr2 = pd.get_dummies(part_train['YEAR'])
part_one_train = pd.concat(
    [part_train.drop(columns=['COMPONENT_ARBITRARY', 'YEAR']), ptr1, ptr2],
    axis=1,
)
part_one_train.head()

Unnamed: 0,ANONYMOUS_1,SAMPLE_TRANSFER_DAY,ANONYMOUS_2,AG,AL,B,BA,BE,CA,CD,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,1486,7,200,0,3,93,0,0,3059,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1350,51,375,0,2,19,0,0,2978,0.0,...,0,0,0,0,0,0,0,0,1,0
2,2415,2,200,0,110,1,1,0,17,0.0,...,0,0,1,0,0,0,0,0,0,0
3,7389,2,200,0,8,3,0,0,1960,0.0,...,0,0,0,0,0,0,0,0,0,0
4,3954,4,200,0,1,157,0,0,71,0.0,...,0,0,1,0,0,0,0,0,0,0


In [31]:
# Save the one-hot encoded reduced training features.
# DataFrame.to_csv does not create missing folders, so make sure the nested
# output directory exists first; otherwise a fresh checkout fails here.
part_one_train_path = 'data/part/one-hot_encoding/part_one_train.csv'
os.makedirs(os.path.dirname(part_one_train_path), exist_ok=True)
part_one_train.to_csv(part_one_train_path, index=False)

## IterativeImputer

In [32]:
# Fill missing values in the reduced one-hot feature set with
# IterativeImputer (fixed random_state) and rebuild a labelled DataFrame.
#
# The YEAR dummy columns may carry non-string (e.g. integer) labels next to
# the string-named features. Recent scikit-learn releases reject DataFrames
# whose column names mix types, so every label is cast to str first. The CSV
# header written below is unaffected by the cast.
part_one_features = part_one_train.rename(columns=str)
part_one_iter_train = pd.DataFrame(
    IterativeImputer(random_state=17).fit_transform(part_one_features),
    columns=part_one_features.columns,
)

# Create the output folder before saving: to_csv raises if it is missing.
part_one_iter_path = 'data/part/one-hot_encoding/iterative/part_one_iter_train.csv'
os.makedirs(os.path.dirname(part_one_iter_path), exist_ok=True)
part_one_iter_train.to_csv(part_one_iter_path, index=False)

## Oversampling

In [34]:
# Oversample the imputed reduced feature set against train.Y_LABEL with two
# strategies and print the resulting (features, labels) shapes.
#
# fit_resample is the supported imbalanced-learn entry point; the older
# fit_sample alias used previously was removed from the library.
# NOTE(review): newer imbalanced-learn releases deprecate n_jobs on SVMSMOTE;
# confirm against the installed version.
svm = SVMSMOTE(random_state=17, n_jobs=-1)
part_one_iter_svm_train, part_one_iter_svm_label = svm.fit_resample(
    part_one_iter_train, train.Y_LABEL
)
print(part_one_iter_svm_train.shape, part_one_iter_svm_label.shape)

tomek = SMOTETomek(random_state=17, n_jobs=-1)
part_one_iter_tomek_train, part_one_iter_tomek_label = tomek.fit_resample(
    part_one_iter_train, train.Y_LABEL
)
print(part_one_iter_tomek_train.shape, part_one_iter_tomek_label.shape)

(25784, 53) (25784,)
(25506, 53) (25506,)


In [35]:
# Save the oversampled reduced feature sets and their labels.
# Each target folder is created before writing because to_csv does not
# create missing directories (the nested "oversampling/label" folders would
# otherwise have to be made by hand before a fresh run).
for frame, path in (
    # resampled features
    (part_one_iter_svm_train,
     'data/part/one-hot_encoding/iterative/oversampling/part_one_iter_svm_train.csv'),
    (part_one_iter_tomek_train,
     'data/part/one-hot_encoding/iterative/oversampling/part_one_iter_tomek_train.csv'),
    # resampled labels
    (part_one_iter_svm_label,
     'data/part/one-hot_encoding/iterative/oversampling/label/part_one_iter_svm_label.csv'),
    (part_one_iter_tomek_label,
     'data/part/one-hot_encoding/iterative/oversampling/label/part_one_iter_tomek_label.csv'),
):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    frame.to_csv(path, index=False)