# Data processing for modeling

# Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")  # kept first so import-time warnings below stay silenced

from pathlib import Path

import pandas as pd
from category_encoders import BinaryEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from constants import CAT_FEATURES, NUM_FEATURES

# Data Loading

In [2]:
# Read the interim copper splits: train features, test features and the
# categorical training target (in that order).
X_train, X_test, y_train_cat = map(
    pd.read_parquet,
    (
        "../../../../data/interim/copper/X_train.parquet",
        "../../../../data/interim/copper/X_test.parquet",
        "../../../../data/interim/copper/y_train_cat.parquet",
    ),
)

# Processing

In [3]:
# --- Categorical features -------------------------------------------------
# Binary-encode the categorical columns; the encoder is fitted on the training
# split only and then re-used on the test split.
binary_encoder = BinaryEncoder(cols=CAT_FEATURES)
X_train_cat = binary_encoder.fit_transform(X_train[CAT_FEATURES])
X_test_cat = binary_encoder.transform(X_test[CAT_FEATURES])

# --- Numerical features ---------------------------------------------------
# Standard-scale the numerical columns (fit on train, apply to test) and wrap
# the resulting arrays back into DataFrames that keep the original row index.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train[NUM_FEATURES]),
    columns=NUM_FEATURES,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test[NUM_FEATURES]),
    columns=NUM_FEATURES,
    index=X_test.index,
)

# Side-by-side join: scaled numerical block first, encoded categorical block second.
X_train_pro = pd.concat([X_train_scaled, X_train_cat], axis="columns")
X_test_pro = pd.concat([X_test_scaled, X_test_cat], axis="columns")

# --- Missing values -------------------------------------------------------
# KNN imputation (3 neighbours) on the combined matrix; it runs after scaling,
# so the imputer sees the scaled numerical features. Fitted on train only.
imputer = KNNImputer(n_neighbors=3)
X_train_knn = pd.DataFrame(
    imputer.fit_transform(X_train_pro),
    columns=X_train_pro.columns,
    index=X_train_pro.index,
)
X_test_knn = pd.DataFrame(
    imputer.transform(X_test_pro),
    columns=X_test_pro.columns,
    index=X_test_pro.index,
)

# --- Balanced training sets -----------------------------------------------
# Repeated oversampling with SMOTE followed by random undersampling. Ten
# resampled copies of the training data are generated; the loop index doubles
# as the random seed of both samplers so every copy is reproducible.
# NOTE(review): with sampling_strategy="auto" on both samplers, SMOTE should
# already equalise the class counts, so the undersampler presumably removes no
# rows -- confirm against the imbalanced-learn docs whether this is intended.
X_list, y_list = [], []

for i in tqdm(range(10)):
    over = SMOTE(sampling_strategy="auto", random_state=i)
    under = RandomUnderSampler(sampling_strategy="auto", random_state=i)

    X_train_knn_res, y_train_knn_res = under.fit_resample(
        *over.fit_resample(X_train_knn, y_train_cat)
    )

    X_list.append(X_train_knn_res)
    y_list.append(y_train_knn_res)

 10%|█         | 1/10 [00:00<00:01,  6.85it/s]

100%|██████████| 10/10 [00:00<00:00, 27.29it/s]


# Saving

In [4]:
# Persist the processed data in parquet format: the imputed train/test matrices
# plus one X/y file pair per balanced resample. File names and locations are the
# same as before, so anything reading these outputs keeps working.
out_dir = Path("../../../../data/interim/copper")
X_bal_dir = out_dir / "X_train_bal"
y_bal_dir = out_dir / "y_train_bal"

# to_parquet does not create missing folders; create them up front so a fresh
# checkout (or Restart & Run All) does not fail on the first write in the loop.
for folder in (X_bal_dir, y_bal_dir):
    folder.mkdir(parents=True, exist_ok=True)

X_train_knn.to_parquet(out_dir / "X_train_rf.parquet")
X_test_knn.to_parquet(out_dir / "X_test_rf.parquet")

# X_list and y_list are filled in lockstep, so position i pairs features with targets.
for i, (X, y) in enumerate(zip(X_list, y_list)):
    X.to_parquet(X_bal_dir / f"X_{i}.parquet")
    y.to_parquet(y_bal_dir / f"y_{i}.parquet")
