# Data Preprocessing
We will preprocess the data by removing outliers, scaling the predictor values, and label-encoding the target classes. The scaler is fitted on the training data only and then saved, so that the same fitted scaler can be reused to scale the validation, test, and any future input data.

The preprocessing steps are based on the findings of our Exploratory Data Analysis (EDA).

In [1]:
# Make the parent directory importable so that the `src` package can be
# imported by the following cell.
# The entry is stored as an absolute path because a relative '..' is resolved
# against the current working directory, which a later cell changes with
# os.chdir. The membership check keeps re-runs of this cell from appending
# duplicate entries.
import os
import sys

if os.path.abspath(os.pardir) not in sys.path:
    sys.path.append(os.path.abspath(os.pardir))

In [2]:
import src.util as utils

import yaml
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
# Change the current OS working directory to the parent of the directory the
# notebook was started in, so that relative paths used later (for example the
# 'config' directory) are resolved from there.
# NOTE(review): this is not idempotent - every re-run of this cell moves the
# working directory up one more level.

os.chdir(os.path.dirname(os.getcwd()))
os.getcwd()

'd:\\ML\\PACMANN INTRO PROJECT'

Load configuration from config/config.yml

In [4]:
# Project configuration; the cells below read the data set paths, predictor
# names, target classes and scaler path from it.
config = utils.load_config()

In [5]:
# Training split: the first configured path is loaded as the predictors and
# the second one as the target.
X_train = utils.pickle_load(config['train_set_path'][0])
y_train = utils.pickle_load(config['train_set_path'][1])

# Predictors and target side by side, for inspection only.
train_set = pd.concat(objs=[X_train, y_train], axis='columns')
train_set

Unnamed: 0,MQ2,MQ3,MQ5,MQ6,MQ7,MQ8,MQ135,Gas
160,702,521,418,413,647,646,450,NoGas
3866,553,346,321,364,577,558,282,Smoke
5337,526,423,378,351,404,270,388,Mixture
819,770,530,423,421,586,572,461,NoGas
4476,762,411,395,386,598,644,374,Smoke
...,...,...,...,...,...,...,...,...
5785,595,409,384,354,411,267,384,Mixture
2893,806,531,529,509,657,767,572,Perfume
2285,700,528,375,368,565,525,433,Perfume
6311,647,441,437,396,454,298,415,Mixture


### Data Loading
The data sets are loaded from the pickle files generated in the data preparation step.

In [6]:
# Test split: the first configured path is loaded as the predictors and the
# second one as the target.
X_test = utils.pickle_load(config['test_set_path'][0])
y_test = utils.pickle_load(config['test_set_path'][1])

# Predictors and target side by side, for inspection only.
test_set = pd.concat(objs=[X_test, y_test], axis='columns')
test_set

Unnamed: 0,MQ2,MQ3,MQ5,MQ6,MQ7,MQ8,MQ135,Gas
5986,539,433,381,359,398,280,403,Mixture
1794,740,521,458,456,680,739,495,Perfume
3934,653,352,339,369,575,580,297,Smoke
1606,794,519,515,448,692,726,528,Perfume
2779,808,527,531,518,685,788,587,Perfume
...,...,...,...,...,...,...,...,...
3409,685,359,396,397,570,578,295,Smoke
3492,555,344,338,375,572,558,282,Smoke
3108,741,524,439,437,661,684,468,Perfume
1704,770,524,505,476,698,741,512,Perfume


In [7]:
# Validation split: the first configured path is loaded as the predictors and
# the second one as the target.
X_val = utils.pickle_load(config['val_set_path'][0])
y_val = utils.pickle_load(config['val_set_path'][1])

# Predictors and target side by side, for inspection only.
val_set = pd.concat(objs=[X_val, y_val], axis='columns')
val_set

Unnamed: 0,MQ2,MQ3,MQ5,MQ6,MQ7,MQ8,MQ135,Gas
2202,799,529,515,507,696,768,527,Perfume
5614,691,445,466,420,493,368,444,Mixture
6242,669,448,444,410,483,337,441,Mixture
6231,704,456,494,456,611,464,472,Mixture
5431,663,423,456,415,535,391,444,Mixture
...,...,...,...,...,...,...,...,...
6348,584,428,404,370,421,272,393,Mixture
4401,756,406,373,388,594,631,340,Smoke
3465,614,347,353,383,572,566,284,Smoke
3198,738,529,396,397,566,579,442,Perfume


### Outlier Removal
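We will use Tukey's method, also known as the IQR method, with a threshold of 1.5 to remove outliers from the training set: values lying outside the fences Q1 − 1.5·IQR and Q3 + 1.5·IQR are treated as outliers. Only the training set is filtered; the validation and test sets are left unchanged.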

In [8]:
def outlier_removal(column: pd.Series, threshold: float = 1.5) -> pd.Series:
    """Return the values of ``column`` that lie within Tukey's fences.

    The fences are ``Q1 - threshold * IQR`` and ``Q3 + threshold * IQR``,
    where ``Q1`` and ``Q3`` are the 25th and 75th percentiles of ``column``
    and ``IQR = Q3 - Q1``.

    Args:
        column: Numeric series to filter.
        threshold: Multiplier applied to the IQR to position the fences.

    Returns:
        A series holding only the values inside the fences (bounds included).
        The surviving values keep their original index labels; NaN values
        never satisfy the comparison and are therefore dropped as well.
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - (threshold * iqr)
    upper_bound = q3 + (threshold * iqr)
    # Inclusive comparison: a value lying exactly on a fence is not an
    # outlier. With strict inequalities a constant column (IQR == 0, both
    # fences equal to that constant) would lose every one of its values.
    return column[(column >= lower_bound) & (column <= upper_bound)]

In [9]:
# Filter every predictor column of the training set independently.
# Assigning the shortened series back aligns on the index, so the rows whose
# value was removed end up holding NaN in that column.
for predictor in config['predictors']:
    X_train[predictor] = outlier_removal(X_train[predictor])

In [10]:
# The previous cell has already replaced every out-of-fence predictor value
# with NaN. The outlier loop is deliberately NOT repeated here: a second pass
# would recompute the fences on the already filtered data and silently remove
# additional rows, which is not the single-pass Tukey filter described above.
#
# Drop the rows that contain at least one removed value, filtering the target
# first (while X_train still holds the NaN markers) so both stay aligned.
y_train = y_train[X_train.notna().all(axis=1)]
X_train = X_train.dropna(how='any')

### Data scaling

In [11]:
def standard_scaler(X):
    """Fit a ``StandardScaler`` on ``X`` and return the scaled data with it.

    Args:
        X: Data to fit the scaler on and to scale.

    Returns:
        A ``(X_scaled, scaler)`` tuple: the transformed ``X`` and the fitted
        scaler, which can be reused to transform other data.
    """
    fitted_scaler = StandardScaler().fit(X)
    return fitted_scaler.transform(X), fitted_scaler

In [12]:
# Fit the scaler on the (outlier-filtered) training predictors and scale them.
X_train_feng, scaler = standard_scaler(X_train)
# Persist the fitted scaler so it can be reloaded to scale other data later.
utils.pickle_dump(scaler, config['scaler_path'])

In [13]:
# Scale the validation and test predictors with the scaler fitted on the
# training data; the scaler is only applied here, never refitted.
X_val_feng = scaler.transform(X_val)
X_test_feng = scaler.transform(X_test)

### Data labeling

In [14]:
def label_encoder(y, target_classes, save_encoder_classes=False, config_file=None):
    """Encode class labels as integers using a fixed list of target classes.

    The encoder is fitted on ``target_classes`` rather than on ``y``, so the
    label-to-integer mapping is the same for every data split, even when a
    split does not contain all of the classes.

    Args:
        y: Array-like of class labels to encode. ``LabelEncoder.transform``
            raises ``ValueError`` for labels missing from ``target_classes``.
        target_classes: Complete collection of class labels to fit on.
        save_encoder_classes: When true (and ``config_file`` is given), store
            the fitted classes under the ``encoder_classes`` key of the YAML
            configuration file.
        config_file: Name of the YAML file inside the ``config`` directory,
            resolved relative to the current working directory. When it is
            falsy nothing is saved, even if ``save_encoder_classes`` is true.

    Returns:
        The integer-encoded labels of ``y``.
    """
    encoder = LabelEncoder()
    encoder.fit(target_classes)
    y_label_encoded = encoder.transform(y)

    if save_encoder_classes and config_file:
        config_path = os.path.join('config', config_file)
        with open(config_path, 'r') as file:
            # safe_load returns None for an empty document; fall back to an
            # empty mapping instead of failing on the assignment below.
            stored_config = yaml.safe_load(file) or {}
        stored_config['encoder_classes'] = encoder.classes_.tolist()

        # Serialise before reopening the file for writing, so that a failing
        # dump cannot leave a truncated configuration file behind.
        # NOTE: the file is rewritten from the parsed data, so YAML comments
        # and the original formatting are not preserved.
        serialized = yaml.safe_dump(stored_config)
        with open(config_path, 'w') as file:
            file.write(serialized)

    return y_label_encoded

In [15]:
# Encode the target of every split with the same class list. The call for the
# test split additionally stores the encoder classes in config/config.yaml.
y_test_feng = label_encoder(
    y_test.values.ravel(),
    config['target_classes'],
    save_encoder_classes=True,
    config_file='config.yaml',
)
y_train_feng = label_encoder(
    y_train.values.ravel(),
    config['target_classes'],
)
y_val_feng = label_encoder(
    y_val.values.ravel(),
    config['target_classes'],
)

### Preprocessed data pickle dumping

In [16]:
# Persist every preprocessed split: the scaled predictors go to the first
# configured path of the split and the encoded target to the second one.
for split_key, split_X, split_y in (
    ('train_feng_set_path', X_train_feng, y_train_feng),
    ('test_feng_set_path', X_test_feng, y_test_feng),
    ('val_feng_set_path', X_val_feng, y_val_feng),
):
    utils.pickle_dump(split_X, config[split_key][0])
    utils.pickle_dump(split_y, config[split_key][1])