Take the outputs of `clean_data.ipynb` and apply the following preprocessing steps, in this order:

* standard scaling (`StandardScaler`)
* KNN imputation of missing values (`KNNImputer`)


Normally we would run these steps as part of the model training process; however,
`KNNImputer` is slow and uses a lot of memory, so we apply them once here and
save the results to `*_imputed.csv` files.

In [5]:
import pandas as pd
import numpy as np
from copy import deepcopy

from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline

%config Completer.use_jedi = False

Define Pipeline

In [2]:
# Step order is deliberate: standardise first, impute second.
# KNNImputer picks neighbours by Euclidean distance, so features
# left on different scales would distort which rows count as
# "nearest".
scaler, knn_imputer = StandardScaler(), KNNImputer(n_neighbors=5)

# (name, transformer) pairs, in execution order
steps = list(zip(("scaler", "imputer"), (scaler, knn_imputer)))

preprocess_pipeline = Pipeline(steps=steps)

Apply preprocessing to one file at a time

In [6]:
data_folder = "../data/"
label_column = "Credit_Score"
# Split whose statistics define the preprocessing (scaler mean/std and the
# pool of neighbours used by the KNN imputer).
train_file = "train_cleaned"
file_list = ["dev_cleaned", "test_cleaned", "train_cleaned"]


def load_split(name):
    """Read ``<data_folder><name>.csv`` and split it into features and labels.

    Parameters
    ----------
    name : str
        File name without the ``.csv`` extension.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The feature columns and the ``label_column`` values.

    Raises
    ------
    KeyError
        If the file has no ``label_column`` column.
    """
    split_df = pd.read_csv(data_folder + name + ".csv")
    split_labels = split_df.pop(label_column)
    return split_df, split_labels


# Fit the scaler and the imputer ONCE, on the training features only.
# Calling fit_transform on every file would give each split its own
# mean/std and its own neighbour pool, so the splits would not share a
# feature space and dev/test statistics would leak into their own
# preprocessing.
train_features = load_split(train_file)[0]
feature_columns = list(train_features.columns)
preprocess_pipeline.fit(train_features)
del train_features  # the loop below re-reads each file as needed

for ix, f in enumerate(file_list):

    print(ix)

    df, labels = load_split(f)

    # generate transformed data; select columns by name so every split is
    # presented to the pipeline in the same order it was fitted with
    final_values = preprocess_pipeline.transform(df[feature_columns])

    # KNNImputer can drop features that were entirely missing during fit;
    # fail loudly instead of attaching the wrong header to a column.
    if final_values.shape[1] != len(feature_columns):
        raise ValueError(
            "Preprocessing returned %d columns but %d feature columns were "
            "expected for file '%s'"
            % (final_values.shape[1], len(feature_columns), f)
        )

    # convert back to Pandas Dataframe, naming columns from the features
    # that were actually transformed (not from the pre-pop column list,
    # which is only correct when the label happens to be the last column)
    final_df = pd.DataFrame(final_values, columns=feature_columns, index=df.index)

    # Add back data labels as their own column so they keep their dtype
    # instead of being coerced into the float feature array
    final_df[label_column] = labels

    # write to file
    final_file_name = data_folder + f + "_imputed.csv"
    final_df.to_csv(final_file_name, header=True, index=False)

0
1
2
