In [1]:
import sys
main_path = "../.."
sys.path.append(main_path)

import numpy as np
import pandas as pd
from impyute.imputation.cs import fast_knn
import os
import yaml
import copy

In [2]:
# Load the raw tab-separated dataset (the file is ISO-8859-1 encoded).
data = pd.read_csv(f'{main_path}/data/raw/colon_cancer.txt', sep="\t", encoding = "ISO-8859-1")

# Binary target from the first column: 'kontrol' -> 0, anything else -> 1.
label_values = np.ravel(data.iloc[:,0])
targets = np.where(label_values == 'kontrol', 0, 1)

# Feature block: positional columns 2..12, kept both as an array and as names.
feature_frame = data.iloc[:,2:13]
columns = feature_frame.columns
features = np.array(feature_frame)

# Project configuration; its 'decimal' entry is used later to map feature values.
with open(f'{main_path}/config.yaml', 'r') as f:
    config = yaml.safe_load(f)
decimal_forms = config['decimal']

# Output directory for every processed dataframe written by this notebook.
df_path = f"{main_path}/data/processed/dataframes"
os.makedirs(df_path, exist_ok=True)

### Baseline Approach

In [3]:
# Baseline: one-hot encode a fixed subset of columns as-is and write it out.
baseline_columns = ['mdm2', 'GAL3', 'TIM1', 'p16540', 'p16580']
baseline_data = data[baseline_columns]
one_hot_baseline = pd.get_dummies(baseline_data)
one_hot_baseline.to_csv(f"{df_path}/baseline_approach.csv", index=False)

# The targets are stored once, next to the feature dataframes.
targets_df = pd.DataFrame({'target': targets})
targets_df.to_csv(f"{df_path}/targets.csv", index=False)

### One-Hot Encoding (on categorical data) -> Data Imputation

In [4]:
# One-hot encode the feature columns.
# dtype=float is requested explicitly: recent pandas versions return boolean
# indicator columns by default, and a boolean column cannot hold the NaN
# markers written below without being upcast to object, which would leave
# `ohe.values` as a non-numeric matrix for the imputer.
ohe = pd.get_dummies(data.iloc[:,2:13], dtype=float)

# get_dummies (with its default dummy_na=False) encodes a missing value as a
# row of zeros across that column's indicators. Put NaN back into every
# indicator derived from a column wherever the source value was missing, so
# the imputation step sees those entries as missing rather than as "no category".
# NOTE(review): indicators are matched by the "<column>_" name prefix; this
# assumes no feature name is itself a "<other column>_" prefix — confirm.
for clm in columns:
    missing_rows = data[clm].isnull()
    indicator_cols = ohe.columns.str.startswith("{}_".format(clm))
    ohe.loc[missing_rows, indicator_cols] = np.nan

In [5]:
# Raise the Python interpreter's recursion limit before the first imputation.
# The notebook raises it before its other fast_knn call but originally only
# did so in later cells, so on a fresh kernel (Restart & Run All) this call
# would run with the default limit.
# NOTE(review): presumably fast_knn's neighbour search recurses deeply — confirm.
sys.setrecursionlimit(100000)

# Impute the NaN entries of the one-hot matrix with k-nearest-neighbours (k=30).
ohe_di = fast_knn(ohe.values, k=30)

# Re-attach the one-hot column names and persist the imputed (float) matrix.
ohe_di_df = pd.DataFrame(ohe_di, columns=ohe.columns)
ohe_di_df.to_csv(f"{df_path}/ohe_di.csv", index=False)

### One-Hot Encoding (on categorical data) -> Data Imputation -> Float to Int

In [6]:
# Raise the Python interpreter's recursion limit (this is a Python setting, not an OS one).
sys.setrecursionlimit(100000)

# The k-NN imputation yields floats; snap every entry to the nearest integer.
ohe_di_rounded = np.round(ohe_di)
ohe_di_fi = ohe_di_rounded.astype('int')

# Persist the integer version under the same one-hot column names.
ohe_di_fi_df = pd.DataFrame(data=ohe_di_fi, columns=ohe.columns)
ohe_di_fi_df.to_csv(f"{df_path}/ohe_di_fi.csv", index=False)

### Custom Data Transformation (CDT) -> Data Imputation

In [7]:
# Custom data transformation: translate every feature value through the
# `decimal_forms` lookup loaded from the config. Values that are not keys of
# the lookup become NaN, as with Series.map on a dict.
cdt = pd.DataFrame(features, columns=columns)
cdt = cdt.apply(lambda column: column.map(decimal_forms))

In [8]:
# Raise the Python interpreter's recursion limit before running the imputation.
sys.setrecursionlimit(100000)

# Fill the NaN entries of the transformed matrix with k-nearest-neighbours (k=30).
cdt_matrix = cdt.values
cdt_di = fast_knn(cdt_matrix, k=30)

# Restore the feature names and save the imputed (float) values.
cdt_di_df = pd.DataFrame(data=cdt_di, columns=columns)
cdt_di_df.to_csv(f"{df_path}/cdt_di.csv", index=False)

### Custom Data Transformation (CDT) -> Data Imputation -> Float to Int

In [9]:
# Imputed values are floats; round them to the nearest integer.
cdt_di_rounded = np.round(cdt_di)
cdt_di_fi = cdt_di_rounded.astype('int')

# Save the integer version under the original feature names.
cdt_di_fi_df = pd.DataFrame(data=cdt_di_fi, columns=columns)
cdt_di_fi_df.to_csv(f"{df_path}/cdt_di_fi.csv", index=False)

### Custom Data Transformation (CDT) -> Data Imputation -> One-Hot Encoding

In [10]:
# One-hot encode the imputed (unrounded) values of every feature column,
# using sparse-backed indicator columns, then write the result to disk.
cdt_di_ohe_df = pd.get_dummies(
    data=cdt_di_df,
    columns=columns,
    sparse=True,
)
cdt_di_ohe_df.to_csv(f"{df_path}/cdt_di_ohe.csv", index=False)

### Custom Data Transformation (CDT) -> Data Imputation -> Float to Int -> One-Hot Encoding

In [11]:
# Cast the rounded integers to strings, then one-hot encode every feature
# column with sparse-backed indicators and write the result to disk.
cdt_di_fi_as_str = cdt_di_fi_df.astype('str')
cdt_di_fi_ohe_df = pd.get_dummies(
    data=cdt_di_fi_as_str,
    columns=columns,
    sparse=True,
)
cdt_di_fi_ohe_df.to_csv(f"{df_path}/cdt_di_fi_ohe.csv", index=False)

# Sources:

- https://towardsdatascience.com/the-use-of-knn-for-missing-values-cf33d935c637