# Fill the void

Reference: Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, "Missing value estimation methods for DNA microarrays", Bioinformatics, Vol. 17, No. 6, 2001, pp. 520-525.


In [None]:
import os
# The data paths used in this notebook are relative; presumably README.md
# marks the project root, so step up one directory when it is not found
# in the current working directory.
has_readme = os.path.exists("README.md")
if not has_readme:
    os.chdir("../")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load both tables, using the first CSV column as the row index.
degenes = pd.read_csv('./data/final/degenes.csv', index_col=0)
pdata = pd.read_csv('./data/final/pData.csv', index_col=0)

# Work on the transposed table so the row labels of `degenes` become
# columns, and keep only the part of each label before the first '///'.
degenes_t = degenes.T
degenes_t.columns = [label.partition('///')[0] for label in degenes_t.columns]

# Rebuild `degenes` with the shortened labels, scaled down by 10.
degenes = degenes_t.T / 10

# NOTE(review): `degenes` is already divided by 10 above, so `degenes_t`
# ends up divided by 100 relative to the raw file -- confirm the second
# division is intentional.
# Join with pdata on the shared row index (default inner join: only rows
# present in both tables are kept).
degenes_t = (degenes.T / 10).merge(pdata, left_index=True, right_index=True)
degenes_t


In [None]:
# For every row that contains at least one missing value, set
# cancer_status to 1.
# NOTE(review): the value written is 1, not NaN -- confirm that 1 is the
# intended value for incomplete rows.
rows_with_nan = degenes_t.isnull().any(axis=1)
degenes_t.loc[rows_with_nan, 'cancer_status'] = 1
# Inspection only: the selected values are not stored anywhere.
degenes_t.loc[rows_with_nan, 'cancer_status']

degenes_t

In [None]:
# Number of rows that have at least one missing value.
degenes_t.isna().any(axis="columns").sum()

In [None]:
from sklearn.impute import KNNImputer
# Fill missing entries with a k-nearest-neighbours imputer: 5 neighbours,
# weighted by distance (weights="distance").
imputer = KNNImputer(weights="distance", n_neighbors=5)

# fit_transform returns a plain array, so re-attach the original row and
# column labels.
# NOTE(review): this assumes the imputer keeps every input column; confirm
# no column of degenes_t is entirely NaN, otherwise the shapes will differ.
degenes_t_filled = pd.DataFrame(
    imputer.fit_transform(degenes_t),
    columns=degenes_t.columns,
    index=degenes_t.index,
)

In [None]:
degenes_t_filled

In [None]:
# Round the listed columns to 0 decimals after imputation (presumably
# because these variables are meant to hold whole numbers -- confirm).
to_round = [
    'gender',
    'age',
    'packyears',
    'hemopytsis',
    'lymphadenopathy',
    'smoking_status',
    'subjective_assessment',
    '>3cm',
]
degenes_t_filled[to_round] = degenes_t_filled[to_round].round()
degenes_t_filled

In [None]:
# Slice off the last 10 columns of the filled table.
# NOTE(review): the hard-coded 10 presumably matches the number of pdata
# columns appended by the merge -- confirm against pData.csv.
degenes_t_filled_pdata = degenes_t_filled.iloc[:, -10:]

# Write the complete filled table first, then the 10-column subset.
degenes_t_filled.to_csv('./data/final/degenes_with_pdata_filled.csv')
degenes_t_filled_pdata.to_csv('./data/final/pdata_filled.csv')