# Fill the void

Reference: Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, "Missing value estimation methods for DNA microarrays", Bioinformatics, Vol. 17, No. 6, 2001, pp. 520-525.


In [2]:
import os
# The data paths used in this notebook are relative. When README.md is not
# found in the current working directory, move one directory up
# (presumably to the repository root — confirm the notebook's location).
readme_here = os.path.exists("README.md")
if not readme_here:
    os.chdir("../")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the expression table and the phenotype table; the first CSV column
# becomes the index of each frame.
degenes = pd.read_csv('./data/final/degenes.csv', index_col=0)
pdata = pd.read_csv('./data/final/pData.csv', index_col=0)

# Keep only the part of each row label that precedes the first '///'.
first_symbols = [row_label.split('///')[0] for row_label in degenes.index]
degenes_t = degenes.T
degenes_t.columns = first_symbols

# NOTE(review): the values are divided by 10 twice, so `degenes` holds 1/10
# of the CSV values while `degenes_t` holds 1/100 of them — confirm that the
# double scaling is intended.
degenes = degenes_t.T / 10
degenes_t = degenes.T / 10

# Attach the phenotype columns to each sample row, matching on the index
# (default inner join: only samples present in both tables are kept).
degenes_t = pd.merge(degenes_t, pdata, left_index=True, right_index=True)
degenes_t


Unnamed: 0,GSDMB,TXN,SLC5A1,NSUN3,HSPA13,SOX9,ZKSCAN5,AMACR,LOC101060275,LOC101928625,...,age,biomarker_score,cancer_status,gender,>3cm,packyears,hemopytsis,lymphadenopathy,smoking_status,subjective_assessment
GSM93997,0.140096,0.235407,0.171271,0.183405,0.155741,0.132806,0.133945,0.136854,0.201459,0.188271,...,34.0,-2.253540,0,1.0,0.0,17.0,0.0,0.0,0.0,0.0
GSM94019,0.139721,0.220102,0.178908,0.183668,0.138633,0.150779,0.125344,0.139069,0.217829,0.154159,...,63.0,8.900589,1,1.0,1.0,75.0,0.0,1.0,0.0,1.0
GSM94020,0.143053,0.202522,0.179960,0.181274,0.130426,0.155367,0.127912,0.137341,0.207420,0.142284,...,,,1,,,,,,,
GSM94021,0.158784,0.229173,0.184660,0.181362,0.143928,0.161549,0.123178,0.138843,0.211655,0.181317,...,69.0,-3.460146,1,1.0,0.0,70.0,0.0,1.0,1.0,1.0
GSM94022,0.141694,0.231324,0.180819,0.183465,0.151670,0.161156,0.127023,0.128559,0.207278,0.159797,...,61.0,1.543728,1,1.0,0.0,80.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM98871,0.141979,0.191723,0.171112,0.182800,0.150196,0.160113,0.152147,0.140221,0.201324,0.155077,...,,,1,,,,,,,
GSM98872,0.149537,0.156204,0.179963,0.195238,0.144343,0.159775,0.136298,0.132398,0.168837,0.156908,...,,,1,,,,,,,
GSM98873,0.148468,0.215223,0.176867,0.190107,0.145777,0.151258,0.134320,0.142411,0.177251,0.171169,...,,,1,,,,,,,
GSM98874,0.148544,0.176236,0.179011,0.189968,0.141696,0.163867,0.143968,0.129607,0.184133,0.165384,...,,,1,,,,,,,


In [3]:
# Set cancer_status to 1 for every row that contains at least one missing
# value.
# NOTE(review): the comment previously attached to this cell said the status
# should become NaN, but the code assigns 1 — confirm which one is intended.
rows_with_missing = degenes_t.isnull().any(axis=1)
degenes_t.loc[rows_with_missing, 'cancer_status'] = 1

degenes_t

Unnamed: 0,GSDMB,TXN,SLC5A1,NSUN3,HSPA13,SOX9,ZKSCAN5,AMACR,LOC101060275,LOC101928625,...,age,biomarker_score,cancer_status,gender,>3cm,packyears,hemopytsis,lymphadenopathy,smoking_status,subjective_assessment
GSM93997,0.140096,0.235407,0.171271,0.183405,0.155741,0.132806,0.133945,0.136854,0.201459,0.188271,...,34.0,-2.253540,0,1.0,0.0,17.0,0.0,0.0,0.0,0.0
GSM94019,0.139721,0.220102,0.178908,0.183668,0.138633,0.150779,0.125344,0.139069,0.217829,0.154159,...,63.0,8.900589,1,1.0,1.0,75.0,0.0,1.0,0.0,1.0
GSM94020,0.143053,0.202522,0.179960,0.181274,0.130426,0.155367,0.127912,0.137341,0.207420,0.142284,...,,,1,,,,,,,
GSM94021,0.158784,0.229173,0.184660,0.181362,0.143928,0.161549,0.123178,0.138843,0.211655,0.181317,...,69.0,-3.460146,1,1.0,0.0,70.0,0.0,1.0,1.0,1.0
GSM94022,0.141694,0.231324,0.180819,0.183465,0.151670,0.161156,0.127023,0.128559,0.207278,0.159797,...,61.0,1.543728,1,1.0,0.0,80.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM98871,0.141979,0.191723,0.171112,0.182800,0.150196,0.160113,0.152147,0.140221,0.201324,0.155077,...,,,1,,,,,,,
GSM98872,0.149537,0.156204,0.179963,0.195238,0.144343,0.159775,0.136298,0.132398,0.168837,0.156908,...,,,1,,,,,,,
GSM98873,0.148468,0.215223,0.176867,0.190107,0.145777,0.151258,0.134320,0.142411,0.177251,0.171169,...,,,1,,,,,,,
GSM98874,0.148544,0.176236,0.179011,0.189968,0.141696,0.163867,0.143968,0.129607,0.184133,0.165384,...,,,1,,,,,,,


In [4]:
# Number of rows (samples) that still contain at least one missing value.
degenes_t.isna().any(axis="columns").sum()

29

In [5]:
from sklearn.impute import KNNImputer

# Fill every missing entry from the 5 nearest rows, weighting each neighbour
# by the inverse of its distance. All columns of degenes_t (including
# cancer_status) are passed to the imputer.
imputer = KNNImputer(weights="distance", n_neighbors=5)
imputed_values = imputer.fit_transform(degenes_t)

# fit_transform returns a bare array; restore the sample and column labels.
degenes_t_filled = pd.DataFrame(
    data=imputed_values,
    columns=degenes_t.columns,
    index=degenes_t.index,
)

In [6]:
# Display the labelled frame holding the KNN-imputed values.
degenes_t_filled

Unnamed: 0,GSDMB,TXN,SLC5A1,NSUN3,HSPA13,SOX9,ZKSCAN5,AMACR,LOC101060275,LOC101928625,...,age,biomarker_score,cancer_status,gender,>3cm,packyears,hemopytsis,lymphadenopathy,smoking_status,subjective_assessment
GSM93997,0.140096,0.235407,0.171271,0.183405,0.155741,0.132806,0.133945,0.136854,0.201459,0.188271,...,34.000000,-2.253540,0.0,1.000000,0.000000,17.000000,0.0,0.000000,0.000000,0.000000
GSM94019,0.139721,0.220102,0.178908,0.183668,0.138633,0.150779,0.125344,0.139069,0.217829,0.154159,...,63.000000,8.900589,1.0,1.000000,1.000000,75.000000,0.0,1.000000,0.000000,1.000000
GSM94020,0.143053,0.202522,0.179960,0.181274,0.130426,0.155367,0.127912,0.137341,0.207420,0.142284,...,61.265702,9.954619,1.0,0.810986,0.587714,79.819851,0.0,0.799501,0.196126,0.799501
GSM94021,0.158784,0.229173,0.184660,0.181362,0.143928,0.161549,0.123178,0.138843,0.211655,0.181317,...,69.000000,-3.460146,1.0,1.000000,0.000000,70.000000,0.0,1.000000,1.000000,1.000000
GSM94022,0.141694,0.231324,0.180819,0.183465,0.151670,0.161156,0.127023,0.128559,0.207278,0.159797,...,61.000000,1.543728,1.0,1.000000,0.000000,80.000000,0.0,1.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM98871,0.141979,0.191723,0.171112,0.182800,0.150196,0.160113,0.152147,0.140221,0.201324,0.155077,...,62.480836,6.628599,1.0,0.788076,0.418061,67.022229,0.0,0.793864,0.204471,0.793864
GSM98872,0.149537,0.156204,0.179963,0.195238,0.144343,0.159775,0.136298,0.132398,0.168837,0.156908,...,61.765367,3.834675,1.0,0.802547,0.601698,71.920097,0.0,0.791570,0.203087,0.791570
GSM98873,0.148468,0.215223,0.176867,0.190107,0.145777,0.151258,0.134320,0.142411,0.177251,0.171169,...,65.764222,-0.549258,1.0,0.805342,0.592084,60.644220,0.0,0.603276,0.395435,0.797935
GSM98874,0.148544,0.176236,0.179011,0.189968,0.141696,0.163867,0.143968,0.129607,0.184133,0.165384,...,65.216037,6.642173,1.0,0.788539,0.596169,65.447612,0.0,0.615293,0.221868,0.795949


In [7]:
# Imputed values are fractional; round the listed columns to whole numbers.
# The rounding is applied to every row of these columns, not only to the
# imputed ones.
to_round = [
    'gender',
    'age',
    'packyears',
    'hemopytsis',
    'lymphadenopathy',
    'smoking_status',
    'subjective_assessment',
    '>3cm',
]
rounded_columns = degenes_t_filled[to_round].round(decimals=0)
degenes_t_filled[to_round] = rounded_columns
degenes_t_filled

Unnamed: 0,GSDMB,TXN,SLC5A1,NSUN3,HSPA13,SOX9,ZKSCAN5,AMACR,LOC101060275,LOC101928625,...,age,biomarker_score,cancer_status,gender,>3cm,packyears,hemopytsis,lymphadenopathy,smoking_status,subjective_assessment
GSM93997,0.140096,0.235407,0.171271,0.183405,0.155741,0.132806,0.133945,0.136854,0.201459,0.188271,...,34.0,-2.253540,0.0,1.0,0.0,17.0,0.0,0.0,0.0,0.0
GSM94019,0.139721,0.220102,0.178908,0.183668,0.138633,0.150779,0.125344,0.139069,0.217829,0.154159,...,63.0,8.900589,1.0,1.0,1.0,75.0,0.0,1.0,0.0,1.0
GSM94020,0.143053,0.202522,0.179960,0.181274,0.130426,0.155367,0.127912,0.137341,0.207420,0.142284,...,61.0,9.954619,1.0,1.0,1.0,80.0,0.0,1.0,0.0,1.0
GSM94021,0.158784,0.229173,0.184660,0.181362,0.143928,0.161549,0.123178,0.138843,0.211655,0.181317,...,69.0,-3.460146,1.0,1.0,0.0,70.0,0.0,1.0,1.0,1.0
GSM94022,0.141694,0.231324,0.180819,0.183465,0.151670,0.161156,0.127023,0.128559,0.207278,0.159797,...,61.0,1.543728,1.0,1.0,0.0,80.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM98871,0.141979,0.191723,0.171112,0.182800,0.150196,0.160113,0.152147,0.140221,0.201324,0.155077,...,62.0,6.628599,1.0,1.0,0.0,67.0,0.0,1.0,0.0,1.0
GSM98872,0.149537,0.156204,0.179963,0.195238,0.144343,0.159775,0.136298,0.132398,0.168837,0.156908,...,62.0,3.834675,1.0,1.0,1.0,72.0,0.0,1.0,0.0,1.0
GSM98873,0.148468,0.215223,0.176867,0.190107,0.145777,0.151258,0.134320,0.142411,0.177251,0.171169,...,66.0,-0.549258,1.0,1.0,1.0,61.0,0.0,1.0,0.0,1.0
GSM98874,0.148544,0.176236,0.179011,0.189968,0.141696,0.163867,0.143968,0.129607,0.184133,0.165384,...,65.0,6.642173,1.0,1.0,1.0,65.0,0.0,1.0,0.0,1.0


In [8]:
# Write the complete imputed table to disk.
degenes_t_filled.to_csv('./data/final/degenes_with_pdata_filled.csv')

# Write only the last ten columns to a second file.
# NOTE(review): the hard-coded 10 presumably matches the number of pData
# columns appended by the merge — verify against pData.csv.
trailing_columns = slice(-10, None)
degenes_t_filled_pdata = degenes_t_filled.iloc[:, trailing_columns]
degenes_t_filled_pdata.to_csv('./data/final/pdata_filled.csv')