In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer # For imputing missing values using K-Nearest Neighbors
import random
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Seed Python's built-in RNG before anything else runs
random.seed(10)

# Read the marksheet, then report row count, per-column NaN totals and the leading rows
df = pd.read_csv('marksheet.csv')
print(len(df), df.isna().sum(), df.head(), sep='\n')
# Cell result: the columns that contain at least one missing entry
df.columns[df.isna().any()]

1500
patient_id           0
study_id             0
mri_date             0
patient_age          0
psa                 40
psad               451
prostate_volume     27
histopath_type     499
lesion_GS          499
lesion_ISUP        499
case_ISUP            0
case_csPCa           0
dtype: int64
   patient_id  study_id    mri_date  patient_age   psa  psad  prostate_volume  \
0       10000   1000000  2019-07-02           73   7.7   NaN             55.0   
1       10001   1000001  2016-05-27           64   8.7  0.09            102.0   
2       10002   1000002  2021-04-18           58   4.2  0.06             74.0   
3       10003   1000003  2019-04-05           72  13.0   NaN             71.5   
4       10004   1000004  2020-10-21           67   8.0  0.10             78.0   

  histopath_type lesion_GS lesion_ISUP  case_ISUP case_csPCa  
0           MRBx       0+0           0          0         NO  
1            NaN       NaN         NaN          0         NO  
2            NaN       NaN    

Index(['psa', 'psad', 'prostate_volume', 'histopath_type', 'lesion_GS',
       'lesion_ISUP'],
      dtype='object')

In [3]:
# Remove the biopsy columns from the working frame (still an in-place drop)
df.drop(columns=['histopath_type', 'lesion_GS', 'lesion_ISUP', 'case_ISUP'], inplace=True)
# Cell result: which of the remaining columns still have gaps
df.columns[df.isna().any()]


Index(['psa', 'psad', 'prostate_volume'], dtype='object')

In [4]:
# If 2 out of ['psa', 'psad', 'prostate_volume'] are present, we can calculate the remaining value
def fill_missing(row):
    """Derive the one missing value among psa / psad / prostate_volume.

    The three fields are tied together by ``psad = psa / prostate_volume``,
    so when exactly one of them is missing it is computed from the other two
    and rounded to 2 decimals.

    Parameters
    ----------
    row : pd.Series
        One record containing at least the keys 'psa', 'psad' and
        'prostate_volume' (this is what ``DataFrame.apply(..., axis=1)``
        passes in).

    Returns
    -------
    pd.Series
        ``row`` itself when nothing can be derived (zero or several of the
        three values missing), otherwise a copy with the gap filled. When the
        required divisor is 0 the gap is left as NaN.
    """
    related = ['psa', 'psad', 'prostate_volume']
    # Count gaps only among the three related fields: a NaN in some other
    # column must neither block the computation nor cause an existing value
    # to be recomputed and overwritten.
    nr_missing = row[related].isnull().sum()
    if nr_missing != 1:
        return row

    # Work on a copy; mutating the object handed over by DataFrame.apply is
    # not supported by pandas.
    row = row.copy()
    if pd.isnull(row['psad']):
        # Skip a zero volume: the quotient would be inf (or raise ZeroDivisionError).
        if row['prostate_volume'] != 0:
            row['psad'] = round(row['psa'] / row['prostate_volume'], 2)
    elif pd.isnull(row['prostate_volume']):
        # Same guard for a zero density.
        if row['psad'] != 0:
            row['prostate_volume'] = round(row['psa'] / row['psad'], 2)
    else:
        # Only psa can be the missing one here.
        row['psa'] = round(row['psad'] * row['prostate_volume'], 2)
    return row

# Run fill_missing once per record (row-wise) and rebind df to the result
df = df.apply(fill_missing, axis="columns")

In [5]:
# Per-column count of values still missing, then write the frame to CSV without the index
print(df.isnull().sum())
df.to_csv(path_or_buf='marksheet_psad_computed.csv', index=False)

patient_id          0
study_id            0
mri_date            0
patient_age         0
psa                33
psad               52
prostate_volume    25
case_csPCa          0
dtype: int64


In [6]:
# KNN imputation: fill the remaining gaps in the numeric features with KNNImputer
# NOTE(review): KNNImputer is fed the raw, unscaled columns here, so larger-valued
# features may dominate neighbour selection — consider scaling first; confirm intent.
num_features = ['patient_age', 'psa', 'psad', 'prostate_volume']
cols = list(df.columns)  # original column order, restored below
k = 3
knn_imputer = KNNImputer(n_neighbors=k)
df_filled_knn = df.copy()
# fit_transform returns a bare ndarray. Rebuild the frame on df's own index so the
# label-aligned column assignment matches row for row; with the default RangeIndex
# any df whose index is not 0..n-1 would silently receive NaN instead of the
# imputed values.
df_filled_knn[num_features] = pd.DataFrame(
    knn_imputer.fit_transform(df[num_features]),
    columns=num_features,
    index=df.index,
)

df_filled_knn = df_filled_knn[cols]
# Keep the same 2-decimal precision used for the computed values
for col in ['psa', 'psad', 'prostate_volume']:
    df_filled_knn[col] = df_filled_knn[col].astype(float).round(2)
print(len(df_filled_knn))
print(df_filled_knn.isna().sum())
print(df_filled_knn.head())
df_filled_knn.to_csv('marksheet_filled_knn.csv', index=False)
# X_test_imputed = pd.DataFrame(knn_imputer.transform(X_test), columns=X_test.columns)

1500
patient_id         0
study_id           0
mri_date           0
patient_age        0
psa                0
psad               0
prostate_volume    0
case_csPCa         0
dtype: int64
   patient_id  study_id    mri_date  patient_age   psa  psad  prostate_volume  \
0       10000   1000000  2019-07-02         73.0   7.7  0.14             55.0   
1       10001   1000001  2016-05-27         64.0   8.7  0.09            102.0   
2       10002   1000002  2021-04-18         58.0   4.2  0.06             74.0   
3       10003   1000003  2019-04-05         72.0  13.0  0.18             71.5   
4       10004   1000004  2020-10-21         67.0   8.0  0.10             78.0   

  case_csPCa  
0         NO  
1         NO  
2         NO  
3         NO  
4         NO  


In [7]:
# Regression-based imputation: IterativeImputer with a BayesianRidge estimator
# (reuses num_features and cols defined in the KNN cell).
# random_state is pinned because random.seed() does not reach scikit-learn; it only
# matters if stochastic options of the imputer are switched on later.
imputer = IterativeImputer(estimator=BayesianRidge(), random_state=10)
df_filled_reg = df.copy()
# fit_transform returns a bare ndarray. Rebuild the frame on df's own index so the
# label-aligned column assignment matches row for row; with the default RangeIndex
# any df whose index is not 0..n-1 would silently receive NaN instead of the
# imputed values.
df_filled_reg[num_features] = pd.DataFrame(
    imputer.fit_transform(df[num_features]),
    columns=num_features,
    index=df.index,
)

df_filled_reg = df_filled_reg[cols]
# Keep the same 2-decimal precision used for the computed values
for col in ['psa', 'psad', 'prostate_volume']:
    df_filled_reg[col] = df_filled_reg[col].astype(float).round(2)
print(df_filled_reg.isna().sum())
print(df_filled_reg.head())
df_filled_reg.to_csv('marksheet_filled_regression.csv', index=False)

patient_id         0
study_id           0
mri_date           0
patient_age        0
psa                0
psad               0
prostate_volume    0
case_csPCa         0
dtype: int64
   patient_id  study_id    mri_date  patient_age   psa  psad  prostate_volume  \
0       10000   1000000  2019-07-02         73.0   7.7  0.14             55.0   
1       10001   1000001  2016-05-27         64.0   8.7  0.09            102.0   
2       10002   1000002  2021-04-18         58.0   4.2  0.06             74.0   
3       10003   1000003  2019-04-05         72.0  13.0  0.18             71.5   
4       10004   1000004  2020-10-21         67.0   8.0  0.10             78.0   

  case_csPCa  
0         NO  
1         NO  
2         NO  
3         NO  
4         NO  


In [8]:
# Optional: reload the KNN result from disk instead of using the in-memory frame
# df_filled_knn = pd.read_csv("marksheet_filled_knn.csv")
# Whole-frame comparison of the KNN-filled and regression-filled results;
# `equals` yields a single boolean, shown as the cell output.
df_filled_knn.equals(df_filled_reg)

False