In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import json
from sklearn.impute import KNNImputer # For imputing missing values using K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier # K-Nearest Neighbors classification model
from sklearn.model_selection import train_test_split
import random
from sklearn.preprocessing import LabelEncoder

In [None]:
# Seed Python's built-in random number generator.
random.seed(10)

# Load the marksheet and keep its column names as a plain list.
df = pd.read_csv('/content/marksheet.csv')
cols = df.columns.tolist()

# Overview: number of rows, missing values per column, first rows.
print(df.shape[0])
print(df.isnull().sum())
print(df.head())

1500
patient_id           0
study_id             0
mri_date             0
patient_age          0
psa                 40
psad               451
prostate_volume     27
histopath_type     499
lesion_GS          499
lesion_ISUP        499
case_ISUP            0
case_csPCa           0
dtype: int64
   patient_id  study_id    mri_date  patient_age   psa  psad  prostate_volume  \
0       10000   1000000  2019-07-02           73   7.7   NaN             55.0   
1       10001   1000001  2016-05-27           64   8.7  0.09            102.0   
2       10002   1000002  2021-04-18           58   4.2  0.06             74.0   
3       10003   1000003  2019-04-05           72  13.0   NaN             71.5   
4       10004   1000004  2020-10-21           67   8.0  0.10             78.0   

  histopath_type lesion_GS lesion_ISUP  case_ISUP case_csPCa  
0           MRBx       0+0           0          0         NO  
1            NaN       NaN         NaN          0         NO  
2            NaN       NaN    

In [None]:
# Fill the missing values of the numerical columns with a KNN imputer and
# save the result to 'marksheet_filled_numerical.csv'.
num_features = ['patient_age', 'psa', 'psad', 'prostate_volume']
k = 3  # number of neighbours the imputer uses for each missing value
knn_imputer = KNNImputer(n_neighbors=k)

# Copy first: a plain `df_filled = df` would only alias the raw frame, and the
# column assignment below would then overwrite `df` in place.
df_filled = df.copy()
# fit_transform returns a bare array; rebuild the frame on df's own index so
# the column assignment cannot misalign rows.
df_filled[num_features] = pd.DataFrame(
    knn_imputer.fit_transform(df[num_features]),
    columns=num_features,
    index=df.index,
)

df_filled = df_filled[cols]
print(len(df_filled))
print(df_filled.isna().sum())
print(df_filled.head())
df_filled.to_csv('marksheet_filled_numerical.csv', index=False)

1500
patient_id           0
study_id             0
mri_date             0
patient_age          0
psa                  0
psad                 0
prostate_volume      0
histopath_type     499
lesion_GS          499
lesion_ISUP        499
case_ISUP            0
case_csPCa           0
dtype: int64
   patient_id  study_id    mri_date  patient_age   psa      psad  \
0       10000   1000000  2019-07-02         73.0   7.7  0.156667   
1       10001   1000001  2016-05-27         64.0   8.7  0.090000   
2       10002   1000002  2021-04-18         58.0   4.2  0.060000   
3       10003   1000003  2019-04-05         72.0  13.0  0.170000   
4       10004   1000004  2020-10-21         67.0   8.0  0.100000   

   prostate_volume histopath_type lesion_GS lesion_ISUP  case_ISUP case_csPCa  
0             55.0           MRBx       0+0           0          0         NO  
1            102.0            NaN       NaN         NaN          0         NO  
2             74.0            NaN       NaN         NaN  

In [None]:
# Encode the categorical columns as integer labels, stored in new
# '<column>_encoded' columns next to the originals.

categorical_features = ['histopath_type', 'lesion_GS', 'lesion_ISUP']
# Numerical columns followed by the encoded categorical columns.
num_label_features = ['patient_age', 'psa', 'psad', 'prostate_volume'] + [
    cat_col + '_encoded' for cat_col in categorical_features
]
df_filled = pd.read_csv('/content/marksheet.csv')

# One encoder per column, kept in a dict: refitting a single shared encoder
# would discard the class mapping of every column but the last, making it
# impossible to translate the integer labels back.
# NOTE(review): the source columns contain NaN; the encoder appears to map
# those to a label of their own rather than leaving them missing - confirm
# this is the intended treatment.
label_encoders = {}
for cat_col in categorical_features:
    label_encoder = LabelEncoder()
    df_filled[cat_col + '_encoded'] = label_encoder.fit_transform(df_filled[cat_col])
    label_encoders[cat_col] = label_encoder

print(df_filled.head())

   patient_id  study_id    mri_date  patient_age   psa  psad  prostate_volume  \
0       10000   1000000  2019-07-02           73   7.7   NaN             55.0   
1       10001   1000001  2016-05-27           64   8.7  0.09            102.0   
2       10002   1000002  2021-04-18           58   4.2  0.06             74.0   
3       10003   1000003  2019-04-05           72  13.0   NaN             71.5   
4       10004   1000004  2020-10-21           67   8.0  0.10             78.0   

  histopath_type lesion_GS lesion_ISUP  case_ISUP case_csPCa  \
0           MRBx       0+0           0          0         NO   
1            NaN       NaN         NaN          0         NO   
2            NaN       NaN         NaN          0         NO   
3          SysBx       0+0           0          0         NO   
4     SysBx+MRBx   0+0,0+0         0,0          0         NO   

   histopath_type_encoded  lesion_GS_encoded  lesion_ISUP_encoded  
0                       0                  0                

In [None]:
# Run a KNN imputer over the columns listed in num_label_features and write
# the imputed values back into df_filled.
k = 3
knn_imputer = KNNImputer(n_neighbors=k)
imputed_values = knn_imputer.fit_transform(df_filled[num_label_features])
df_filled[num_label_features] = pd.DataFrame(imputed_values, columns=num_label_features)

# Report row count, remaining missing values per column, and the first rows.
for report in (len(df_filled), df_filled.isna().sum(), df_filled.head()):
    print(report)

# Restrict the frame to the columns in `cols` (bound in another cell) and save it.
df_filled = df_filled.loc[:, cols]
print(df_filled.head())
df_filled.to_csv('marksheet_filled.csv', index=False)

1500
patient_id                  0
study_id                    0
mri_date                    0
patient_age                 0
psa                         0
psad                        0
prostate_volume             0
histopath_type            499
lesion_GS                 499
lesion_ISUP               499
case_ISUP                   0
case_csPCa                  0
histopath_type_encoded      0
lesion_GS_encoded           0
lesion_ISUP_encoded         0
dtype: int64
   patient_id  study_id    mri_date  patient_age   psa      psad  \
0       10000   1000000  2019-07-02         73.0   7.7  0.153333   
1       10001   1000001  2016-05-27         64.0   8.7  0.090000   
2       10002   1000002  2021-04-18         58.0   4.2  0.060000   
3       10003   1000003  2019-04-05         72.0  13.0  0.133333   
4       10004   1000004  2020-10-21         67.0   8.0  0.100000   

   prostate_volume histopath_type lesion_GS lesion_ISUP  case_ISUP case_csPCa  \
0             55.0           MRBx       0+

In [None]:
# Compare marksheet_filled.csv with marksheet_filled_numerical.csv.
# The files are read from the working directory, which is where the
# to_csv(...) calls in this notebook write them.

df_filled = pd.read_csv('marksheet_filled.csv')
df_filled_numerical = pd.read_csv('marksheet_filled_numerical.csv')

# True only if both frames are identical.
print(df_filled.equals(df_filled_numerical))
# Cell-by-cell differences: 'self' is marksheet_filled, 'other' is
# marksheet_filled_numerical; NaN marks cells that are equal in both.
print(df_filled.compare(df_filled_numerical))

False
           psa                psad           prostate_volume      
          self     other      self     other            self other
0          NaN       NaN  0.153333  0.156667             NaN   NaN
3          NaN       NaN  0.133333  0.170000             NaN   NaN
9          NaN       NaN  0.140000  0.206667             NaN   NaN
11         NaN       NaN  0.233333  0.206667             NaN   NaN
14    7.333333  5.666667       NaN       NaN             NaN   NaN
...        ...       ...       ...       ...             ...   ...
1485       NaN       NaN  0.160000  0.060000             NaN   NaN
1487       NaN       NaN  0.120000  0.093333             NaN   NaN
1489       NaN       NaN  0.100000  0.113333             NaN   NaN
1494       NaN       NaN  0.160000  0.166667             NaN   NaN
1498       NaN       NaN  0.130000  0.120000             NaN   NaN

[439 rows x 6 columns]


In [None]:
# Split df by whether one column is present, as preparation for filling it.
# TODO: basically classification
col = 'histopath_type'
mask = df[col].notna()
df_filled = df.loc[mask]      # rows where `col` has a value
df_to_fill = df.loc[~mask]    # rows where `col` is missing
print(df_to_fill.head())
# Unfinished sketch of the train/validation split, kept for reference.
# NOTE(review): random.random() returns a float, while random_state expects an int.
# X = df.loc[:, df.columns != col]
# y = df[col]
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=random.random())

    patient_id  study_id    mri_date  patient_age       psa  psad  \
1        10001   1000001  2016-05-27         64.0  8.700000  0.09   
2        10002   1000002  2021-04-18         58.0  4.200000  0.06   
10       10010   1000010  2019-05-11         67.0  6.190000  0.08   
14       10014   1000014  2020-10-13         48.0  5.666667  0.07   
16       10016   1000016  2012-05-26         60.0  8.500000  0.13   

    prostate_volume histopath_type lesion_GS lesion_ISUP  case_ISUP case_csPCa  
1             102.0            NaN       NaN         NaN          0         NO  
2              74.0            NaN       NaN         NaN          0         NO  
10             76.0            NaN       NaN         NaN          0         NO  
14             62.0            NaN       NaN         NaN          0         NO  
16             71.0            NaN       NaN         NaN          0         NO  
