# Dataset Cleaning

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/mbarbetti/lymphoma-classification/blob/master/0_dataset_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  </td>
  <td>
    <a target="_blank" href="https://kaggle.com/kernels/welcome?src=https://github.com/mbarbetti/lymphoma-classification/blob/master/0_dataset_cleaning.ipynb"><img src="https://kaggle.com/static/images/open-in-kaggle.svg" /></a>
  </td>
</table>

In [1]:
import os
import pandas as pd

# Where the raw database lives: a semicolon-separated CSV inside `data_dir`.
data_dir = "./data"
data_file = "Database_Emato_mediastinalbulky_anonimized.csv"
file_path = os.path.join(data_dir, data_file)

# The first line of the file is used as the header row (column names).
data = pd.read_csv(
    file_path,
    sep=";",
    header=0,
)

## Data adjustment

- empty rows at the end of the database removed
- columns whose content is unclear dropped:
  - `PARAMS_IntensityResampling`
  - `PARAMS_BoundsRangeOfValueAfterDiscretisation(SUVbw)`
  - `CHECK_Cluster(s)ToSmall`
- date and floating-point number formats fixed
- age information recovered (year of `dPET staging` minus year of `Data nascita`)

In [2]:
# A row without an "ID" value is treated as an empty row and removed.
# The frame is modified in place, so `data` keeps referring to the same object.
data.dropna(axis=0, subset=["ID"], inplace=True)

In [3]:
# Columns whose content could not be interpreted; they are discarded.
cols_to_drop = [
    "PARAMS_IntensityResampling",
    "PARAMS_BoundsRangeOfValueAfterDiscretisation(SUVbw)",
    "CHECK_Cluster(s)ToSmall",
]

data = data.drop(columns=cols_to_drop)

In [4]:
from utils import data_cleaning

# Step 1: normalise the two date columns (birth date and staging PET date).
# `data_cleaning` comes from the project-local `utils` module and is asked to
# work in place on `data`.
date_columns = ("Data nascita", "dPET staging")
for date_col in date_columns:
    data_cleaning(data, col_name=date_col, target_type="date", inplace=True)

# Step 2: normalise every column from the third one onwards as float.
# NOTE(review): the slice is positional -- presumably the first two columns are
# the ID and the label; confirm against the CSV layout.
float_columns = data.columns[2:].tolist()
for float_col in float_columns:
    data_cleaning(data, col_name=float_col, target_type="float", inplace=True)

In [5]:
# Age at staging, computed as the difference between the calendar year of the
# staging PET scan and the calendar year of birth (months and days are ignored,
# so the value can exceed the completed years by one).
#
# The two columns are paired positionally rather than looked up by index label,
# so this works even when the row index is not exactly 0..len(data)-1
# (e.g. after rows have been dropped without resetting the index).
age = [
    staging_date.year - birth_date.year
    for staging_date, birth_date in zip(data["dPET staging"], data["Data nascita"])
]

data["age"] = age   # add age information

# The raw dates are not needed once the age has been derived.
data = data.drop(columns=["Data nascita", "dPET staging"])


## Data renaming

- label column renamed to `lymphoma_type`
  - `lymphoma_type = 1.0` --> HL
  - `lymphoma_type = 2.0` --> GZ
  - `lymphoma_type = 3.0` --> PML
- `age` feature moved to the third column

In [8]:
# Give the label column (second position) its final name.
renamed_cols = data.columns.tolist()
renamed_cols[1] = "lymphoma_type"
data.columns = renamed_cols

# Build the new column order: pull "age" out of its current position and
# re-insert it as the third column.
ordered_cols = list(renamed_cols)
ordered_cols.remove("age")
ordered_cols.insert(2, "age")

data = data[ordered_cols]

## NaN filter

**Result:** Each NaN value is replaced by the mean of its column.

In [12]:
# Replace every NaN with the mean of the column it belongs to.
#
# The means are collected first and applied with a single DataFrame-level
# `fillna`. Calling `fillna(..., inplace=True)` on a column selection
# (`data[col]`) is chained assignment: it is deprecated in recent pandas and
# leaves `data` unchanged once Copy-on-Write is enabled.
column_means = {col: data[col].mean() for col in data.columns}
data = data.fillna(value=column_means)

## Data export

In [13]:
# Write the cleaned table as a pickle file inside `data_dir`.
exp_data_file = "db_mediastinalbulky_cleaned.pkl"
exp_file_path = os.path.join(data_dir, exp_data_file)

data.to_pickle(exp_file_path)