<a href="https://colab.research.google.com/github/itdusty/blood_cells_classification/blob/main/data_preparation_module.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data preparation
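This module reorganizes the dataset directory layout from `dataset/{video_name}/{frame_name}` to `prepared_dataset/{class}/{video_name}/{frame_name}`, where `{class}` is derived from the `high_erythrocytes` / `high_lymphocytes` flags in `DataFrame.csv`.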

In [None]:
# Mount Google Drive under /content/drive so the dataset stored there is reachable as ordinary files.
# force_remount=True asks Colab to mount again even if a mount is already present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import os
import shutil
import pandas as pd
import numpy as np

In [None]:
# Project root on the mounted Drive; the dataset folders and DataFrame.csv are addressed relative to it.
working_dir = "/content/drive/MyDrive/Cells classification"

In [None]:
# Make the project root the current directory so the relative paths used below resolve inside it.
os.chdir(working_dir)
# Last expression: shown as the cell output to confirm where we are.
os.getcwd()

'/content/drive/MyDrive/Cells classification'

In [None]:
# Source folder: one sub-folder per video (dataset/{video_name}/...).
old_dataset = 'dataset'
# Destination folder: videos regrouped by class (prepared_dataset/{class}/{video_name}/...).
new_dataset = 'prepared_dataset'

In [None]:
# Per-video annotations: 'files' holds the video folder name, and the
# 'high_erythrocytes' / 'high_lymphocytes' flags are what the class assignment below reads.
# NOTE(review): the displayed frame carries 'Unnamed: 0.1' and 'Unnamed: 0' columns —
# presumably index columns written by earlier to_csv calls; confirm and drop if unused.
dataframe = pd.read_csv('DataFrame.csv')
dataframe

Unnamed: 0.1,Unnamed: 0,files,erythrocytes,lymphocytes,high_erythrocytes,high_lymphocytes,blur,noise
0,0,traffic1,9,9,0,1,0,0
1,1,traffic2,13,3,1,0,0,0
2,2,traffic3,17,11,1,1,0,0
3,3,traffic4,1,5,0,0,0,0
4,4,traffic5,14,8,1,1,0,0


In [None]:
# Remove the output tree left by a previous run so the copy step starts from scratch.
# Guarded with isdir: on a first run the folder does not exist yet and an unguarded
# shutil.rmtree would raise FileNotFoundError and stop "Run All".
if os.path.isdir(f"{working_dir}/{new_dataset}"):
    shutil.rmtree(f"{working_dir}/{new_dataset}")

In [None]:
# Create the output root together with one sub-folder per class.
# os.makedirs(..., exist_ok=True) creates whatever is missing and leaves existing folders
# alone, so an already-present (or half-created) root still ends up with all four class
# folders. No chdir round-trip is needed, so a failure cannot strand the working directory
# inside the output folder.
# The path is relative: this expects the current directory to be the project root.
for class_folder in (
    'high_rbc_high_wbc',
    'high_rbc_low_wbc',
    'low_rbc_high_wbc',
    'low_rbc_low_wbc',
):
    os.makedirs(os.path.join(new_dataset, class_folder), exist_ok=True)
# Last expression: shown as the cell output (the working directory is unchanged by this cell).
os.getcwd()

'/content/drive/MyDrive/Cells classification'

In [None]:
# Class folder for each (high_erythrocytes, high_lymphocytes) flag pair.
# Any pair not listed here falls back to 'low_rbc_low_wbc'.
class_by_flags = {
  (1, 1): 'high_rbc_high_wbc',
  (1, 0): 'high_rbc_low_wbc',
  (0, 1): 'low_rbc_high_wbc',
}

for row_idx, record in dataframe.iterrows():
  flags = (record['high_erythrocytes'], record['high_lymphocytes'])
  class_name = class_by_flags.get(flags, 'low_rbc_low_wbc')

  # Copy the whole video folder into its class folder, keeping the video name.
  video_name = record['files']
  source_dir = f"{working_dir}/{old_dataset}/{video_name}"
  target_dir = f"{working_dir}/{new_dataset}/{class_name}/{video_name}"
  shutil.copytree(src=source_dir, dst=target_dir)
  print(video_name)

traffic1
traffic2
traffic3
traffic4
traffic5


### Train/test split

This section splits the prepared dataset into `train` and `test` folders (one sub-folder per class in each) so that it can be loaded as a `LabeledVideoDataset` (`pytorchvideo.data`).

In [1]:
# Mount Google Drive under /content/drive for this section.
# NOTE(review): this repeats the mount from the data-preparation section; the execution
# counts restart here, so this part looks like it was run as a separate session — confirm.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np
import os
import shutil
from torch.utils.data import DataLoader

In [6]:
# Root of the class-grouped dataset; its immediate sub-folders are treated as the classes below.
dataset_path = "/content/drive/MyDrive/Cells classification/prepared_dataset"

In [7]:
os.chdir(dataset_path)
# Class folders, in a fixed order. os.listdir() returns entries in arbitrary order, and the
# integer label of each class is its position in this list, so it is sorted to keep the
# class -> label mapping identical across runs and machines. Non-directory entries are
# skipped because each entry is later entered as a class folder.
folders = sorted(entry for entry in os.listdir() if os.path.isdir(entry))
# Column-wise accumulator for the per-video table (one list per future DataFrame column).
files_list = {
    "file": [],
    "label": [],
    "class_name": []
}
# class folder name -> integer label
classes_codes = {}

In [8]:
# Empty the accumulators first so re-running this cell rebuilds the table instead of
# appending a second copy of every row.
for column in files_list.values():
    column.clear()
classes_codes.clear()

for i, folder in enumerate(folders):
    # List the class folder by absolute path instead of chdir-ing into it; the loop then
    # no longer depends on what the current directory happens to be.
    file_names = os.listdir(os.path.join(dataset_path, folder))
    files_list["file"] += file_names
    files_list["label"] += [i] * len(file_names)
    files_list["class_name"] += [folder] * len(file_names)
    classes_codes[folder] = i

# Leave the working directory at the parent of the dataset folder.
os.chdir(os.path.join(dataset_path, os.pardir))

In [9]:
# Display the class-folder -> integer-label mapping.
classes_codes

{'high_rbc_high_wbc': 0,
 'high_rbc_low_wbc': 1,
 'low_rbc_high_wbc': 2,
 'low_rbc_low_wbc': 3}

In [10]:
# One row per video folder: its name ('file'), integer label and class folder name.
df_for_split = pd.DataFrame(files_list)
df_for_split

Unnamed: 0,file,label,class_name
0,traffic3,0,high_rbc_high_wbc
1,traffic5,0,high_rbc_high_wbc
2,traffic10,0,high_rbc_high_wbc
3,traffic11,0,high_rbc_high_wbc
4,traffic13,0,high_rbc_high_wbc
...,...,...,...
995,traffic990,3,low_rbc_low_wbc
996,traffic993,3,low_rbc_low_wbc
997,traffic994,3,low_rbc_low_wbc
998,traffic997,3,low_rbc_low_wbc


In [11]:
# Fixed seed so the same videos land in train/test on every run.
split_seed = 42
# Fraction of the videos that goes to the training set; the rest is the test set.
ratio = 0.8

# sort_index() restores the original row order before shuffling, so re-running this cell
# always produces the same permutation instead of reshuffling an already-shuffled frame.
df_for_split = df_for_split.sort_index().sample(frac=1, random_state=split_seed)
train_size = int(df_for_split.shape[0] * ratio)
# Positional slices of the shuffled frame: first train_size rows -> train, remainder -> test.
train_data = df_for_split.iloc[:train_size]
test_data = df_for_split.iloc[train_size:]
assert len(train_data) + len(test_data) == len(df_for_split)
train_data, test_data

(           file  label        class_name
 690  traffic939      2  low_rbc_high_wbc
 293  traffic293      1  high_rbc_low_wbc
 810  traffic326      3   low_rbc_low_wbc
 237   traffic72      1  high_rbc_low_wbc
 400  traffic758      1  high_rbc_low_wbc
 ..          ...    ...               ...
 940  traffic802      3   low_rbc_low_wbc
 425  traffic859      1  high_rbc_low_wbc
 537  traffic371      2  low_rbc_high_wbc
 803  traffic315      3   low_rbc_low_wbc
 600  traffic600      2  low_rbc_high_wbc
 
 [800 rows x 3 columns],
            file  label         class_name
 21    traffic76      0  high_rbc_high_wbc
 468   traffic77      2   low_rbc_high_wbc
 147  traffic678      0  high_rbc_high_wbc
 444  traffic944      1   high_rbc_low_wbc
 440  traffic923      1   high_rbc_low_wbc
 ..          ...    ...                ...
 768  traffic212      3    low_rbc_low_wbc
 561  traffic475      2   low_rbc_high_wbc
 597  traffic588      2   low_rbc_high_wbc
 103  traffic486      0  high_rbc_high_

In [12]:
# Folder the videos are copied from (class-grouped layout).
old_dataset_name = "prepared_dataset"
# Folder the train/test layout is written to.
dataset_name = "cells_dataset"
# Project root on the mounted Drive that contains both folders.
path = "/content/drive/MyDrive/Cells classification"

In [14]:
# Work from the project root, then delete any output of a previous run.
# The isdir guard keeps a first run (no output folder yet) from raising.
os.chdir(path)
if os.path.isdir(dataset_name):
    shutil.rmtree(dataset_name)

In [15]:
def _copy_split(split_name, split_data):
    """Populate ``<path>/<dataset_name>/<split_name>/`` from the rows of ``split_data``.

    Prints the capitalized split name as a header, then the name of each video
    folder once it has been copied.

    Parameters
    ----------
    split_name : str
        Sub-folder to fill ("train" or "test").
    split_data : pandas.DataFrame
        Rows with a ``file`` column (video folder name) and a ``class_name``
        column (class folder the video lives in under ``old_dataset_name``).

    Raises
    ------
    FileExistsError
        Propagated from ``shutil.copytree`` when a destination video folder
        already exists.
    """
    split_root = f"{path}/{dataset_name}/{split_name}"
    # Create every class folder up front so a class with no videos in this split still
    # gets an (empty) folder. exist_ok=True also covers a split folder that already
    # exists but is missing some class folders.
    for folder in folders:
        os.makedirs(f"{split_root}/{folder}", exist_ok=True)

    print(split_name.capitalize())
    for record in split_data.itertuples(index=False):
        filename = record.file
        inner_folder = record.class_name
        shutil.copytree(
            f"{path}/{old_dataset_name}/{inner_folder}/{filename}",
            f"{split_root}/{inner_folder}/{filename}",
        )
        print(filename)


_copy_split("train", train_data)
_copy_split("test", test_data)

# Finish with the new dataset folder as the working directory.
os.chdir(f"{path}/{dataset_name}")


Train
traffic939
traffic293
traffic326
traffic72
traffic758
traffic687
traffic327
traffic253
traffic64
traffic788
traffic962
traffic179
traffic112
traffic241
traffic623
traffic768
traffic684
traffic97
traffic910
traffic543
traffic130
traffic598
traffic194
traffic156
traffic85
traffic849
traffic659
traffic911
traffic119
traffic447
traffic779
traffic858
traffic238
traffic149
traffic708
traffic786
traffic412
traffic227
traffic940
traffic883
traffic900
traffic844
traffic540
traffic276
traffic388
traffic822
traffic934
traffic40
traffic510
traffic363
traffic133
traffic341
traffic538
traffic818
traffic163
traffic422
traffic889
traffic188
traffic759
traffic545
traffic791
traffic888
traffic481
traffic328
traffic258
traffic583
traffic228
traffic31
traffic960
traffic597
traffic764
traffic246
traffic736
traffic69
traffic547
traffic308
traffic699
traffic807
traffic4
traffic869
traffic695
traffic294
traffic770
traffic280
traffic84
traffic956
traffic654
traffic542
traffic892
traffic307
traffic71
traf