## Test Balancer
##### Test script to balance dataset

In [217]:
import pandas as pd
from pathlib import Path
from sklearn.utils import shuffle
# Locate metadata.csv in the parent of the current working directory and load it
META = Path(".").resolve().parent / "metadata.csv"
df = pd.read_csv(META)

In [218]:
# List the column names available in the loaded metadata
df.columns

Index(['patientid', 'sex', 'age', 'finding', 'modality', 'filename'], dtype='object')

In [219]:
def ViewFindingBalance(frame=None):
    """Print the class balance of the ``finding`` column, overall and per modality.

    Three tables of value counts are printed: one for every row, one for rows
    whose ``modality`` is ``'X-ray'`` (labelled CXR) and one for rows whose
    ``modality`` is ``'CT'`` (labelled CT).

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame with ``finding`` and ``modality`` columns. When omitted, the
        notebook-level ``df`` is used, so existing no-argument calls still work.

    Returns
    -------
    None
    """
    data = df if frame is None else frame
    print(f"Overall Balance:\n{data['finding'].value_counts()}\n")
    print(f"Balance for CXR:\n{data.loc[data['modality'] == 'X-ray', 'finding'].value_counts()}\n")
    # This section reports the CT rows; it was previously mislabelled as CXR.
    print(f"Balance for CT:\n{data.loc[data['modality'] == 'CT', 'finding'].value_counts()}")

##### Undersample the dataset based on the findings, i.e. COVID-19 / NON-COVID

In [220]:
# View the class balance of the metadata before any resampling
ViewFindingBalance()

Overall Balance:
NON-COVID    56068
COVID-19     38553
Name: finding, dtype: int64

Balance for CXR:
NON-COVID    532
COVID-19     287
Name: finding, dtype: int64

Balance for CXR:
NON-COVID    55536
COVID-19     38266
Name: finding, dtype: int64


In [221]:
# Undersample the majority class down to the size of the smallest class
g = df.groupby("finding")
# Compute the smallest class size once instead of once per group inside the lambda
n_per_class = g.size().min()
# A fixed random_state makes the selected rows identical on every run
df = g.apply(lambda x: x.sample(n_per_class, random_state=42))

In [222]:
# Check that both classes now have the same number of rows
ViewFindingBalance()

Overall Balance:
COVID-19     38553
NON-COVID    38553
Name: finding, dtype: int64

Balance for CXR:
NON-COVID    357
COVID-19     287
Name: finding, dtype: int64

Balance for CXR:
COVID-19     38266
NON-COVID    38196
Name: finding, dtype: int64


In [223]:
# Display the frame to check that the rows are grouped by finding after the groupby/apply
df

Unnamed: 0_level_0,Unnamed: 1_level_0,patientid,sex,age,finding,modality,filename
finding,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
COVID-19,78507,volume-covid19-A-0196_0,Unknown,Unknown,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
COVID-19,14003,NCP_32,Unknown,Unknown,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
COVID-19,21939,NCP_446,M,51,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
COVID-19,8432,NCP_233,M,0,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
COVID-19,23214,NCP_48,Unknown,Unknown,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
...,...,...,...,...,...,...,...
NON-COVID,63136,Normal_3911,Unknown,Unknown,NON-COVID,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
NON-COVID,39371,Normal_1711,Unknown,Unknown,NON-COVID,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
NON-COVID,62098,Normal_3905,Unknown,Unknown,NON-COVID,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
NON-COVID,74283,Normal_799,Unknown,Unknown,NON-COVID,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...


In [224]:
# Drop the group index left behind by groupby/apply (drop=True discards it rather than
# turning it into columns) so the frame has a plain integer index before shuffling
df = df.reset_index(drop=True)
df

Unnamed: 0,patientid,sex,age,finding,modality,filename
0,volume-covid19-A-0196_0,Unknown,Unknown,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
1,NCP_32,Unknown,Unknown,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
2,NCP_446,M,51,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
3,NCP_233,M,0,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
4,NCP_48,Unknown,Unknown,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
...,...,...,...,...,...,...
77101,Normal_3911,Unknown,Unknown,NON-COVID,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
77102,Normal_1711,Unknown,Unknown,NON-COVID,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
77103,Normal_3905,Unknown,Unknown,NON-COVID,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
77104,Normal_799,Unknown,Unknown,NON-COVID,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...


In [225]:
# Shuffle the rows so the two classes are interleaved; a fixed random_state keeps the
# resulting order reproducible across runs
df = shuffle(df, random_state=42)
df

Unnamed: 0,patientid,sex,age,finding,modality,filename
20072,NCP_3948,Unknown,Unknown,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
22675,NCP_408,F,54,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
74211,Normal_1670,Unknown,Unknown,NON-COVID,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
26867,volume-covid19-A-0586,Unknown,Unknown,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
70851,Normal_1752,Unknown,Unknown,NON-COVID,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
...,...,...,...,...,...,...
56236,Normal_1670,Unknown,Unknown,NON-COVID,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
73580,Normal_1735,Unknown,Unknown,NON-COVID,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
54939,patient307,M,26,NON-COVID,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
6727,NCP_457,Unknown,Unknown,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...


##### One-liner test

In [226]:
# Single-expression version of the steps above: undersample, drop the group index, shuffle.
# `g` is the groupby object created in the undersampling cell, before `df` was overwritten.
# Fixed random_state values keep both the sampling and the shuffle reproducible.
df = shuffle(
    g.apply(lambda x: x.sample(g.size().min(), random_state=42)).reset_index(drop=True),
    random_state=42,
)
df

Unnamed: 0,patientid,sex,age,finding,modality,filename
10214,patient145,M,56,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
30906,NCP_336,Unknown,Unknown,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
15092,NCP_408,F,54,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
62471,Normal_787,Unknown,Unknown,NON-COVID,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
6169,patient124,F,64,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
...,...,...,...,...,...,...
8377,volume-covid19-A-0392,Unknown,Unknown,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
9031,patient97,M,42,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
14235,NCP_392,Unknown,Unknown,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
25763,NCP_326,F,69,COVID-19,CT,C:\Users\Kyle\Desktop\RGU Coursework\Thesis\Ky...
