In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import os
import cv2
import pandas as pd
from torch.utils.data import Dataset
from torchvision.transforms import v2 as transforms
from src.utils.loaders import load_image_dataset, load_images
from sklearn.utils import resample
from PIL import Image
from src.dataset import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the preprocessed training frame and inspect the raw GROUP labels:
# first the distinct ids (printed), then the number of rows per id (cell output).
df = pd.read_feather(path="../processed_data/train_df.feather")
print(df["GROUP"].unique())
df["GROUP"].value_counts()

[4 6 1 3 5 0 2 7 8 9]


GROUP
4    11955
3     8897
5     6455
1     4034
2      604
6      546
0      538
7       87
8        9
9        1
Name: count, dtype: int64

In [4]:
# Apply the project helper `merge_groups` to the frame, then show the row count
# of every GROUP that remains (in the recorded run, ten ids collapse to six).
# NOTE: a bare `df["GROUP"].unique()` used to sit between these two lines; only a
# cell's last expression is displayed, so its result was silently discarded.
# `value_counts()` already lists every remaining GROUP id, so it was removed.
df = merge_groups(df)
df["GROUP"].value_counts()

GROUP
4    11955
3     8897
5     6455
2     4638
6      643
1      538
Name: count, dtype: int64

In [5]:
# Row count per TARGET value. In the recorded run the label is heavily skewed:
# 584 rows with TARGET == 1 against 32542 with TARGET == 0 (about 1.8% positive).
df["TARGET"].value_counts()

TARGET
0    32542
1      584
Name: count, dtype: int64

In [6]:
# Per-GROUP class summary before any rebalancing:
#   1. count rows for every (GROUP, TARGET) pair,
#   2. pivot so each GROUP is one row with one column per TARGET value
#      (pairs that never occur become 0),
#   3. label the two TARGET columns NEGATIVE / POSITIVE (positional: assumes
#      exactly two TARGET values, sorted as 0 then 1),
#   4. add RATIO = POSITIVE / (POSITIVE + NEGATIVE).
grouped_train_df = (
    df.assign(COUNT=1)
    .groupby(["GROUP", "TARGET"], as_index=False)["COUNT"]
    .sum()
    .sort_values(["GROUP", "TARGET"])
    .pivot(index="GROUP", columns="TARGET", values="COUNT")
    .reset_index()
    .fillna(0)
    .set_axis(["GROUP", "NEGATIVE", "POSITIVE"], axis=1)
    .assign(RATIO=lambda counts: counts["POSITIVE"] / (counts["POSITIVE"] + counts["NEGATIVE"]))
)
grouped_train_df

Unnamed: 0,GROUP,NEGATIVE,POSITIVE,RATIO
0,1,529,9,0.016729
1,2,4612,26,0.005606
2,3,8858,39,0.004384
3,4,11658,297,0.024843
4,5,6268,187,0.02897
5,6,617,26,0.040435


In [7]:
# Run the project helper `balance_by_target_per_group` on the frame and show the
# resulting TARGET counts. In the recorded run the positives grow from 584 to 6424
# while the negatives stay at 32542.
df = df.pipe(balance_by_target_per_group)
df["TARGET"].value_counts()

TARGET
0    32542
1     6424
Name: count, dtype: int64

In [8]:
# Per-GROUP class summary of the current `df`: count rows per (GROUP, TARGET),
# pivot to one row per GROUP (missing pairs become 0), label the two TARGET
# columns NEGATIVE / POSITIVE (positional: assumes TARGET values 0 then 1),
# and add RATIO = POSITIVE / (POSITIVE + NEGATIVE).
grouped_train_df = (
    df.assign(COUNT=1)
    .groupby(["GROUP", "TARGET"], as_index=False)["COUNT"]
    .sum()
    .sort_values(["GROUP", "TARGET"])
    .pivot(index="GROUP", columns="TARGET", values="COUNT")
    .reset_index()
    .fillna(0)
    .set_axis(["GROUP", "NEGATIVE", "POSITIVE"], axis=1)
    .assign(RATIO=lambda counts: counts["POSITIVE"] / (counts["POSITIVE"] + counts["NEGATIVE"]))
)
grouped_train_df

Unnamed: 0,GROUP,NEGATIVE,POSITIVE,RATIO
0,1,529,99,0.157643
1,2,4612,286,0.058391
2,3,8858,429,0.046194
3,4,11658,3267,0.218894
4,5,6268,2057,0.247087
5,6,617,286,0.316722


In [9]:
# Run the project helper `balance_by_group` on the frame and show the rows per
# GROUP afterwards. In the recorded run every group ends up with 14925 rows.
df = df.pipe(balance_by_group)
df["GROUP"].value_counts()

GROUP
4    14925
3    14925
5    14925
2    14925
6    14925
1    14925
Name: count, dtype: int64

In [10]:
# Per-GROUP class summary of the current `df`: count rows per (GROUP, TARGET),
# pivot to one row per GROUP (missing pairs become 0), label the two TARGET
# columns NEGATIVE / POSITIVE (positional: assumes TARGET values 0 then 1),
# and add RATIO = POSITIVE / (POSITIVE + NEGATIVE).
grouped_train_df = (
    df.assign(COUNT=1)
    .groupby(["GROUP", "TARGET"], as_index=False)["COUNT"]
    .sum()
    .sort_values(["GROUP", "TARGET"])
    .pivot(index="GROUP", columns="TARGET", values="COUNT")
    .reset_index()
    .fillna(0)
    .set_axis(["GROUP", "NEGATIVE", "POSITIVE"], axis=1)
    .assign(RATIO=lambda counts: counts["POSITIVE"] / (counts["POSITIVE"] + counts["NEGATIVE"]))
)
grouped_train_df

Unnamed: 0,GROUP,NEGATIVE,POSITIVE,RATIO
0,1,12585,2340,0.156784
1,2,14057,868,0.058157
2,3,14257,668,0.044757
3,4,11658,3267,0.218894
4,5,11188,3737,0.250385
5,6,10215,4710,0.315578


In [11]:
# Persist the current (rebalanced) frame as a Feather file next to the original
# training data so later steps can load it without repeating the balancing.
balanced_dataset_path = "../processed_data/train_df_balanced.feather"
df.to_feather(path=balanced_dataset_path)

In [12]:
# Start over from the original training frame (discarding whatever `df` held
# before) and re-apply `merge_groups`, so the next experiment begins from the
# unbalanced data.
df = pd.read_feather(path="../processed_data/train_df.feather").pipe(merge_groups)

# Per-GROUP class summary: count rows per (GROUP, TARGET), pivot to one row per
# GROUP (missing pairs become 0), label the two TARGET columns NEGATIVE /
# POSITIVE (positional: assumes TARGET values 0 then 1), and add
# RATIO = POSITIVE / (POSITIVE + NEGATIVE).
grouped_train_df = (
    df.assign(COUNT=1)
    .groupby(["GROUP", "TARGET"], as_index=False)["COUNT"]
    .sum()
    .sort_values(["GROUP", "TARGET"])
    .pivot(index="GROUP", columns="TARGET", values="COUNT")
    .reset_index()
    .fillna(0)
    .set_axis(["GROUP", "NEGATIVE", "POSITIVE"], axis=1)
    .assign(RATIO=lambda counts: counts["POSITIVE"] / (counts["POSITIVE"] + counts["NEGATIVE"]))
)
grouped_train_df

Unnamed: 0,GROUP,NEGATIVE,POSITIVE,RATIO
0,1,529,9,0.016729
1,2,4612,26,0.005606
2,3,8858,39,0.004384
3,4,11658,297,0.024843
4,5,6268,187,0.02897
5,6,617,26,0.040435


In [13]:
# Run the project helper `downsample_by_group` on the frame and show the rows
# per GROUP afterwards. In the recorded run each group shrinks to between 538
# and 826 rows.
df = df.pipe(downsample_by_group)
df["GROUP"].value_counts()

GROUP
4    826
5    716
3    568
2    555
6    555
1    538
Name: count, dtype: int64

In [14]:
# Per-GROUP class summary of the current `df`: count rows per (GROUP, TARGET),
# pivot to one row per GROUP (missing pairs become 0), label the two TARGET
# columns NEGATIVE / POSITIVE (positional: assumes TARGET values 0 then 1),
# and add RATIO = POSITIVE / (POSITIVE + NEGATIVE).
grouped_train_df = (
    df.assign(COUNT=1)
    .groupby(["GROUP", "TARGET"], as_index=False)["COUNT"]
    .sum()
    .sort_values(["GROUP", "TARGET"])
    .pivot(index="GROUP", columns="TARGET", values="COUNT")
    .reset_index()
    .fillna(0)
    .set_axis(["GROUP", "NEGATIVE", "POSITIVE"], axis=1)
    .assign(RATIO=lambda counts: counts["POSITIVE"] / (counts["POSITIVE"] + counts["NEGATIVE"]))
)
grouped_train_df

Unnamed: 0,GROUP,NEGATIVE,POSITIVE,RATIO
0,1,529,9,0.016729
1,2,529,26,0.046847
2,3,529,39,0.068662
3,4,529,297,0.359564
4,5,529,187,0.261173
5,6,529,26,0.046847
