<a href="https://colab.research.google.com/github/jtlai0921/-/blob/master/imbalanced_data_class_weight.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the dataset folders read later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from glob import glob
import numpy as np
import os
import cv2
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# Side length (in pixels) of the square images fed to the network.
IMG_SIZE = 128

# Class labels in id order: a label's position in this list is its integer id.
class_names = ['normal', 'bacteria', 'virus']
# Lookup from label name to integer id.
cls_map = dict(zip(class_names, range(len(class_names))))
print(cls_map)

{'normal': 0, 'bacteria': 1, 'virus': 2}


In [5]:
def read_paths(folder_path, class_map=None):
    """Index the files in a folder and derive each file's class id from its name.

    The class name is taken to be the text after the last underscore of the
    file name, with the extension removed (e.g. ``scan01_virus.jpeg`` ->
    ``virus``), and is translated to an integer id through ``class_map``.

    Args:
        folder_path: Directory whose direct children are the image files.
        class_map: Optional mapping from class name to integer id. Defaults to
            the notebook-level ``cls_map``.

    Returns:
        A DataFrame with one row per file and two columns: ``path`` (file
        path) and ``cls`` (integer class id), ordered by path.

    Raises:
        KeyError: If a file name does not end in a class name known to
            ``class_map``.
    """
    if class_map is None:
        class_map = cls_map
    # glob returns entries in filesystem order; sort so row order is reproducible.
    file_paths = sorted(glob(os.path.join(folder_path, '*')))
    classes = []
    for path in file_paths:
        # basename/splitext cope with any path separator and with extra dots
        # inside the file name, unlike splitting on '/' and '.'.
        stem = os.path.splitext(os.path.basename(path))[0]
        cls_name = stem.rsplit('_', 1)[-1]
        try:
            classes.append(class_map[cls_name])
        except KeyError:
            raise KeyError(
                f'cannot infer class of {path!r}: unknown class name {cls_name!r}'
            ) from None
    return pd.DataFrame({'path': file_paths, 'cls': classes})
# Build the path/label tables for the training and validation splits.
df_train, df_val = (
    read_paths(split_folder)
    for split_folder in (
        '/content/drive/MyDrive/imbalanced data/pneumonia/train',
        '/content/drive/MyDrive/imbalanced data/pneumonia/val',
    )
)

In [6]:
# Per-class sample counts for each split, to show how imbalanced the data is.
train_counts = df_train.cls.value_counts()
val_counts = df_val.cls.value_counts()
print(train_counts, '\n', val_counts)

0    100
1     33
2     17
Name: cls, dtype: int64 
 0    50
1    28
2    22
Name: cls, dtype: int64


In [7]:
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that loads images from disk and yields labelled batches.

    Each batch is a tuple ``(x, y)``:
      * ``x``: float array of shape ``(batch, img_size, img_size, 3)`` holding
        the image read by ``cv2.imread`` (OpenCV channel order), divided by 255
        and resized to a square.
      * ``y``: one-hot labels of shape ``(batch, len(class_names))``.

    Args:
        df: DataFrame with a ``path`` column (image file path) and a ``cls``
            column (integer class id). A private copy is kept.
        bs: Batch size; the last batch of an epoch may be smaller.
        img_size: Side length, in pixels, that images are resized to.
        shuffle: If True, the sample order is reshuffled at construction time
            and every time ``on_epoch_end`` is called.
    """

    def __init__(self, df, bs, img_size, shuffle=True):
        # Newer Keras releases expect Sequence subclasses to initialise the
        # base class; on older releases this is a harmless no-op.
        super().__init__()
        self.df = df.copy()
        self.bs = bs
        self.img_size = img_size
        self.shuffle = shuffle
        # Creates self.indexes (and shuffles it when requested).
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (the last one may be partial)."""
        return int(np.ceil(len(self.df)/self.bs))

    def __getitem__(self, index):
        """Load and return batch number ``index`` as ``(x, y)``.

        Raises:
            OSError: If an image file is missing or cannot be decoded.
        """
        batch_idxs = self.indexes[index*self.bs: (index+1)*self.bs]
        x = np.empty((len(batch_idxs), self.img_size, self.img_size, 3))
        y = np.empty((len(batch_idxs)))
        for i, df_index in enumerate(batch_idxs):
            row = self.df.iloc[df_index, :]
            path = row.path
            label = row.cls
            # img
            img = cv2.imread(path)
            if img is None:
                # cv2.imread signals failure by returning None instead of
                # raising; fail here with the offending path rather than with
                # an opaque TypeError on the division below.
                raise OSError(f'could not read image (missing or undecodable): {path!r}')
            img = img / 255.
            img = cv2.resize(img, (self.img_size, self.img_size))
            x[i] = img
            # label
            y[i] = label
        y = keras.utils.to_categorical(y, num_classes=len(class_names))
        return x, y

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when ``shuffle`` is enabled."""
        self.indexes = np.arange(len(self.df))
        if self.shuffle:
            np.random.shuffle(self.indexes)

In [8]:
# Batch generators for both splits: batch size 8, images resized to IMG_SIZE.
# The training generator is constructed first, then the validation one.
gen_train, gen_val = (
    DataGenerator(df=split_df, bs=8, img_size=IMG_SIZE)
    for split_df in (df_train, df_val)
)

In [10]:
# Small CNN: three stages of two 3x3 'same'-padded ReLU convolutions followed by
# max pooling (16 -> 32 -> 64 filters), then global average pooling and a
# softmax classification head.
conv_blocks = []
for n_filters in (16, 32, 64):
    conv_blocks += [
        keras.layers.Conv2D(n_filters, 3, activation='relu', padding='same'),
        keras.layers.Conv2D(n_filters, 3, activation='relu', padding='same'),
        keras.layers.MaxPooling2D(),
    ]

model = keras.models.Sequential([
    keras.layers.Input((IMG_SIZE, IMG_SIZE, 3)),
    *conv_blocks,
    keras.layers.GlobalAveragePooling2D(),
    # One output unit per class, derived from class_names so the head always
    # matches the width of the one-hot labels instead of a hard-coded 3.
    keras.layers.Dense(len(class_names), activation='softmax'),
])

In [11]:
# Print the layer-by-layer output shapes and parameter counts as a sanity check.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 128, 128, 16)      448       
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 128, 128, 16)      2320      
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 64, 64, 16)        0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 64, 64, 32)        4640      
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 64, 64, 32)        9248      
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 32, 32, 32)        0         
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 32, 32, 64)       

In [12]:
# One-hot targets with a softmax output: categorical cross-entropy loss,
# monitored with categorical accuracy.
adam = keras.optimizers.Adam()  # library-default hyper-parameters
model.compile(
    optimizer=adam,
    loss=keras.losses.categorical_crossentropy,
    metrics=[keras.metrics.categorical_accuracy],
)

In [13]:
# Class Weight Calculation
# "Balanced" weighting: weight_c = n_samples / (n_classes * n_samples_in_c), so
# the rarer a class is, the more each of its samples contributes to the loss.
present_classes, data_count = np.unique(df_train.cls, return_counts=True)
print('data_count: ', data_count)
# Divide by the number of classes actually found instead of a hard-coded 3.
weights = (1/data_count)*np.sum(data_count)/len(data_count)
# Key each weight by its real class id (np.unique returns ids and counts in the
# same sorted order) rather than by position, so a class id that is absent from
# the training set cannot shift the weights onto the wrong classes.
class_weight = {int(c): w for c, w in zip(present_classes, weights)}
print('class_weight', class_weight)

data_count:  [100  33  17]
class_weight {0: 0.5, 1: 1.5151515151515154, 2: 2.9411764705882355}


In [14]:
# Stop training after 10 epochs without improvement, and keep only the best
# model seen so far on disk.
early_stop = keras.callbacks.EarlyStopping(patience=10)
checkpoint = keras.callbacks.ModelCheckpoint('cls_weight_model.h5', save_best_only=True)

# The epoch count is only an upper bound; early stopping ends the run.
model.fit(
    gen_train,
    epochs=10000,
    validation_data=gen_val,
    callbacks=[early_stop, checkpoint],
    class_weight=class_weight,  # Add class weight
)

Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000


<tensorflow.python.keras.callbacks.History at 0x7fb06039cf60>

In [15]:
def cls_report(path, generator=None):
    """Evaluate a saved model on a batch generator and print per-class metrics.

    Prints scikit-learn's classification report followed by the confusion
    matrix (rows: true class, columns: predicted class).

    Args:
        path: File path of the saved Keras model to load.
        generator: Optional batch source supporting ``len()`` and integer
            indexing, where each item is ``(x, y)`` with one-hot ``y``.
            Defaults to the notebook-level ``gen_val``.

    Raises:
        ValueError: If the generator contains no batches.
    """
    if generator is None:
        generator = gen_val
    # Distinct local name so the notebook-level `model` is not shadowed.
    loaded_model = keras.models.load_model(path)

    pred_batches = []
    true_batches = []
    # Index explicitly so the loop is bounded by len(generator) and does not
    # depend on how the generator implements iteration.
    for batch_index in range(len(generator)):
        x, y = generator[batch_index]
        pred_batches.append(loaded_model.predict(x))
        true_batches.append(y)
    if not pred_batches:
        raise ValueError('generator contains no batches to evaluate')

    # Concatenate once at the end (no per-batch re-allocation, no hard-coded
    # class count), then turn probabilities / one-hot rows into class ids.
    y_pred = np.argmax(np.concatenate(pred_batches), axis=-1)
    y_val = np.argmax(np.concatenate(true_batches), axis=-1)

    print(classification_report(y_val, y_pred))
    print(confusion_matrix(y_val, y_pred))

cls_report('cls_weight_model.h5')

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        50
           1       0.54      0.54      0.54        28
           2       0.43      0.41      0.42        22

    accuracy                           0.74       100
   macro avg       0.65      0.65      0.65       100
weighted avg       0.73      0.74      0.74       100

[[50  0  0]
 [ 1 15 12]
 [ 0 13  9]]
