# Custom Dataset

## In this tutorial, we provide an example of adapting USB to a custom dataset.

In [None]:
!pip install semilearn==0.3.1
!nvidia-smi

In [None]:
import numpy as np
from torchvision import transforms
from semilearn import get_data_loader, get_net_builder, get_algorithm, get_config, Trainer
from semilearn import split_ssl_data, BasicDataset

## Specify configs and define the model

In [None]:


# The settings are grouped by concern and then merged, in order, into the single
# flat dictionary that get_config expects.

# Algorithm and backbone selection.
model_settings = {
    'algorithm': 'flexmatch',
    'net': 'vit_tiny_patch2_32',
    'use_pretrain': True,
    'pretrain_path': 'https://github.com/microsoft/Semi-supervised-learning/releases/download/v.0.0.0/vit_tiny_patch2_32_mlp_im_1k_32.pth',
}

# Optimization schedule and batch sizes.
optimization_settings = {
    'epoch': 1,
    'num_train_iter': 5000,
    'num_eval_iter': 500,
    'num_log_iter': 50,
    'optim': 'AdamW',
    'lr': 5e-4,
    'layer_decay': 0.5,
    'batch_size': 16,
    'eval_batch_size': 16,
}

# Dataset description. num_labels is passed to split_ssl_data and num_classes to
# BasicDataset further down in this notebook.
# NOTE(review): 'dataset' is still 'cifar10' although the data comes from a custom
# JSON file - confirm nothing downstream keys off this name.
dataset_settings = {
    'dataset': 'cifar10',
    'num_labels': 450,
    'num_classes': 2,
    'img_size': 32,
    'crop_ratio': 0.875,
    'data_dir': './data',
}

# Algorithm-specific knobs. uratio scales the unlabeled batch size relative to
# batch_size when the data loaders are created.
algorithm_settings = {
    'hard_label': True,
    'uratio': 2,
    'ulb_loss_ratio': 1.0,
}

# Device / process layout: one process, GPU index 0, no distributed training.
device_settings = {
    'gpu': 0,
    'world_size': 1,
    'distributed': False,
    "num_workers": 2,
}

config = get_config({
    **model_settings,
    **optimization_settings,
    **dataset_settings,
    **algorithm_settings,
    **device_settings,
})



In [None]:
# Look up the backbone constructor named in the config, then instantiate the
# configured SSL algorithm around it (no TensorBoard logger, no text logger).
net_builder = get_net_builder(config.net, from_name=False)
algorithm = get_algorithm(config, net_builder, tb_log=None, logger=None)

## Create dataset

In [None]:
import json
import pandas as pd

# Parse the JSON export of the custom dataset into plain Python objects.
with open("training_data.json", "r") as json_file:
    data = json.loads(json_file.read())

# Tabulate the parsed records; later cells read the "image" and "stables" columns.
df = pd.DataFrame(data)

In [None]:
def _as_image_batch(pixels):
    """Turn one row's pixel values into an array of shape (-1, 32, 32, 3)."""
    return np.array(pixels).reshape((-1, 32, 32, 3))


# Each row becomes a (1, 32, 32, 3) array; the squeeze below relies on that leading 1,
# i.e. on every row holding exactly one 32x32 RGB image.
df["image"] = df["image"].apply(_as_image_batch)

# Stack the per-row arrays into (N, 1, 32, 32, 3) and drop the singleton axis,
# leaving one array of shape (N, 32, 32, 3) with N = number of rows.
images_array = np.stack(df["image"].values).squeeze(axis=1)

In [None]:
# Take the labels from the "stables" column as the column's underlying array, and
# display its Python type as a quick sanity check (this shows the type, not the shape).
label_column = df["stables"]
target = label_column.values
type(target)

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    images_array,
    target,
    test_size=0.2,
    random_state=42,
)
train_images, eval_images, train_labels, eval_labels = split_parts

In [None]:
# Build the labeled / unlabeled training datasets for semi-supervised learning.
#
# split_ssl_data partitions the training images into a labeled subset (config.num_labels
# samples in total) and an unlabeled subset. The class count is taken from the config
# instead of being repeated as a literal, so it cannot drift from what BasicDataset gets.
lb_data, lb_target, ulb_data, ulb_target = split_ssl_data(config, np.uint8(train_images), train_labels, config.num_classes,
                                                          config.num_labels, include_lb_to_ulb=config.include_lb_to_ulb)

# Weak augmentation: random flip plus a padded random crop, then scale to [-1, 1].
train_transform = transforms.Compose([transforms.RandomHorizontalFlip(),
                                      transforms.RandomCrop(32, padding=int(32 * 0.125), padding_mode='reflect'),
                                      transforms.ToTensor(),
                                      transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])

# NOTE(review): this "strong" pipeline is currently identical to the weak one above.
# Consistency-based methods usually pair the weak view with a heavier augmentation
# (e.g. a RandAugment step) - consider adding one here.
train_strong_transform = transforms.Compose([transforms.RandomHorizontalFlip(),
                                             transforms.RandomCrop(32, padding=int(32 * 0.125), padding_mode='reflect'),
                                             transforms.ToTensor(),
                                             transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])

lb_dataset = BasicDataset(config.algorithm, lb_data, lb_target, config.num_classes, train_transform, is_ulb=False)
# The unlabeled dataset must wrap the *unlabeled* split; it was previously built from
# lb_data / lb_target, which left ulb_data unused and trained on the labeled samples only.
ulb_dataset = BasicDataset(config.algorithm, ulb_data, ulb_target, config.num_classes, train_transform, is_ulb=True, strong_transform=train_strong_transform)

In [None]:
# Evaluation pipeline: deterministic resize, tensor conversion and the same
# mean/std normalization as training - no random augmentation.
eval_steps = [
    transforms.Resize(32),
    transforms.ToTensor(),
    transforms.Normalize([0.5] * 3, [0.5] * 3),
]
eval_transform = transforms.Compose(eval_steps)

# Wrap the held-out images (cast to uint8) and their labels as a labeled dataset.
eval_pixels = np.uint8(eval_images)
eval_dataset = BasicDataset(config.algorithm, eval_pixels, eval_labels, config.num_classes, eval_transform, is_ulb=False)

In [None]:
# The unlabeled loader uses a batch `uratio` times the size of the labeled batch.
ulb_batch_size = int(config.batch_size * config.uratio)

# One loader per dataset: labeled train, unlabeled train, and evaluation.
train_lb_loader = get_data_loader(config, lb_dataset, config.batch_size)
train_ulb_loader = get_data_loader(config, ulb_dataset, ulb_batch_size)
eval_loader = get_data_loader(config, eval_dataset, config.eval_batch_size)

## Training and evaluation

In [None]:
# Training and evaluation.
# Wrap the configured algorithm in a Trainer, fit it on the labeled and unlabeled
# loaders (the eval loader is handed to fit as well), then run a final evaluation pass.
# As the last expression in the cell, the return value of evaluate() is shown as output.
trainer = Trainer(config, algorithm)
trainer.fit(train_lb_loader, train_ulb_loader, eval_loader)
trainer.evaluate(eval_loader)

In [None]:
  trainer  # bare expression: shows the Trainer object's repr as the cell output