In [1]:
import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, confusion_matrix, accuracy_score, f1_score
import os
import re
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.model_selection import train_test_split

2024-07-11 19:24:00.523653: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-11 19:24:01.079983: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _torch_pytree._register_pytree_node(


In [51]:
torch.cuda.set_device(0)  # Now this will refer to the third GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print cuda name
if torch.cuda.is_available():
    # Get the number of available GPUs
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")

    # Iterate over the available GPUs and print their names
    for i in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(i)
        print(f"GPU {i}: {gpu_name}")
else:
    print("CUDA is not available.")

Number of available GPUs: 4
GPU 0: Tesla V100-SXM2-32GB
GPU 1: Tesla V100-SXM2-32GB
GPU 2: Tesla V100-SXM2-32GB
GPU 3: Tesla V100-SXM2-32GB


In [52]:
print(f"Current CUDA device: {torch.cuda.current_device()}")
print(f"Device name: {torch.cuda.get_device_name(0)}")

Current CUDA device: 0
Device name: Tesla V100-SXM2-32GB


In [3]:
torch.manual_seed(42)
torch.cuda.manual_seed(42)

In [4]:
# helper
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

class RatingDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=None, pad_token_id=50256):
        self.data = df

        # Pre-tokenize texts
        self.encoded_texts = [
            tokenizer.encode(text) for text in self.data["cleaned_text"]
        ]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
            # Truncate sequences if they are longer than max_length
            self.encoded_texts = [
                encoded_text[:self.max_length]
                for encoded_text in self.encoded_texts
            ]

        # Pad sequences to the longest sequence
        self.encoded_texts = [
            encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]

    def __getitem__(self, index):
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]["target"]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

    def __len__(self):
        return len(self.data)

    def _longest_encoded_length(self):
        max_length = 0
        for encoded_text in self.encoded_texts:
            encoded_length = len(encoded_text)
            if encoded_length > max_length:
                max_length = encoded_length
        return max_length
    
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    outputs = model(input_batch)
    logits = outputs.logits[:, -1, :]  # Logits of last output token
    loss = torch.nn.functional.cross_entropy(logits, target_batch)
    return loss

def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            input_batch, target_batch = input_batch.to(device), target_batch.to(device)

            with torch.no_grad():
                outputs = model(input_batch)
                logits = outputs.logits[:, -1, :]  # Logits of last output token
            predicted_labels = torch.argmax(logits, dim=-1)

            num_examples += predicted_labels.shape[0]
            correct_predictions += (predicted_labels == target_batch).sum().item()
        else:
            break
    return correct_predictions / num_examples

def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches


def plot_values(epochs_seen, examples_seen, train_values, val_values, label="loss"):
    fig, ax1 = plt.subplots(figsize=(5, 3))
 

    ax1.plot(epochs_seen, train_values, label=f"Training {label}")
    ax1.plot(epochs_seen, val_values, linestyle="-.", label=f"Validation {label}")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel(label.capitalize())
    ax1.legend()
 

    ax2 = ax1.twiny()
    ax2.plot(examples_seen, train_values, alpha=0)  # Invisible plot for aligning ticks
    ax2.set_xlabel("Examples seen")
 
    fig.tight_layout()
    plt.show()



In [5]:
train = pd.read_csv('data/train.csv', usecols = ['text','target'])

print(train.head())

                                                text  target
0  Our Deeds are the Reason of this #earthquake M...       1
1             Forest fire near La Ronge Sask. Canada       1
2  All residents asked to 'shelter in place' are ...       1
3  13,000 people receive #wildfires evacuation or...       1
4  Just got sent this photo from Ruby #Alaska as ...       1


In [6]:
train.isnull().sum()

text      0
target    0
dtype: int64

In [7]:
train.target.value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [8]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#(\w+)', r'\1', text)
    text = re.sub(r'[^a-zA-Z\s\.\,\!\?\']', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [9]:
train['cleaned_text'] = train['text'].apply(clean_text).tolist()

In [10]:
train.head(7)

Unnamed: 0,text,target,cleaned_text
0,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask. canada
2,All residents asked to 'shelter in place' are ...,1,all residents asked to 'shelter in place' are ...
3,"13,000 people receive #wildfires evacuation or...",1,", people receive wildfires evacuation orders i..."
4,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...
5,#RockyFire Update => California Hwy. 20 closed...,1,rockyfire update california hwy. closed in bot...
6,#flood #disaster Heavy rain causes flash flood...,1,flood disaster heavy rain causes flash floodin...


In [11]:
from sklearn.model_selection import train_test_split

# Assuming your DataFrame is named 'df' and contains a column 'score'
train_df, val_df = train_test_split(train, test_size=0.2, stratify=train['target'], random_state=42)

In [12]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [13]:
window_size = 55

train_dataset = RatingDataset(
    df=train_df,
    max_length=window_size,
    tokenizer=tokenizer
)
val_dataset = RatingDataset(
    df=val_df,
    max_length=train_dataset.max_length,
    tokenizer=tokenizer)
print(train_dataset.max_length)

55


 #### Overall Model Architecture

In [14]:
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [15]:
num_classes = 2
model.lm_head = torch.nn.Linear(in_features=768, out_features=num_classes)

In [16]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=2, bias=True)
)

In [17]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters: {total_params:,}")

Total trainable parameters: 124,441,346


train.py

In [49]:
# train

# Overall the same as `train_model_simple` in chapter 5
def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                            eval_freq, eval_iter, tokenizer):
    # Initialize lists to track losses and tokens seen
    #model = torch.nn.DataParallel(model)
    model.to(device)
    train_losses, val_losses, train_accs, val_accs = [], [], [], []
    examples_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            input_batch, target_batch = input_batch.to(device), target_batch.to(device)
            optimizer.zero_grad() # Reset loss gradients from previous epoch
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward() # Calculate loss gradients
            optimizer.step() # Update model weights using loss gradients
            examples_seen += input_batch.shape[0] # New: track examples instead of tokens
            global_step += 1

        print('finished 1 epoch')

    return train_losses, val_losses, train_accs, val_accs, examples_seen
        

# Same as chapter 5
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.module.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
        decoded_text = token_ids_to_text(token_ids, tokenizer)
        print(decoded_text.replace("\n", " "))  # Compact print format
    model.train()

In [40]:
def accuracy_confusion_matrix(model, device, data_loader):
    # Assuming you have your trained model
    model.eval()  # Set the model to evaluation mode
    
    # Initialize empty lists to store true labels and predicted labels
    true_labels = []
    predicted_labels = []
    
    # Iterate over the test data loader
    
    for text, labels in data_loader:
        # Move the data to the same device as the model (GPU or CPU)
        text = text.to(device)
        labels = labels.to(device)
    
        # Forward pass through the model
        with torch.no_grad():
            outputs = model(text)
            logits = outputs.logits[:, -1, :]  # Logits of last output token
        predicted = torch.argmax(logits, dim=-1)
    
        # Append true labels and predicted labels to the lists
        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(predicted.cpu().numpy())

        
    # print accuracy
    acc = accuracy_score(true_labels, predicted_labels)
    print(f'Accuracy: {acc:.4f}')
    # Create the confusion matrix
    cm = confusion_matrix(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    # Print the confusion matrix
    print("Confusion Matrix:")
    print(cm)
    
    print('F1')
    print(f'f1: {f1:.4f}')
    return f1, acc


In [41]:
def find_optimal_threshold(model, val_loader, device):
    model.eval()
    all_probs = []
    all_labels = []
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Finding optimal threshold"):
            input_ids, labels = batch
            input_ids = input_ids.to(device)
            labels = labels.to(device)
            
            outputs = model(input_ids)
            logits = outputs.logits[:, -1, :]  # Logits of last output token
            probs = torch.softmax(logits, dim=-1)[:, 1]  # Probability of positive class
            
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    precisions, recalls, thresholds = precision_recall_curve(all_labels, all_probs)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
    optimal_idx = f1_scores[:-1].argmax()  # Exclude the last element as it's undefined
    optimal_threshold = thresholds[optimal_idx]
    
    return optimal_threshold

def create_submission_df(model, test_df, tokenizer, device, val_loader, batch_size=32):
    model.eval()
    predictions = []
    
    # Find optimal threshold
    optimal_threshold = find_optimal_threshold(model, val_loader, device)
    print(f"Optimal threshold: {optimal_threshold:.4f}")
    
    # Create a DataLoader for the test data
    test_dataset = RatingDataset(test_df, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Predicting targets"):
            input_ids, _ = batch  # Unpack the batch
            input_ids = input_ids.to(device)
            outputs = model(input_ids)
            logits = outputs.logits[:, -1, :]  # Logits of last output token
            probs = torch.softmax(logits, dim=-1)[:, 1]  # Probability of positive class
            predicted = (probs > optimal_threshold).int()
            predictions.extend(predicted.cpu().numpy())
    
    # Create the submission dataframe
    submission_df = pd.DataFrame({
        'id': test_df['id'],
        'target': predictions
    })
    
    return submission_df

TODO：
optimize submission batch size, perhaps no difference at all \
try stacking models \
try different number of epochs \
try different LR \
try different weight decay \
Maybe make a grid search or cross validation

## GRID

In [42]:
from tqdm import tqdm
torch.manual_seed(42)

<torch._C.Generator at 0x7fe03c18cf10>

Efficient Method

In [43]:
torch.__version__

'2.3.0+cu118'

In [None]:
num_workers = 4
results = []
#Grid
param_grid = {
    'learning_rate': [1e-5],
    'batch_size': [16, 32],
    'weight_decay': [0.01, 0.05, 0.1, 0.15, 0.2],
    'epochs': [4, 6, 8]
}
count = 1
for lr in tqdm(param_grid['learning_rate'], desc="Learning rates"):
    for batch_size in tqdm(param_grid['batch_size'], desc="Batch sizes", leave=False):
        train_loader = DataLoader(
                dataset=train_dataset,
                batch_size=batch_size,
                shuffle=True,
                num_workers=num_workers,
                drop_last=True,
            )

        val_loader = DataLoader(
                dataset=val_dataset,
                batch_size=batch_size,
                num_workers=num_workers,
                drop_last=False,
            )
        for weight_decay in tqdm(param_grid['weight_decay'], desc="Weight decays", leave=False):
            # Initialize model and optimizer
            model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
            model.lm_head = torch.nn.Linear(in_features=768, out_features=2)
            optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)



            for epoch in range(1, max(param_grid['epochs']) + 1):
                # Train for one epoch
                train_classifier_simple(model, train_loader, val_loader, optimizer, device, 
                                            num_epochs=1, eval_freq=len(train_loader), eval_iter=None, 
                                            tokenizer=None)  # Assuming tokenizer is not needed here
                
                # If this epoch is in param_grid['epochs'], print the results
                if epoch in param_grid['epochs']:
                    # Evaluate
                    val_f1, val_accuracy = accuracy_confusion_matrix(model, device, val_loader)
                    results.append({
                    'learning_rate': lr,
                    'batch_size': batch_size,
                    'weight_decay': weight_decay,
                    'epochs': epoch,
                    'val_accuracy': val_accuracy,
                    'val_f1_score': val_f1
                })
                    print(f"LR: {lr}, Batch Size: {batch_size}, Weight Decay: {weight_decay}, "
                          f"Epochs: {epoch}, Accuracy: {val_accuracy:.4f}, F1: {val_f1:.4f}")
                    print(f'finished run {count} ============================')
                    count += 1

Learning rates:   0%|                                                                             | 0/1 [00:00<?, ?it/s]
Batch sizes:   0%|                                                                                | 0/2 [00:00<?, ?it/s][A

Weight decays:   0%|                                                                              | 0/5 [00:00<?, ?it/s][A[A

finished 1 epoch
finished 1 epoch
finished 1 epoch
finished 1 epoch
Accuracy: 0.8418
Confusion Matrix:
[[788  81]
 [160 494]]
F1
f1: 0.8039
LR: 1e-05, Batch Size: 16, Weight Decay: 0.01, Epochs: 4, Accuracy: 0.8418, F1: 0.8039
finished 1 epoch
finished 1 epoch
Accuracy: 0.7840
Confusion Matrix:
[[653 216]
 [113 541]]
F1
f1: 0.7668
LR: 1e-05, Batch Size: 16, Weight Decay: 0.01, Epochs: 6, Accuracy: 0.7840, F1: 0.7668
finished 1 epoch
finished 1 epoch




Weight decays:  20%|█████████████▊                                                       | 1/5 [08:05<32:21, 485.32s/it][A[A

Accuracy: 0.8050
Confusion Matrix:
[[703 166]
 [131 523]]
F1
f1: 0.7789
LR: 1e-05, Batch Size: 16, Weight Decay: 0.01, Epochs: 8, Accuracy: 0.8050, F1: 0.7789
finished 1 epoch
finished 1 epoch
finished 1 epoch
finished 1 epoch
Accuracy: 0.8181
Confusion Matrix:
[[709 160]
 [117 537]]
F1
f1: 0.7950
LR: 1e-05, Batch Size: 16, Weight Decay: 0.05, Epochs: 4, Accuracy: 0.8181, F1: 0.7950
finished 1 epoch
finished 1 epoch
Accuracy: 0.8070
Confusion Matrix:
[[691 178]
 [116 538]]
F1
f1: 0.7854
LR: 1e-05, Batch Size: 16, Weight Decay: 0.05, Epochs: 6, Accuracy: 0.8070, F1: 0.7854
finished 1 epoch
finished 1 epoch




Weight decays:  40%|███████████████████████████▌                                         | 2/5 [16:09<24:13, 484.63s/it][A[A

Accuracy: 0.8050
Confusion Matrix:
[[690 179]
 [118 536]]
F1
f1: 0.7831
LR: 1e-05, Batch Size: 16, Weight Decay: 0.05, Epochs: 8, Accuracy: 0.8050, F1: 0.7831
finished 1 epoch
finished 1 epoch
finished 1 epoch
finished 1 epoch
Accuracy: 0.7787
Confusion Matrix:
[[620 249]
 [ 88 566]]
F1
f1: 0.7706
LR: 1e-05, Batch Size: 16, Weight Decay: 0.1, Epochs: 4, Accuracy: 0.7787, F1: 0.7706
finished 1 epoch
finished 1 epoch
Accuracy: 0.8155
Confusion Matrix:
[[712 157]
 [124 530]]
F1
f1: 0.7905
LR: 1e-05, Batch Size: 16, Weight Decay: 0.1, Epochs: 6, Accuracy: 0.8155, F1: 0.7905
finished 1 epoch
finished 1 epoch




Weight decays:  60%|█████████████████████████████████████████▍                           | 3/5 [24:12<16:08, 484.11s/it][A[A

Accuracy: 0.8293
Confusion Matrix:
[[756 113]
 [147 507]]
F1
f1: 0.7959
LR: 1e-05, Batch Size: 16, Weight Decay: 0.1, Epochs: 8, Accuracy: 0.8293, F1: 0.7959
finished 1 epoch


In [53]:
df_results = pd.DataFrame(results)

# Save results to CSV
df_results.to_csv('grid_search_results_0.csv', index=False)

In [46]:
import itertools
from tqdm import tqdm
count = 0
# Define the hyperparameter ranges
learning_rates = [1e-5, 3e-5, 5e-5, 7e-5]
epochs_list = [4, 6, 8]
batch_sizes = [8, 16, 32]
weight_decays = [0.01, 0.05, 0.1, 0.15, 0.2]

# Create all combinations
combinations = list(itertools.product(learning_rates, epochs_list, batch_sizes, weight_decays))

# Initialize results list
results = []
num_classes = 2
num_workers = 4
# Perform grid search
for lr, epochs, batch_size, weight_decay in tqdm(combinations, desc="Grid Search Progress"):
    count += 1
    print('run', count, '==================')
    print('lr', lr)
    print('epoch', epochs)
    print('batch_size', batch_size)
    print('weight_decay', weight_decay)
    
    # we hvae ran the first 36 combos
    if count <= 53:
        continue
    
    # Initialize model, optimizer, and data loaders with current hyperparameters
    model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
    model.lm_head = torch.nn.Linear(in_features=768, out_features=num_classes)
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        drop_last=True,
    )
    
    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=False,
    )
    
    # Train the model
    train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
        model, train_loader, val_loader, optimizer, device,
        num_epochs=epochs, eval_freq=25, eval_iter=5,
        tokenizer=tokenizer
    )
    
    # Evaluate the model
    val_f1, val_accuracy = accuracy_confusion_matrix(model, device, val_loader)
    
    # Store results
    results.append({
        'Learning Rate': lr,
        'Epochs': epochs,
        'Batch Size': batch_size,
        'Weight Decay': weight_decay,
        'Val Accuracy': val_accuracy,
        'Val F1 Score': val_f1
    })

# Create DataFrame from results
df_results = pd.DataFrame(results)

# Save results to CSV
df_results.to_csv('grid_search_results.csv', index=False)

print("Grid search completed. Results saved to 'grid_search_results.csv'")

Grid Search Progress:   0%|                                                                      | 0/90 [00:00<?, ?it/s]

lr 1e-05
epoch 4
batch_size 8
weight_decay 0.01
lr 1e-05
epoch 4
batch_size 8
weight_decay 0.05
lr 1e-05
epoch 4
batch_size 8
weight_decay 0.1
lr 1e-05
epoch 4
batch_size 8
weight_decay 0.15
lr 1e-05
epoch 4
batch_size 8
weight_decay 0.2
lr 1e-05
epoch 4
batch_size 16
weight_decay 0.01
lr 1e-05
epoch 4
batch_size 16
weight_decay 0.05
lr 1e-05
epoch 4
batch_size 16
weight_decay 0.1
lr 1e-05
epoch 4
batch_size 16
weight_decay 0.15
lr 1e-05
epoch 4
batch_size 16
weight_decay 0.2
lr 1e-05
epoch 4
batch_size 32
weight_decay 0.01
lr 1e-05
epoch 4
batch_size 32
weight_decay 0.05
lr 1e-05
epoch 4
batch_size 32
weight_decay 0.1
lr 1e-05
epoch 4
batch_size 32
weight_decay 0.15
lr 1e-05
epoch 4
batch_size 32
weight_decay 0.2
lr 1e-05
epoch 6
batch_size 8
weight_decay 0.01
lr 1e-05
epoch 6
batch_size 8
weight_decay 0.05
lr 1e-05
epoch 6
batch_size 8
weight_decay 0.1
lr 1e-05
epoch 6
batch_size 8
weight_decay 0.15
lr 1e-05
epoch 6
batch_size 8
weight_decay 0.2
lr 1e-05
epoch 6
batch_size 16
weight_

Grid Search Progress:  59%|███████████████████████████████████▉                         | 53/90 [00:08<00:05,  6.43it/s]


KeyboardInterrupt: 