In [1]:
import requests

def check_internet_connection(url="https://www.google.com", timeout=5):
    """Probe ``url`` with a GET request and report whether the network is reachable.

    A human-readable status line is printed (same messages as before) and the
    outcome is also returned so callers can branch on it.

    Args:
        url: Address to probe. Defaults to the site that was previously hard-coded.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        bool: True only when the probe answered with HTTP 200, otherwise False.
    """
    try:
        # Send a GET request to a reliable website
        response = requests.get(url, timeout=timeout)
    except requests.ConnectionError:
        # Kept ahead of Timeout so connect-timeouts keep their original message.
        print("Internet connection is not available.")
        return False
    except requests.Timeout:
        print("Request timed out. Internet connection may be slow or unavailable.")
        return False
    except requests.RequestException as exc:
        # Any other requests failure (e.g. too many redirects) used to escape
        # as an unhandled traceback; report it instead.
        print(f"Internet connection check failed: {exc}")
        return False

    # Check the response status code
    if response.status_code == 200:
        print("Internet connection is available.")
        return True

    print("Internet connection is not available.")
    return False

# Call the function to check the internet connection
check_internet_connection()                                                      

Internet connection is available.


In [3]:
import torch
import tiktoken
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import transformers
import tensorflow as tf
from models_1 import GPTModel
from helper_function import *
from load_gpt2 import *

2024-06-24 03:18:41.802936: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-24 03:18:41.937581: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Package Versions

In [3]:
from importlib.metadata import PackageNotFoundError, version

# Distributions this notebook depends on; printing their versions records the
# environment the results were produced with.
pkgs = ["matplotlib",
        "numpy",
        "tiktoken",
        "torch",
        "transformers", # For OpenAI's pretrained weights
        "pandas",      # Dataset loading
        "tensorflow",
       ]
for p in pkgs:
    # One missing distribution must not abort the report for the others.
    try:
        print(f"{p} version: {version(p)}")
    except PackageNotFoundError:
        print(f"{p} version: not installed")

matplotlib version: 3.8.3
numpy version: 1.26.4
tiktoken version: 0.7.0
torch version: 2.3.0+cu118
transformers version: 4.39.0.dev0
pandas version: 2.2.1
tensorflow version: 2.16.1


Check Device

In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")

if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    # Report how many GPUs are visible, then name each one by index.
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")

    for i, gpu_name in enumerate(map(torch.cuda.get_device_name, range(num_gpus))):
        print(f"GPU {i}: {gpu_name}")

Number of available GPUs: 4
GPU 0: Tesla V100-SXM2-32GB
GPU 1: Tesla V100-SXM2-32GB
GPU 2: Tesla V100-SXM2-32GB
GPU 3: Tesla V100-SXM2-32GB


Specify tokenizer and padding token

In [4]:
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [8]:
print(tokenizer.encode('Hello world', allowed_special={"<|endoftext|>"}))

[15496, 995]


## Load Pre-trained GPT2 Model

In [5]:
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"

# Per-size architecture settings, keyed by the model's display name.
model_configs = {
    "gpt2-small (124M)": dict(emb_dim=768, n_layers=12, n_heads=12),
    "gpt2-medium (355M)": dict(emb_dim=1024, n_layers=24, n_heads=16),
    "gpt2-large (774M)": dict(emb_dim=1280, n_layers=36, n_heads=20),
    "gpt2-xl (1558M)": dict(emb_dim=1600, n_layers=48, n_heads=25),
}

# Settings shared by every size, followed by the entries of the chosen size.
BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.0,        # Dropout rate
    "qkv_bias": True,        # Query-key-value bias
    **model_configs[CHOOSE_MODEL],
}

In [6]:
# Pull the size tag out of the display name, e.g. "gpt2-small (124M)" -> "124M".
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
# NOTE(review): download_and_load_gpt2 arrives via a star import; presumably it
# fetches (or reuses) the checkpoint files under models_dir — confirm in its module.
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

# Build the architecture from BASE_CONFIG, then copy the loaded weights into it.
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
# Evaluation mode; the trailing semicolon keeps the module repr out of the cell output.
model.eval();

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


We try the pretrained model without modification

In [7]:
text_1 = "Every effort moves you"

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(text_1, tokenizer),
    max_new_tokens=15,
    context_size=BASE_CONFIG["context_length"]
)

print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you forward.

The first step is to understand the importance of your work


## Data

In [8]:
train_dataset = RatingDataset(
    csv_file="data/train_df.csv",
    max_length=300,
    tokenizer=tokenizer
)

print(train_dataset.max_length)

300


In [9]:
val_dataset = RatingDataset(
    csv_file="data/val_df.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)
test_dataset = RatingDataset(
    csv_file="data/test_df.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)

In [10]:
from torch.utils.data import DataLoader

num_workers = 4
batch_size = 16

torch.manual_seed(123)

# Training split: shuffled, and the incomplete final batch is dropped.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    shuffle=True,
    drop_last=True,
)

# Validation and test splits share one recipe: original order, every sample kept.
val_loader, test_loader = (
    DataLoader(
        split,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=False,
    )
    for split in (val_dataset, test_dataset)
)

In [11]:
print("Train loader:")
# Fetch a single batch and show its first example: length, decoded text, label.
input_batch, target_batch = next(iter(train_loader))
print(len(input_batch[0]))
print(token_ids_to_text(input_batch[0], tokenizer))
print(target_batch[0])

print("Input batch dimensions:", input_batch.shape)
print("Label batch dimensions", target_batch.shape)

Train loader:
300
this probably makes me anti-wisconsin... but i simply love this place!  the non-fat chocolate tastes better than the calorie-laden alternatives densely populating the area.  and on a very lucky day, the grapefruit sorbet is the ultimate when fresh from the tap.  hitting that flavor is like winning the lottery!  \n\nalso, the guilt-free menu options make it a great date-night stop if you don't want to hear your SO complaining post-indulgence about all of the weight they probably gained.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext

In [12]:
print(f"{len(train_loader)} training batches")
print(f"{len(val_loader)} validation batches")
print(f"{len(test_loader)} test batches")

32500 training batches
8125 validation batches
3125 test batches


### Modify Model

In [13]:
print(model)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

Freeze all parameters (disable gradient updates)

In [14]:
for param in model.parameters():
    param.requires_grad = False

In [15]:
num_classes = 5
model.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"], out_features=num_classes)

In [16]:
# Re-enable gradients for the last two transformer blocks and the final norm
# layer, in that order.
for param in [
    *model.trf_blocks[-1].parameters(),
    *model.trf_blocks[-2].parameters(),
    *model.final_norm.parameters(),
]:
    param.requires_grad = True

In [17]:
inputs = tokenizer.encode("Do you have time")
inputs = torch.tensor(inputs).unsqueeze(0)
print("Inputs:", inputs)
print("Inputs dimensions:", inputs.shape) # shape: (batch_size, num_tokens)

Inputs: tensor([[5211,  345,  423,  640]])
Inputs dimensions: torch.Size([1, 4])


In [18]:
with torch.no_grad():
    outputs = model(inputs)

print("Outputs:\n", outputs)
print("Outputs dimensions:", outputs.shape) # shape: (batch_size, num_tokens, num_classes)

Outputs:
 tensor([[[-1.3588, -2.0936,  1.8803,  0.0871, -0.3640],
         [-7.8132, -8.9034, 11.5499, -4.9334,  2.9435],
         [-5.5171, -7.2900,  8.5563, -4.9027,  2.7175],
         [-3.3443, -6.5046,  6.4765, -1.9778, -0.4208]]])
Outputs dimensions: torch.Size([1, 4, 5])


In [19]:
print("Last output token:", outputs[:, -1, :])

Last output token: tensor([[-3.3443, -6.5046,  6.4765, -1.9778, -0.4208]])


In [20]:
probas = torch.softmax(outputs[:, -1, :], dim=-1)
label = torch.argmax(probas)
print("Class label:", label.item())

Class label: 2


In [21]:
logits = outputs[:, -1, :]
label = torch.argmax(logits)
print("Class label:", label.item())

Class label: 2


In [26]:
! pip install -e .

Obtaining file:///home/jinran/Review_classfication
[31mERROR: file:///home/jinran/Review_classfication does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m

In [27]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device) # nn.Module.to() moves the module in place and returns it, so the reassignment is optional

In [28]:
torch.manual_seed(42) # For reproducibility due to the shuffling in the training data loader

# Accuracy before fine-tuning, estimated on the first 50 batches of each split.
train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=50)
val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=50)
test_accuracy = calc_accuracy_loader(test_loader, model, device, num_batches=50)

# Stray characters after the first print call made this cell a SyntaxError on re-run.
print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 18.62%
Validation accuracy: 19.75%
Test accuracy: 24.62%


In [29]:
with torch.no_grad(): # Disable gradient tracking for efficiency because we are not training, yet
    train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)
    test_loss = calc_loss_loader(test_loader, model, device, num_batches=5)

print(f"Training loss: {train_loss:.3f}")
print(f"Validation loss: {val_loss:.3f}")
print(f"Test loss: {test_loss:.3f}")

Training loss: 6.956
Validation loss: 6.781
Test loss: 8.158


# We Train The Model!!

In [None]:
import time
from train import *
# Wall-clock timer for the whole fine-tuning run.
start_time = time.time()

# Fixed seed so the run is repeatable.
torch.manual_seed(123)

# model.parameters() also yields the frozen weights; only the parameters left
# with requires_grad=True earlier in the notebook will receive gradients.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

num_epochs = 1
# NOTE(review): train_classifier_simple is defined outside this notebook.
# Presumably eval_freq=50 / eval_iter=5 mean "evaluate every 50 steps on
# 5 batches" (the log below reports every 50 steps) — confirm in its source.
train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=50, eval_iter=5,
    tokenizer=tokenizer
)

end_time = time.time()
# Elapsed time is reported in minutes.
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 0.649, Val loss 0.985
Ep 1 (Step 000050): Train loss 0.712, Val loss 0.996
Ep 1 (Step 000100): Train loss 0.594, Val loss 0.992
Ep 1 (Step 000150): Train loss 0.739, Val loss 0.996
Ep 1 (Step 000200): Train loss 0.783, Val loss 1.005
Ep 1 (Step 000250): Train loss 0.680, Val loss 0.998
Ep 1 (Step 000300): Train loss 0.720, Val loss 1.014
Ep 1 (Step 000350): Train loss 0.853, Val loss 1.026
Ep 1 (Step 000400): Train loss 0.835, Val loss 0.999
Ep 1 (Step 000450): Train loss 0.617, Val loss 1.020
Ep 1 (Step 000500): Train loss 0.841, Val loss 1.026
Ep 1 (Step 000550): Train loss 0.706, Val loss 1.003
Ep 1 (Step 000600): Train loss 0.859, Val loss 1.035
Ep 1 (Step 000650): Train loss 0.765, Val loss 1.029
Ep 1 (Step 000700): Train loss 0.700, Val loss 1.026
Ep 1 (Step 000750): Train loss 0.765, Val loss 1.021
Ep 1 (Step 000800): Train loss 0.855, Val loss 1.017
Ep 1 (Step 000850): Train loss 0.753, Val loss 0.997
Ep 1 (Step 000900): Train loss 0.716, Val loss

## Inference

In [40]:
train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=50)
val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=50)
test_accuracy = calc_accuracy_loader(test_loader, model, device, num_batches=50)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

Training accuracy: 68.25%
Validation accuracy: 66.25%
Test accuracy: 69.00%


In [48]:
example = "This is ok"
tokens = text_to_token_ids(example, tokenizer)
tokens = tokens.to(device)
with torch.no_grad():
    logits = model(tokens)[:, -1, :]
predicted_labels = torch.argmax(logits, dim=-1)
print(predicted_labels.item())

1


In [42]:
example = "I hate this film"
def classifier(text, max_length=None, pad_token_id=50256):
    """Classify ``text`` with the fine-tuned model and report the result.

    The prediction is read from the logits at the last token position. The
    function prints 'bad' for class 0 and 'good' for any other class (as
    before) and now also returns the predicted class id so callers can use it.

    Args:
        text: Raw input string.
        max_length: Optional fixed sequence length. When given, the token ids
            are truncated to this length and right-padded with
            ``pad_token_id``. The training batch previewed earlier in this
            notebook is padded to 300 tokens with ``<|endoftext|>``, so
            passing ``train_dataset.max_length`` presumably reproduces the
            training-time layout — TODO confirm against the dataset class.
            ``None`` (default) feeds the text unpadded, exactly as before.
        pad_token_id: Token id used for padding; 50256 is ``<|endoftext|>``
            in the GPT-2 encoding.

    Returns:
        int: Index of the highest-scoring class.
    """
    tokens = text_to_token_ids(text, tokenizer)

    if max_length is not None:
        # Truncate first, then right-pad up to the requested length.
        tokens = tokens[:, :max_length]
        missing = max_length - tokens.shape[1]
        if missing > 0:
            filler = torch.full(
                (tokens.shape[0], missing),
                pad_token_id,
                dtype=tokens.dtype,
                device=tokens.device,
            )
            tokens = torch.cat([tokens, filler], dim=1)

    tokens = tokens.to(device)
    with torch.no_grad():
        logits = model(tokens)[:, -1, :]
    predicted_label = torch.argmax(logits, dim=-1).item()

    # NOTE(review): the head has 5 classes, yet only class 0 is reported as
    # 'bad' — verify this two-way mapping is still intended.
    if predicted_label == 0:
        print('bad')
    else:
        print('good')
    return predicted_label

In [53]:
# NOTE(review): the file name says "4_class" but the classification head was
# created with num_classes = 5 — confirm the intended checkpoint name.
save_model(model, optimizer, '4_class.pth')

model saved to:  4_class.pth


### Confusion Matrix

In [31]:
import torch
from sklearn.metrics import confusion_matrix

# Evaluation mode for the trained classifier.
model.eval()

# Ground-truth and predicted class ids gathered over the whole test split.
true_labels, predicted_labels = [], []

with torch.no_grad():
    for text, labels in test_loader:
        # Inputs and labels go to the same device as the model.
        text, labels = text.to(device), labels.to(device)

        # The class is read from the logits at the last token position.
        logits = model(text)[:, -1, :]
        predicted = logits.argmax(dim=-1)

        # Bring both back to the host and accumulate them.
        true_labels += list(labels.cpu().numpy())
        predicted_labels += list(predicted.cpu().numpy())

# First argument is the ground truth, second the predictions.
cm = confusion_matrix(true_labels, predicted_labels)

print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[7345 2095  447   48   65]
 [2224 4810 2722  186   58]
 [ 439 1473 5921 1984  183]
 [  88  139 2129 5583 2061]
 [  78   30  376 3076 6440]]


Small model with the last two transformer blocks trainable (gradients enabled) results in 0.98.