## Yahoo! Answers Dataset
Testing and manipulation

https://huggingface.co/datasets/yahoo_answers_topics/blob/main/README.md

Original REPO: https://github.com/LC-John/Yahoo-Answers-Topic-Classification-Dataset

THIS CAN ONLY RUN IN GPU!!!

In [None]:

from copy import deepcopy
import os
import random
import argparse
from pathlib import Path
from time import perf_counter
import json
import accelerate
import bitsandbytes
import math
import hashlib
from types import SimpleNamespace
from typing import Optional, Union, Tuple
import warnings

import pynvml
import torch
from torch import nn
from torch import optim
from torch.utils.checkpoint import checkpoint
import torch.nn.functional as F
import torch.multiprocessing as mp
import transformers
from datasets import load_dataset
import pandas as pd

#QLoRA stuff
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

from pytorch_soo import pytorch_soo
from pytorch_soo.pytorch_soo import optim_hfcr_newton

from pytorch_soo.pytorch_soo import quasi_newton

from pytorch_soo.pytorch_soo import nonlinear_conjugate_gradient as nlcg
from pytorch_soo.pytorch_soo.line_search_spec import LineSearchSpec
from pytorch_soo.pytorch_soo.trust_region import BadTrustRegionSpec, TrustRegionSpec

from yahoo_answers_dataset import Yahoo_Answers_Dataset


In [2]:
def load_data(batch_size_train:int, batch_size_test:int, device) -> Tuple:
    """
    Get the data based upon the relevant arguments
    """

    train = pd.read_csv("/rcfs/projects/sml2024/train_clean_news_articles_categorization.csv")

    train_text = train.iloc[:,-1].to_list()

    train_targets = torch.tensor(train.iloc[:,:-1].to_numpy()).to('cpu')
    # global NUM_OUTPUTS
    # NUM_OUTPUTS = train_targets.shape[1]


    tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2", max_length=INPUT_LENGTH)
    
    # tokenizer.padding_side = "left"
    # Define PAD Token = EOS Token
    tokenizer.pad_token = tokenizer.eos_token
    # tokenizer = transformers.GPT2Tokenizer() #What's the difference betgween this and the one above?


    train_tokenizer_output = tokenizer(train_text, return_tensors='pt', padding='max_length',
                                       max_length=INPUT_LENGTH, truncation=True).to('cpu')

    #New!
    train_tokenized_ids = train_tokenizer_output['input_ids']
    train_masks = train_tokenizer_output['attention_mask']

    #Concatenate the tokenized ids and the masks horizontally to decompose later
    train_ids_and_mask = torch.cat(tensors=(train_tokenized_ids, train_masks), dim = 1)

    
    print("The training data has been loaded in", flush = True)


    train = torch.utils.data.TensorDataset(train_ids_and_mask, train_targets)

    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size_train, shuffle=True)

    ###############################
    ###########Test Data###########
    ###############################

    test_csv = pd.read_csv("/rcfs/projects/sml2024/test_clean_news_articles_categorization.csv")
    test_text = test_csv.iloc[:,-1].to_list()

    test_tokenizer_output = tokenizer(test_text, return_tensors='pt', padding='max_length',
                                       max_length=INPUT_LENGTH, truncation=True)
    # test_tokenized_inputs.to(device)
    test_tokenized_ids = test_tokenizer_output['input_ids']
    test_masks = test_tokenizer_output['attention_mask']

    #Concatenate the tokenized ids and the masks horizontally to decompose later
    test_ids_and_mask = torch.cat(tensors=(test_tokenized_ids, test_masks), dim = 1)

    #One hot encode the targets
    test_targets = torch.tensor(test_csv.iloc[:,:-1].to_numpy())

    test = torch.utils.data.TensorDataset(test_ids_and_mask, test_targets)
    print("The test data has been loaded in", flush = True)


    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size_test, shuffle=False)


    return train_loader, test_loader


In [3]:
INPUT_LENGTH = 256
NUM_OUTPUTS = 8
def get_model_and_loss(device='cpu'):
    """
    Does what it says on the tin.
    """
    compute_dtype = getattr(torch, "float16")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        load_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = transformers.AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path='gpt2',
                                                                            max_length=INPUT_LENGTH,
                                                                            num_labels=NUM_OUTPUTS,
                                                                            quantization_config=bnb_config,
                                                                            device_map = device)
    # model = transformers.AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path="distilbert/distilgpt2", max_length=INPUT_LENGTH, num_labels=NUM_OUTPUTS)
    
    model.config.pad_token_id = model.config.eos_token_id
    model = prepare_model_for_kbit_training(model)
    config = LoraConfig(
        r=32, #Higher = more expressivity. Controls number of parameters used i.e. memory
        lora_alpha=32,
        target_modules=["score", "c_attn", "c_proj", "c_fc", "score.weights"], #target_modules = ["c_attn", "c_proj", "c_fc"]
        bias = "none",
        lora_dropout=0 #FIXME: Check with SOO paper and see if this needs to be set to 0?
        # task_type = "SEQ_CLS"
    )

    # lora_config = LoraConfig(
    #     r=16,
    #     lora_alpha=32,
    #     target_modules=target_modules,
    #     lora_dropout=0.05,
    #     bias="none",
    #     task_type="SEQ_CLS"
    #     )
    model.gradient_checkpointing_enable()
    model = get_peft_model(model, config)
    # print(peft.print_number_of_trainable_model_parameters(model))
    # model.to('cpu')
    model.print_trainable_parameters()

    loss_calc = torch.nn.MSELoss() #Categorical cross entropy doesn't support one hot encoding  MSELoss

    # if args.read_nn:
    #     print("Reading: ", args.read_nn, flush = True)
    #     model.load_state_dict(torch.load(args.read_nn))

    return model, loss_calc


In [8]:
model, loss_calc = get_model_and_loss('cuda')
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01, momentum=0.9)


Unused kwargs: ['load_4bit_use_double_quant']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 4,743,424 || all params: 129,189,376 || trainable%: 3.671682724127408


In [9]:
train_loader, test_loader = load_data(32, batch_size_test =32, device='cuda')

The training data has been loaded in
The test data has been loaded in


In [10]:

# for data, targets in train_loader:
#     n = data.shape[1]//2
#     optim.zero_grad()
#     inputs, attention = data[:,:n].to('cuda'), data[:,n:].to('cuda')
#     output = model(inputs, attention)
#     probs = torch.nn.functional.softmax(output.logits, dim = 1)
#     loss = lossfn(probs.to('cpu'), targets.to('cpu'))
#     print(f"loss is {loss}")
#     print(f"probs are {probs[0]}")
#     model_class_prediction = torch.argmax(probs, dim = 1)
#     target_index = torch.argmax(targets, dim = 1)

#             #Calculate number correct in this batch
#     batch_correct = torch.sum(model_class_prediction.to('cpu')[0] == target_index.to('cpu')[0]).item()
#     print(model_class_prediction[0])
#     print(target_index[0])
#     print(batch_correct)
#     optim.step()
    

In [11]:
for epoch in range(5):
    # epoch = 2
    """Perform a training epoch using a first order optimizer"""
    model.to('cuda')
    model.train()
    
    linear_probe = False
    if linear_probe:
        #Freeze everything
        for name,param in model.named_parameters():
            param.requires_grad = False
            if name == 'score.weight':
                param.requires_grad = True
            if name == 'score.bias':
                param.requires_grad = True
        print("Using linear probe!")
        
    
    correct = 0
    train_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):
    
        #Split data into the actual inputs and the mask
        n = data.shape[1]
        n = n//2
        data, attention_mask =  data[:,:n].to('cuda'), data[:,n:].to('cuda')
        target.to('cuda')
    
        optimizer.zero_grad()
        output = model(input_ids=data, attention_mask=attention_mask)
        softmax_out = torch.nn.functional.softmax(output.logits, dim = 1)
    
        #Next two lines solved a strange device mismatch error.
        softmax_out = softmax_out.to('cuda')
        target = target.to('cuda')
        target = target.to(torch.float32)
    
        loss = loss_calc(softmax_out, target)  #loss_calc is MSE loss
        train_loss += loss.item()
        if math.isnan(train_loss):
            accuracy = float("nan")
            break
        
        #Returns a tensor, where each entry is the index of the largest element in a row. 
        model_class_prediction = torch.argmax(softmax_out, dim = 1)
        target_index = torch.argmax(target, dim = 1)
    
        #Calculate number correct in this batch
        batch_correct = torch.sum(model_class_prediction == target_index).item()
    
        correct += batch_correct
    
        loss.backward()
        optimizer.step()
    
        if batch_idx % 10 == 0:
            print(
                "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                    epoch,
                    batch_idx * len(data),
                    len(train_loader.dataset),
                    100.0 * batch_idx / len(train_loader),
                    loss.item(),
                )
            )
    print(
        "\nTrain set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)".format(
            train_loss,
            correct,
            len(train_loader.dataset),
            100.0 * correct / len(train_loader.dataset),
        )
    )
    accuracy = 100.0 * correct / len(train_loader.dataset)
    
    



Train set: Avg. loss: 7.6295, Accuracy: 234/1835 (12.75%)

Train set: Avg. loss: 7.3401, Accuracy: 271/1835 (14.77%)

Train set: Avg. loss: 6.9909, Accuracy: 310/1835 (16.89%)

Train set: Avg. loss: 6.8217, Accuracy: 314/1835 (17.11%)

Train set: Avg. loss: 6.6708, Accuracy: 355/1835 (19.35%)


In [None]:
torch.cuda.empty_cache()

In [None]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
torch.cuda.memory_summary(abbreviated=True)


In [None]:
# cd existing-project
# git init
# git add dataset.ipynb
# git commit -m "Initial Commit"
# git remote add origin https://stash.pnnl.gov/scm/~meye795/soo.git
# git push -u origin HEAD:main

In [None]:
print(f"Type of the training set is {type(ya.train)}")
print(f"With keys {ya.train.keys()}")
len(ya.train['seq'])

In [None]:
# tokenizer_config = transformers.PretrainedConfig.from_pretrained()INPUT_LENGTH = 100
INPUT_LENGTH = 100
tokenizer = transformers.AutoTokenizer.from_pretrained("openai-community/gpt2", max_length=INPUT_LENGTH)
# tokenizer.padding_side = "left"
# Define PAD Token = EOS Token
tokenizer.pad_token = tokenizer.eos_token
# tokenizer = transformers.GPT2Tokenizer() #What's the difference betgween this and the one above?
model = transformers.AutoModelForSequenceClassification.from_pretrained('openai-community/gpt2', max_length=100)

In [None]:
# tokenizer.tokenize() #whats the difference between tokenizer.encode()??

#TODO: Add tokenizer config. Cutoff, padding size, unknown token.
tokenized_inputs = [tokenizer.encode(x,return_tensors='pt',padding="max_length", max_length=INPUT_LENGTH, truncation = True) for x in ya.train['seq']]
tokenized_inputs = torch.vstack(tensors = tokenized_inputs)

In [None]:
# torch.save(tokenized_inputs, 'tokenized_inputs.pt')
tokenized_inputs = torch.load('tokenized_inputs.pt')
tokenized_inputs.to('cpu')

In [None]:
torch.cuda.memr()

In [None]:
BATCH_SIZE = 64
train_dataloader = torch.utils.data.DataLoader(tokenized_inputs, batch_size=BATCH_SIZE, shuffle=True)

## Setting up the optimizer
Based off the paper: https://arxiv.org/pdf/2307.11684.pdf Minibatching Offers Improved Generalization Performance for Second Order Optimizers (ICML 2023)

### READ THIS REPO!!! https://github.com/pnnl/pytorch_soo/blob/master/pytorch_soo/nonlinear_conjugate_gradient.py

In [None]:
OPTIMIZER_NAME = None
if OPTIMIZER_NAME == "SGD":
    optimizer = torch.optim.SGD() 
elif OPTIMIZER_NAME == "FR":
    optimizer = optim.BASIC(
        model.parameters(), method = 'FR',
        line_search = 'Strong_Wolfe', c1 = 1e-4,
        c2 = 0.5, lr = 0.2, max_ls = 25)
elif OPTIMIZER_NAME == "LBFGS":
    optimizer = optimize.BFGS.
    
def closure(lossfn):
    optimizer.zero_grad()
    loss = lossfn(model_output, target)
    loss.backward()
    return loss

loss = closure()
optimizer.step(closure)

In [None]:
test = optimize.lbfgsb()

We need to cut down the number of training samples. It is way too large. Go ahead and remove 30% from each class.

In [None]:
train_df[train_df['label']==10].shape

In [None]:
ten = train_df[train_df['label']==10]
nine = train_df[train_df['label']==9]
eight = train_df[train_df['label']==8]
seven = train_df[train_df['label']==7]
six = train_df[train_df['label']==6]
five = train_df[train_df['label']==5]
four = train_df[train_df['label']==4]
three = train_df[train_df['label']==3]
two = train_df[train_df['label']==2]
one = train_df[train_df['label']==1]

In [None]:
ten = ten.sample(frac = 0.5, replace = False)
nine = nine.sample(frac = 0.5, replace = False)
eight = eight.sample(frac = 0.5, replace = False)
seven = seven.sample(frac = 0.5, replace = False)
six = six.sample(frac = 0.5, replace = False)
five = five.sample(frac = 0.5, replace = False)
four = four.sample(frac = 0.5, replace = False)
three = three.sample(frac = 0.5, replace = False)
two = two.sample(frac = 0.5, replace = False)
one = one.sample(frac = 0.5, replace = False)

In [None]:
final = pd.concat(objs = [one, two, three, four, five, six, seven, eight, nine, ten], ignore_index=True)

In [None]:
final.head()

In [None]:
train_df.isna().sum()

Drop the NaNs from the train_df then use the subset.

In [None]:
print(f"Number of NaNs in train_df is {train_df.isna().sum()}")
no_nans = train_df.dropna(inplace = False)
print(f"Number of NaNs in no_nans is {no_nans.isna().sum()}")


In [None]:
no_nans

In [None]:
ten = no_nans[no_nans['label']==10]
nine = no_nans[no_nans['label']==9]
eight = no_nans[no_nans['label']==8]
seven = no_nans[no_nans['label']==7]
six = no_nans[no_nans['label']==6]
five = no_nans[no_nans['label']==5]
four = no_nans[no_nans['label']==4]
three = no_nans[no_nans['label']==3]
two = no_nans[no_nans['label']==2]
one = no_nans[no_nans['label']==1]

In [None]:
LIST = [one, two, three, four, five, six, seven, eight, nine, ten]

In [None]:
for entry in LIST:
    print(entry.shape)

In [None]:
ten = ten.sample(n = 6000, replace = False)
nine = nine.sample(n = 6000, replace = False)
eight = eight.sample(n = 6000, replace = False)
seven = seven.sample(n = 6000, replace = False)
six = six.sample(n = 6000, replace = False)
five = five.sample(n = 6000, replace = False)
four = four.sample(n = 6000, replace = False)
three = three.sample(n = 6000, replace = False)
two = two.sample(n = 6000, replace = False)
one = one.sample(n = 6000, replace = False)
LIST = [one, two, three, four, five, six, seven, eight, nine, ten]

In [None]:
for entry in LIST:
    print(entry.shape)

In [None]:
final = pd.concat(objs = LIST, ignore_index=True)
final

In [None]:
final.memory_usage(deep=True).sum()

In [None]:
train_df.memory_usage(deep=True).sum()

In [None]:
final.shape[0]

In [None]:
final.to_csv("Yahoo-Answers-Topic-Classification-Dataset/dataset/yahoo_answers_csv/train_sixty_thousand.csv", index=False,
              header = False)

In [None]:
final2 = pd.read_csv("Yahoo-Answers-Topic-Classification-Dataset/dataset/yahoo_answers_csv/train_sixty_thousand.csv")

In [None]:
final2['label'][0] + 3