In [13]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.colors as mcolors
from scipy.stats import f_oneway
import re
sns.set()

# Loading Essay Score Dataset & perfom NLP Data Preprocessing

In [14]:
data_path = "data/train.csv"
df = pd.read_csv(data_path)

data_path = "data/test.csv"
df_test = pd.read_csv(data_path)

In [15]:
X = df.drop(columns=["score","essay_id"])
y = df["score"]
test = df_test.drop(columns=["essay_id"])


## NLP Preprocessing

In [16]:
def removeHTML(x):
    html=re.compile(r'<.*?>')
    return html.sub(r'',x)


cList = {#"dont" : "do not", "doesnt" : "does not", "thats" : "that is"
    "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have", "'cause": "because", "could've": "could have",
    "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
    "he'd": "he would",  ## --> he had or he would
    "he'd've": "he would have","he'll": "he will", "he'll've": "he will have", "he's": "he is", 
    "how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how is",
    "I'd": "I would",   ## --> I had or I would
    "I'd've": "I would have","I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have","isn't": "is not",
    "it'd": "it had",   ## --> It had or It would
    "it'd've": "it would have","it'll": "it will","it'll've": "it will have","it's": "it is",
    "let's": "let us","ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not","mightn't've": "might not have",
    "must've": "must have","mustn't": "must not","mustn't've": "must not have",
    "needn't": "need not","needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not","oughtn't've": "ought not have",
    "shan't": "shall not","sha'n't": "shall not","shan't've": "shall not have",
    "she'd": "she would",   ## --> It had or It would
    "she'd've": "she would have","she'll": "she will","she'll've": "she will have","she's": "she is",
    "should've": "should have","shouldn't": "should not","shouldn't've": "should not have",
    "so've": "so have","so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have","that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have","there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have","they'll": "they will","they'll've": "they will have","they're": "they are","they've": "they have",
    "to've": "to have","wasn't": "was not","weren't": "were not",
    "we'd": "we had",
    "we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have",
    "what'll": "what will","what'll've": "what will have","what're": "what are","what's": "what is","what've": "what have",
    "when's": "when is","when've": "when have",
    "where'd": "where did","where's": "where is","where've": "where have",
    "who'll": "who will","who'll've": "who will have","who's": "who is","who've": "who have","why's": "why is","why've": "why have",
    "will've": "will have","won't": "will not","won't've": "will not have",
    "would've": "would have","wouldn't": "would not","wouldn't've": "would not have",
    "y'all": "you all","y'alls": "you alls","y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
    "y'all've": "you all have","you'd": "you had","you'd've": "you would have","you'll": "you you will","you'll've": "you you will have",
    "you're": "you are",  "you've": "you have"
}
c_re = re.compile('(%s)' % '|'.join(cList.keys()))

def expandContractions(text):
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text)

def dataPreprocessing(x):
    # Convert words to lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete strings starting with @
    x = re.sub("@\w+", '',x)
    # Delete Numbers
    #x = re.sub("'\d+", '',x)
    #x = re.sub("\d+", '',x)
    # Delete URL
    x = re.sub("http\w+", '',x)
    # Remove \xa0
    x = x.replace(u'\xa0',' ')
    # Replace consecutive empty spaces with a single space character
    x = re.sub(r"\s+", " ", x)
    x = expandContractions(x)
    # Replace consecutive commas and periods with one comma and period character
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
#     x = re.sub(r'[^\w\s.,;:""''?!]', '', x)
  #replace \'s with 's
    #print(re.findall("\\'s", x))
    #x = re.sub(r"\[\]'s", "'s", x)
    # Remove empty characters at the beginning and end
    x = x.strip()
    return x

In [17]:
X["full_text"] = X.apply(lambda x: dataPreprocessing(x["full_text"]),axis=1)
test["full_text"] = test.apply(lambda x: dataPreprocessing(x["full_text"]),axis=1)


In [18]:
len(X)

17307

## Optional: Test various Tokenizers

This section presents an analysis of state-of-the-art tokenizers. The objective is to find a tokenizer which has affordable computational cost and produces manageable seq_len essays.

In [112]:
exp_config.original_checkpoint_path

'/kaggle/input/llama-3/pytorch/8b/1'

In [2]:
from llama.tokenizer import  Tokenizer


ModuleNotFoundError: No module named 'llama'

In [7]:
#Hugghing face Autotokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("C:/Users/kevin/.cache/kagglehub/models/metaresearch/llama-3/pyTorch/8b/1")



tiktoken_essay_autotoken = pd.Series(
    X["full_text"].apply(lambda x: tokenizer.encode(x))
)

OSError: C:/Users/kevin/.cache/kagglehub/models/metaresearch/llama-3/pyTorch/8b/1 does not appear to have a file named config.json. Checkout 'https://huggingface.co/C:/Users/kevin/.cache/kagglehub/models/metaresearch/llama-3/pyTorch/8b/1/tree/None' for available files.

In [99]:
#Tiktokenizer
import tiktoken
encoder = tiktoken.encoding_for_model("gpt-4o")
text = "Hi my name is Kevin. What is yours?"


tiktoken_essay = pd.Series(
    X["full_text"].apply(lambda x: encoder.encode(x))
)

In [104]:
tiktoken_essay.apply(lambda x : len(x)).max()

1812

In [94]:
decodes

'Hi my name is Kevin. What is yours?'

In [92]:
tokens_integer

[12194, 922, 1308, 382, 29729, 13, 4614, 382, 24648, 30]

## Create Train-Validation Pandas Dataset Split

In [46]:

seed = 10
generator = np.random.RandomState(seed)
df_size = len(X)
train_proportion = 0.8
validation_proportion = 0.2
train_size = int(df_size * train_proportion)
validation_size = df_size - train_size 
arr_train_idxs = generator.choice(np.arange(1,len(X)),size=[train_size,],replace=False)

train = X.iloc[arr_train_idxs].reset_index(drop=True)
train_labels = y.iloc[arr_train_idxs].reset_index(drop=True)
validation = X[~X.index.isin(arr_train_idxs)].reset_index(drop=True)
validation_labels = y.iloc[~y.index.isin(arr_train_idxs)].reset_index(drop=True)

# Fine Tuning Llama-3 using PERFT QLORA

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

  from .autonotebook import tqdm as notebook_tqdm


==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA GeForce RTX 4060 Laptop GPU. Max memory: 7.996 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Unsloth: unsloth/llama-3-8b-bnb-4bit has no tokenizer.model file.
Just informing you about this - this is not a critical error.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Pytorch Datasets & Dataloaders

In [29]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

In [30]:
def causal_mask(size):
    mask = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int)
    return mask == 0

In [50]:
class EssayScoringDataset(Dataset):
    def __init__(self, df,y, tokenizer, seq_len):
        super().__init__()
        self.df = df
        self.y = y
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.bos_token = self.tokenizer.bos_token_id
        self.eos_token = self.tokenizer.eos_token_id
        self.pad_token = self.tokenizer.pad_token_id

    def __getitem__(self,idx):
        essay = self.df.iloc[idx]["full_text"]
        score = self.y[idx]

        #tokenize essay text into tokens
        dec_input_tokens = self.tokenizer.encode(essay)

        # Add sos, eos and padding to each sentence
        #enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2  # We will add <s> and </s>
        # We will only add <s>, and </s> 
        #dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 2

        # Make sure the number of padding tokens is not negative. If it is, the sentence is too long
        #if  dec_num_padding_tokens < 0:
        #    raise ValueError("Sentence is too long")

        # Add only <s> token
        decoder_input = torch.cat(
                [
                    torch.tensor(dec_input_tokens, dtype=torch.int64),
                    torch.tensor([self.eos_token],dtype=torch.int64)
                ],
                dim=0,
        )

        #assert decoder_input.size(0) == self.seq_len

        decoder_mask = torch.cat(
                [
                    (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)), # (1, seq_len) & (1, seq_len, seq_len),
                    torch.tensor([1],dtype=torch.int64)
                ],
                dim=0,
        )



        # label : Add only </s> token
        label = torch.tensor([score],dtype=torch.int64)

        
        return {
            "decoder_input" : decoder_input,
            "decoder_mask" : decoder_mask,
            "label" : label
        }

    def __len__(self):
        return len(self.df)

In [51]:
train_pytorch_dataset = EssayScoringDataset(train,train_labels,tokenizer,max_seq_length)
validation_pytorch_dataset = EssayScoringDataset(validation,validation_labels,tokenizer,max_seq_length)

In [52]:
train_pytorch_dataset[0]


No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



RuntimeError: Tensors must have same number of dimensions: got 3 and 1

In [24]:
enc = tokenizer.encode("this is a text sample")
enc

[128000, 576, 374, 264, 1495, 6205]

In [4]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Ll

# Adapt Essay Scoring DataFrame to be compatible with Pytorch

In [None]:
%pip install transformers

In [None]:
from dataclasses import dataclass
from torch.utils.data import DataLoader, Dataset
from typing import Optional, Union, Any
from transformers import DataCollatorWithPadding


from transformers import AutoTokenizer

def define_tokenizer(cfg):
    """
    Let's use basic AutoTokenizer
    """

    tokenizer = AutoTokenizer.from_pretrained(cfg.architecture["backbone"], trust_remote_code=True)

    # Make sure that we have a pad token and and eos token that will be used for pooling
    if tokenizer.pad_token is None:
        print("Setting new pad token")
        # pad token is missig        
        tokenizer.pad_token="<|reserved_special_token_0|>"
        
    if tokenizer.eos_token is None:
        print("Setting new eos_token token")
        # eos_token token is missig
        tokenizer.eos_token="<|reserved_special_token_1|>"
    
    # Make sure that padding is always "right"
    if tokenizer.padding_side != "right":
        print(f"Changing padding side from {tokenizer.padding_side} to 'right'")
        tokenizer.padding_side = "right"
    return tokenizer