<a href="https://colab.research.google.com/github/joliuliu44/SDDAJ-Project/blob/main/Llama2_eBay_Sonia_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the libraries this notebook imports. Only `datasets` is pinned to a
# version; every other package resolves to whatever release is current at
# install time, so results may differ between runs of this cell.
!pip install torch
!pip install accelerate
!pip install bitsandbytes
!pip install datasets==2.13.1
!pip install transformers
!pip install peft
!pip install trl
!pip install scipy
!pip install seqeval
!pip install evaluate



In [2]:
import pandas as pd
import json
import torch
from torch import nn
import transformers
import argparse
import bitsandbytes as bnb
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset,Dataset
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score
import evaluate
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import StoppingCriteria, StoppingCriteriaList
from torch.utils.data import DataLoader
import re
#https://blog.ovhcloud.com/fine-tuning-llama-2-models-using-a-single-gpu-qlora-and-ai-notebooks/



In [3]:
# Mount Google Drive at /content/gdrive (Colab only); the competition files
# read further down live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
# Load the tagged training titles (tab-separated, one row per token).
# quoting=3 is csv.QUOTE_NONE, so quote characters in titles stay literal text.
tagged = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/eBay_Competition/Train_Tagged_Titles.tsv', sep='\t',quoting=3)
tagged

Unnamed: 0,Record Number,Title,Token,Tag
0,1,Supreme Nike SB Dunk High By any Means Red US1...,Supreme,Modell
1,1,Supreme Nike SB Dunk High By any Means Red US1...,Nike,Marke
2,1,Supreme Nike SB Dunk High By any Means Red US1...,SB,Produktlinie
3,1,Supreme Nike SB Dunk High By any Means Red US1...,Dunk,
4,1,Supreme Nike SB Dunk High By any Means Red US1...,High,Schuhschaft-Typ
...,...,...,...,...
55178,5000,Herren Trekking Schuhe Outdoor Sneaker Sportsc...,Sportschuhe,Produktart
55179,5000,Herren Trekking Schuhe Outdoor Sneaker Sportsc...,Wanderschuh,
55180,5000,Herren Trekking Schuhe Outdoor Sneaker Sportsc...,Big,No Tag
55181,5000,Herren Trekking Schuhe Outdoor Sneaker Sportsc...,Size,No Tag


In [5]:
# Re-read the raw file: modified_dataframe() below edits its argument in place,
# so starting from a freshly loaded frame keeps this cell safe to re-run.
tagged = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/eBay_Competition/Train_Tagged_Titles.tsv', sep='\t',quoting=3)
def modified_dataframe(df):
    """Merge untagged continuation rows into the tagged row that precedes them.

    A row whose ``Tag`` is missing (NaN) continues the entity started by the
    nearest earlier row that does carry a tag. Its ``Token`` is appended,
    separated by a space, to that row's ``Token`` and the continuation row is
    dropped, so every remaining row holds one complete entity.

    Rows are matched by position, so the frame does not need a default
    ``0..n-1`` index on entry. Index labels are assumed to be unique.

    The frame is modified in place and also returned, with its index reset to
    ``0..n-1``.

    :param df: DataFrame with at least ``Token`` and ``Tag`` columns
    :return: the same DataFrame object, merged and re-indexed
    :raises ValueError: if the first row has no tag, because there is no
        earlier row to merge its token into
    """
    tag_missing = df['Tag'].isna().to_numpy()

    if tag_missing.any():
        if tag_missing[0]:
            raise ValueError(
                "The first row has no Tag, so there is no preceding row to merge its Token into."
            )

        tokens = df['Token'].tolist()
        anchor = 0  # position of the most recent row that carries a tag
        for position, is_continuation in enumerate(tag_missing):
            if is_continuation:
                tokens[anchor] += ' ' + tokens[position]
            else:
                anchor = position

        df['Token'] = tokens
        # Drop every continuation row in one call instead of tracking labels in a list.
        df.drop(df.index[tag_missing], inplace=True)

    df.reset_index(drop=True, inplace=True)
    return df

# Collapse multi-token entities in the freshly loaded `tagged` frame.
# modified_dataframe() edits the frame in place and returns it, so `modified`
# and `tagged` refer to the same object afterwards.
modified = modified_dataframe(tagged)

In [6]:
modified

Unnamed: 0,Record Number,Title,Token,Tag
0,1,Supreme Nike SB Dunk High By any Means Red US1...,Supreme,Modell
1,1,Supreme Nike SB Dunk High By any Means Red US1...,Nike,Marke
2,1,Supreme Nike SB Dunk High By any Means Red US1...,SB Dunk,Produktlinie
3,1,Supreme Nike SB Dunk High By any Means Red US1...,High,Schuhschaft-Typ
4,1,Supreme Nike SB Dunk High By any Means Red US1...,By any Means,Modell
...,...,...,...,...
44750,5000,Herren Trekking Schuhe Outdoor Sneaker Sportsc...,Sneaker,Stil
44751,5000,Herren Trekking Schuhe Outdoor Sneaker Sportsc...,Sportschuhe Wanderschuh,Produktart
44752,5000,Herren Trekking Schuhe Outdoor Sneaker Sportsc...,Big,No Tag
44753,5000,Herren Trekking Schuhe Outdoor Sneaker Sportsc...,Size,No Tag


In [7]:
def load_model(model_name, bnb_config):
    """Load a quantized causal language model and its tokenizer.

    The Hugging Face access token is read from the ``HF_TOKEN`` (or
    ``HUGGING_FACE_HUB_TOKEN``) environment variable rather than being stored
    in the notebook. When neither variable is set, ``use_auth_token=True`` is
    passed so the library falls back to the token saved by
    ``huggingface-cli login``.

    :param model_name: model identifier passed to ``from_pretrained``
    :param bnb_config: quantization settings passed as ``quantization_config``
    :return: ``(model, tokenizer)``
    """
    n_gpus = torch.cuda.device_count()
    max_memory = f'{40960}MB'  # per-GPU memory cap given to the device map

    # Secrets must not live in source control.
    # NOTE(review): the token that used to be hard-coded here has been
    # published with the notebook and should be revoked.
    hf_auth = (
        os.environ.get("HF_TOKEN")
        or os.environ.get("HUGGING_FACE_HUB_TOKEN")
        or True
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        use_auth_token=hf_auth,
        quantization_config=bnb_config,
        device_map="auto", # dispatch the model efficiently on the available resources
        max_memory = {i: max_memory for i in range(n_gpus)},
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_auth, add_eos_token=True,add_bos_token=False)

    # Needed for LLaMA tokenizer: reuse "<s>" as the padding token.
    tokenizer.pad_token ="<s>"

    return model, tokenizer

In [8]:
from datasets import load_dataset

In [9]:
def create_prompt_formats(sample):
    """
    Assemble the full prompt for one sample and store it under the 'text' key.

    The sections (intro blurb, instruction, optional input, response, end
    marker) are joined with a blank line between them. The input section is
    omitted when the sample's 'context' is empty.

    :param sample: Sample dictionary with 'instruction', 'context' and 'response' keys
    :return: the same dictionary, with the assembled prompt added as 'text'
    """
    sections = [
        "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
        f"### Instruction:\n{sample['instruction']}",
    ]

    # The input section only appears when there is context to show.
    if sample["context"]:
        sections.append(f"Input:\n{sample['context']}")

    sections.append(f"### Response:\n{sample['response']}")
    sections.append("### End")

    sample["text"] = "\n\n".join(sections)
    return sample

In [10]:
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    """Return the model's maximum sequence length, falling back to 1024.

    The model config is probed for "n_positions", "max_position_embeddings"
    and "seq_length", in that order; the first truthy value wins.

    :param model: object exposing a ``config`` attribute
    :return: the length found in the config, or 1024 when none is set
    """
    config = model.config
    for attribute in ("n_positions", "max_position_embeddings", "seq_length"):
        found = getattr(config, attribute, None)
        if found:
            print(f"Found max lenth: {found}")
            return found

    fallback = 1024
    print(f"Using default max length: {fallback}")
    return fallback


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenize the "text" entries of a batch, truncating to ``max_length``.

    :param batch: mapping with a "text" entry
    :param tokenizer: callable tokenizer
    :param max_length: maximum length passed to the tokenizer
    :return: whatever the tokenizer returns for the batch
    """
    texts = batch["text"]
    tokenizer_options = {"max_length": max_length, "truncation": True}
    return tokenizer(texts, **tokenizer_options)


# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def _tokenize_filter_shuffle(dataset, tokenizer, max_length, seed, drop_columns):
    """Tokenize the "text" column, drop samples that reach ``max_length`` and shuffle.

    Only the entries of ``drop_columns`` that actually exist are removed. This
    matters for ``__index_level_0__``, which is only present when the dataset
    was built from a DataFrame with a non-default index.
    """
    present = dataset.column_names
    if isinstance(present, dict):
        # A dict maps split name -> column list; pool the columns of all splits.
        present = {column for columns in present.values() for column in columns}
    columns_to_remove = [column for column in drop_columns if column in present]

    tokenize = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(tokenize, batched=True, remove_columns=columns_to_remove)

    # Tokenization truncates at max_length, so a sample of exactly max_length
    # tokens may have been cut off; the strict "<" removes those too.
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    return dataset.shuffle(seed=seed)


def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: str):
    """Tokenize a dataset that already has a "text" column so it is ready for training.

    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    :param seed: seed used to shuffle the dataset
    :param dataset: dataset object providing ``map``, ``filter`` and ``shuffle``
        (despite the ``str`` annotation)
    :return: the tokenized, filtered and shuffled dataset
    """
    print("Preprocessing dataset...")
    return _tokenize_filter_shuffle(
        dataset,
        tokenizer,
        max_length,
        seed,
        drop_columns=["text", "__index_level_0__"],
    )


def preprocess_dataset_dolly(tokenizer: AutoTokenizer, max_length: int, seed, dataset: str):
    """Format each sample into a prompt, then tokenize so it is ready for training.

    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    :param seed: seed used to shuffle the dataset
    :param dataset: dataset object with 'instruction', 'context' and 'response'
        fields (despite the ``str`` annotation)
    :return: the tokenized, filtered and shuffled dataset
    """
    print("Preprocessing dataset...")

    # Add the assembled prompt to each sample as a "text" field.
    dataset = dataset.map(create_prompt_formats)

    return _tokenize_filter_shuffle(
        dataset,
        tokenizer,
        max_length,
        seed,
        drop_columns=['instruction', 'context', 'text', 'response', 'category'],
    )

In [11]:
def create_bnb_config():
    """Return the bitsandbytes settings used to load the model in 4-bit.

    :return: a ``BitsAndBytesConfig`` with NF4 quantization, double
        quantization and bfloat16 as the compute dtype
    """
    quantization_options = {
        "load_in_4bit": True,
        "bnb_4bit_use_double_quant": True,  # quantizing -> qlora
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    }
    return BitsAndBytesConfig(**quantization_options)

In [12]:
def create_peft_config(modules):
    """
    Build the LoRA (parameter-efficient fine-tuning) configuration for the model.

    :param modules: Names of the modules to apply Lora to
    :return: the ``LoraConfig`` for causal language modelling
    """
    lora_options = {
        # Dimension of the updated matrices.
        # NOTE(review): the earlier comment said "reduced from 16 to 2", which
        # does not match the value 64 used here.
        "r": 64,
        "lora_alpha": 16,  # parameter for scaling
        "target_modules": modules,
        "lora_dropout": 0.1,  # dropout probability for layers
        "bias": "none",
        "task_type": "CAUSAL_LM",
    }
    return LoraConfig(**lora_options)

In [13]:
# SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py

def find_all_linear_names(model):
    """Collect the leaf names of the model's 4-bit linear layers, excluding 'lm_head'.

    :param model: object exposing ``named_modules()``
    :return: list of distinct final name components (e.g. 'q_proj') of every
        module that is a ``bnb.nn.Linear4bit``
    """
    linear_cls = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, layer in model.named_modules()
        if isinstance(layer, linear_cls)
    }

    leaf_names.discard('lm_head')  # needed for 16-bit
    return list(leaf_names)

In [14]:
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.

    :param model: object exposing ``named_parameters()``
    :param use_4bit: when True, the trainable count is halved before printing
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        # Floor division keeps the count an int. "/=" produced a float, which
        # the ",d" format spec below rejects with a ValueError.
        trainable_params //= 2

    # A model without parameters would otherwise divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {trainable_pct}"
    )

In [15]:
# Base checkpoint to fine-tune.
model_name = "meta-llama/Llama-2-7b-hf"

# 4-bit quantization settings used when loading the checkpoint.
bnb_config = create_bnb_config()

# Load the quantized model and its tokenizer; both are notebook-level globals
# used by the cells below.
model, tokenizer = load_model(model_name, bnb_config)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [16]:
"""lora_config = LoraConfig.from_pretrained('results/llama2/final_checkpoint')
model = get_peft_model(model, lora_config)"""

"lora_config = LoraConfig.from_pretrained('results/llama2/final_checkpoint')\nmodel = get_peft_model(model, lora_config)"

**Demonstration of the code using Dolly Dataset**

In [17]:
modified

Unnamed: 0,Record Number,Title,Token,Tag
0,1,Supreme Nike SB Dunk High By any Means Red US1...,Supreme,Modell
1,1,Supreme Nike SB Dunk High By any Means Red US1...,Nike,Marke
2,1,Supreme Nike SB Dunk High By any Means Red US1...,SB Dunk,Produktlinie
3,1,Supreme Nike SB Dunk High By any Means Red US1...,High,Schuhschaft-Typ
4,1,Supreme Nike SB Dunk High By any Means Red US1...,By any Means,Modell
...,...,...,...,...
44750,5000,Herren Trekking Schuhe Outdoor Sneaker Sportsc...,Sneaker,Stil
44751,5000,Herren Trekking Schuhe Outdoor Sneaker Sportsc...,Sportschuhe Wanderschuh,Produktart
44752,5000,Herren Trekking Schuhe Outdoor Sneaker Sportsc...,Big,No Tag
44753,5000,Herren Trekking Schuhe Outdoor Sneaker Sportsc...,Size,No Tag


In [18]:
sys_msg =  """Assistant is a expert JSON builder designed to assist with a named entity recognition task.
Assistant is able to respond to the User and use tools using JSON strings that contain "Aspect Name" and "Aspect Value" parameters. Aspect Name parameters can have the following:
Abteilung, Aktivität, Akzente, Anlass, Besonderheiten, Charakter, Charakter Familie, Dämpfungsgrad, Erscheinungsjahr, EU-Schuhgröße, Farbe, Futtermaterial, Gewebeart, Herstellernummer, Herstellungsland und -region, Innensohlenmaterial, Jahreszeit, Laufsohlenmaterial, Marke, Maßeinheit, Modell, Muster, Obermaterial, Produktart, Produktlinie, Schuhschaft-Typ, Schuhweite, Stil, Stollentyp, Thema, UK-Schuhgröße, US-Schuhgröße, Verschluss, and Zwischensohlen-Typ are the aspect names that it can have. In addition, it can have two other tags: "No Tag" and "Obscure"; however, this should not be shown in the response as it does not contain any meaningful data.
Following are descriptions of the aspect names and examples:
Abteilung (Department)
Gender and/or age grouping characterized by the category.
Examples: DAMEN, Damen, Damenschuhe, HERREN, Herren, Herren Damen, Herrenschuhe, Unisex, W, M
Aktivität (Performance/Activity )
Type of activity the product is best suited for.
Examples: Basketball, FITNESS, Fitness, Laufen, Running, Skate, Skater, Tennis, Trail, Trekking
Akzente (Accents)
Attributes that give the product its distinctive look.
Examples: Cut Out, Glitter, Glitzer, Logo, Nieten, Pailletten, Print, Prints, Spitze, Strass
Anlass (Occasion)
Occasion or celebration the product is affiliated with.
Examples: CASUAL, Freizeit, Freizeit Sport, Gym, Outdoor, Sport, Sport Freizeit, Sportliche
Besonderheiten (Features)
Secondary attributes or functions that are not essential to the product’s main function.
Examples: Atmungsaktiv, gefüttert, Leicht, Profil-Sohle, Profilsohle, Ultraleicht, Warm Gefütterte
Charakter (Character)
Recognized character that the product has on itself or on the packaging. Examples: Beauty And The Beast, Bruce Lee, Han Solo, Hello Kitty, Pepe Le Pew
Charakter Familie (Character Family)
Recognized character family that the product has on itself or on the packaging. Examples: DISNEY, Looney Tunes, Pokemon, Sanrio, STAR WARS, Star Wars
Dämpfungsgrad (Cushioning Level)
Level of shock absorption of the product.
Examples: Airsoft, Barefoot, Barfuß, cushion, Luftpolster, Luftpolster Airsoft, Luftpolstersohle
Erscheinungsjahr (Release Year)
Year the product was released by the manufacturer. Examples: 2020, 2018, 2007
EU-Schuhgröße (EU Shoe Size)
Size of the shoes, using European standard sizes. Examples: 36, 38, 39, 40, 41, 42
Farbe (Color)
Main color of the product itself and other prominent colors. This doesn't include the product's packaging.
Examples: beige, Black, black, Blau, Braun, braun, Grau, Grün, Neon
Futtermaterial (Lining Material)
Main material of the product's lining.
Examples: Fell, Fleece Fellfutter, Fur, Kunstfell, Kunstpelz, Lammfell, Textilfutter
Gewebeart (Fabric Type)
Type of fabric by construction, not the material constituents or fiber contents. Examples: canvas, Denim, Grob, Lack, Mesh, Netz, Optik, Strick
     Herstellernummer (Style Code)
Style Code (may also be called "MPN" or "Manufacturer Part Number") is a product identifier given by the manufacturer, can be the same as the model number or part number. Characterized by a combination of numbers, letters, and/or symbols.
Examples: 1339-14, 365208, CT8527-114, M7652C, ML574EAG, V94M
Herstellungsland und -region (Country/Region of Manufacture)
Geographic location where the product is manufactured.
Examples: Germany, DE, West Germany, DDR, Italien, Italy, Portugal, England, USA
Innensohlenmaterial (Insole Material)
Main material of the product's insole.
Examples: EVA-Sohle, Gel, Laufsohle, Lederfußbett, Luftkissen, Memory, Memosoft, OrthoLite
Jahreszeit (Season)
Time of year the product is intended to be worn, characterized by season name. Examples: Herbst, Herbst Winter, Sommer, Summer, Winter, Winterschuhe
Laufsohlenmaterial (Outsole Material)
Main material of the product's outer sole.
Examples: Cupsole, Gummi, Gummisohl, Gummisohle, Gummschalensohle
Marke (Brand)
Name of the brand, designer, or artist that produces the product. This may be the same or different from the manufacturer.
Examples: Adidas, adidas, Asics, Converse, New Balance, NIKE, Nike, Puma
Maßeinheit
(Unit of Measure)
Units of measure, such as a length in inches/cm, a weight (pounds, grams, kg, etc.), or other measurements. Note that the German language uses a comma "," where English uses a decimal point ".", and conversely uses a period "." instead of a comma "," to separate thousands, for example 1,234.56 in English is 1.234,56 in German.
Examples: 26 cm, 27,5 cm, 28 cm, 28,0 cm
Modell (Model)
Brand or manufacturer’s specific name used for the product.
Examples: 70, Retro, Smash, Smash v2, ST, ZX 8000, 1 Retro, Classic, III, Mexico 66, Plus, Quantum
Muster (Pattern)
Pattern on the product.
Examples: Camo, Camouflage, Graffiti, Leopard, Snake, Zebra
Obermaterial (Upper Material)
Main material of the product's upper component.
Examples: Echtleder, Knit, Leather, Leder, Stoff, Suede, Synthetik
Produktart (Type)
Specific type of product that is being sold in the product listing.
Examples: Sneaker, Freizeitschuhe, Halbschuhe, Laufschuhe, Sportschuh, Sportschuhe, Trainers
Produktlinie (Product Line)
Manufacturer collection or collaboration that the product belongs to.
Examples: Air Force 1, Air Jordan, Air Max, Chuck Taylor All Star, Classic, Flex, Gel, Yeezy

Schuhschaft-Typ (Shoe Shaft Style)
Distinct design appearance of the product characterized by height. Examples: Hi, High, High Top, low, Low Top, Low-Top, Mid
Schuhweite (Shoe Width)
Measured horizontal distance from side to side of the shoe. Often but not always a single capital letter.
Examples: B, D, G, G-Weite, H, K, WIDE
Stil (Style)
Distinct design appearance (shape) of the product.
Examples: Ballerina, Ballerinas, Keilabsatz, Sneaker, Sneakers
Stollentyp (Cleat Type)
Type of cleats.
Examples: Spikes, Nokken, Schraubstollen
Thema (Theme)
Type of visual style or design subject of the product, but NOT the shape of the product: see "Stil (Style)".
Examples: 90er, Retro, Retro Vintage, Sportlich, Vintage
UK-Schuhgröße (UK Shoe Size)
Size of the shoes, using UK standard sizes. Note that the German language uses a comma "," where English uses a decimal point ".", for example the size UK 11,5 in German is UK 11.5 in English.
Examples: UK 10, UK 11, UK 11,5, UK 7
US-Schuhgröße (US Shoe Size)
Size of the shoes, using US standard sizes. Note that the German language uses a comma "," where English uses a decimal point ".", for example the size US 11,5 in German is US 11.5 in English.
Examples: US 10, US 11, US 11,5, US 12, US 8
Verschluss (Closure)
Type of closing mechanism the product uses.
Examples: Klett, Klettverschluss, Lace Up, Lace-Up, Reißverschluss, Schnür, Schnüren
Zwischensohlen-Typ (Midsole Type)
Type of supportive structure that is layered between the shoe's insole and outsole
Examples: Air, Cloudfoam, Croslite, Dämpfung, Federsohle, Foam, Memory Foam, Soft Foam

All of Assistant's communication is performed using this JSON format.

Here are some previous conversations between the Assistant and User:

User*: The listing title was Supreme Nike SB Dunk High By any Means Red US10 EU44 Supreme Box Logo Air Force. \nOutput the aspect name with its aspect values for this listing title.\n
\n###Assistant*: ```json
[{"Aspect Name": "Marke",
 "Aspect Value": "Nike"},
 {"Aspect Name": "Produktlinie",
 "Aspect Value": "SB Dunk"},
 {"Aspect Name": "Schuhschaft-Typ",
 "Aspect Value": "High"},
{"Aspect Name": "Modell",
 "Aspect Value": "By any Means"},
{"Aspect Name": "Farbe",
 "Aspect Value": "Red"},
{"Aspect Name": "US-Schuhgröße",
 "Aspect Value": "US10"},
{"Aspect Name": "EU-Schuhgröße",
 "Aspect Value": "EU44"},
 {"Aspect Name": "No Tag",
 "Aspect Value": "Supreme"},
 {"Aspect Name": "No Tag",
 "Aspect Value": "Box"},
 {"Aspect Name": "Akzente",
 "Aspect Value": "Logo"},
  {"Aspect Name": "Produktlinie",
 "Aspect Value": "Air Force"}]
```
Here is the latest conversation between Assistant and User."""


In [19]:
def prep_training_data(df):
    """Build one ``[prompt, completion]`` pair per listing.

    Consecutive rows that share a ``Record Number`` form one listing. The
    prompt is the global ``sys_msg`` followed by the listing title (taken from
    the first row of the run). The completion is a JSON-style list with one
    ``{"Aspect Name": tag, "Aspect Value": token}`` object per row, closed by
    a code fence.

    Aspect names and values are serialized with ``ensure_ascii=False`` so
    names such as ``US-Schuhgröße`` stay readable text, matching how they are
    spelled in ``sys_msg``, instead of becoming ``\\uXXXX`` escapes.

    :param df: DataFrame with 'Record Number', 'Title', 'Token' and 'Tag' columns
    :return: list of ``[prompt, completion]`` pairs, in row order
    """
    grouped = []  # one [prompt, serialized aspects] entry per run of equal record numbers
    last_record = None

    rows = zip(df['Record Number'], df['Title'], df['Token'], df['Tag'])
    for record_number, title, token, tag in rows:
        if not grouped or record_number != last_record:
            prompt = f"{sys_msg}\nUser: The listing title was {title}. \nOutput the aspect name with its aspect values for this listing title."
            grouped.append([prompt, []])
            last_record = record_number

        aspect = {"Aspect Name": tag, "Aspect Value": token}
        grouped[-1][1].append(json.dumps(aspect, ensure_ascii=False))

    return [
        [prompt, "\n" + "[" + ",\n".join(aspects) + "]```"]
        for prompt, aspects in grouped
    ]

# Turn the merged token/tag frame into a list of [prompt, completion] pairs.
torch_data = prep_training_data(modified)


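**Note on text encoding: `json.dumps` escapes non-ASCII characters by default, so aspect names such as `US-Schuhgröße` can end up as `\uXXXX` escape sequences in the completions. The JSON serialization needs to be changed so these characters are kept as UTF-8 text.**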

In [20]:
modified[modified['Record Number']==1]['Tag']

0              Modell
1               Marke
2        Produktlinie
3     Schuhschaft-Typ
4              Modell
5               Farbe
6       US-Schuhgröße
7       EU-Schuhgröße
8              No Tag
9              No Tag
10            Akzente
11       Produktlinie
Name: Tag, dtype: object

In [21]:
print(torch_data[0])
#Later, note we should encode the NLP forms more properly: US-Schuhgröße, EU-Schuhgröße
#https://towardsdatascience.com/character-encoding-in-nlp-the-role-of-ascii-and-unicode-9349b4fe3cee

['Assistant is a expert JSON builder designed to assist with a named entity recognition task.\nAssistant is able to respond to the User and use tools using JSON strings that contain "Aspect Name" and "Aspect Value" parameters. Aspect Name parameters can have the following:\nAbteilung, Aktivität, Akzente, Anlass, Besonderheiten, Charakter, Charakter Familie, Dämpfungsgrad, Erscheinungsjahr, EU-Schuhgröße, Farbe, Futtermaterial, Gewebeart, Herstellernummer, Herstellungsland und -region, Innensohlenmaterial, Jahreszeit, Laufsohlenmaterial, Marke, Maßeinheit, Modell, Muster, Obermaterial, Produktart, Produktlinie, Schuhschaft-Typ, Schuhweite, Stil, Stollentyp, Thema, UK-Schuhgröße, US-Schuhgröße, Verschluss, and Zwischensohlen-Typ are the aspect names that it can have. In addition, it can have two other tags: "No Tag" and "Obscure"; however, this should not be shown in the response as it does not contain any meaningful data.\nFollowing are descriptions of the aspect names and examples:\nAb

In [22]:
# Put the [prompt, completion] pairs into a two-column DataFrame.
torch_pd=pd.DataFrame(torch_data).rename(columns={0:'prompt', 1:'completion'})

In [23]:
torch_pd

Unnamed: 0,prompt,completion
0,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Modell"", ""Aspect Value"": ""..."
1,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""N..."
2,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""No Tag"", ""Aspect Value"": ""..."
3,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""N..."
4,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""N..."
...,...,...
4995,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""A..."
4996,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""L..."
4997,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""A..."
4998,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""H..."


**Train-Test split of 80/20**

In [24]:
# Shuffle with a fixed seed so the split is reproducible, then split by
# fraction instead of hard-coded row positions (4000/1000 for 5000 examples).
# Note: this cell reassigns `torch_pd`, so re-running it without re-running the
# previous cell shuffles an already shuffled frame.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

torch_pd = torch_pd.sample(frac=1, random_state=SPLIT_SEED)
n_train = int(len(torch_pd) * TRAIN_FRACTION)
torch_training = torch_pd.iloc[:n_train]
torch_testing = torch_pd.iloc[n_train:]

In [25]:
torch_pd

Unnamed: 0,prompt,completion
1416,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Abteilung"", ""Aspect Value""..."
1195,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""N..."
4153,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""L..."
220,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""A..."
3230,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Modell"", ""Aspect Value"": ""..."
...,...,...
2008,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""A..."
599,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Abteilung"", ""Aspect Value""..."
2742,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""N..."
2369,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""No Tag"", ""Aspect Value"": ""..."


In [26]:
torch_training

Unnamed: 0,prompt,completion
1416,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Abteilung"", ""Aspect Value""..."
1195,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""N..."
4153,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""L..."
220,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""A..."
3230,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Modell"", ""Aspect Value"": ""..."
...,...,...
1827,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""C..."
4104,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Abteilung"", ""Aspect Value""..."
2506,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""B..."
1378,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""a..."


In [27]:
torch_testing

Unnamed: 0,prompt,completion
299,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""S..."
1720,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""a..."
4975,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""T..."
908,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""L..."
3449,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""N..."
...,...,...
2008,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""A..."
599,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Abteilung"", ""Aspect Value""..."
2742,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""N..."
2369,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""No Tag"", ""Aspect Value"": ""..."


# Training Model

In [28]:
# Convert the pandas frames to Hugging Face datasets. The shuffled pandas index
# is carried along as an extra `__index_level_0__` column (see the feature
# lists printed below).
hf_dataset_training = Dataset.from_pandas(torch_training)
hf_dataset_testing = Dataset.from_pandas(torch_testing)
hf_dataset=Dataset.from_pandas(torch_pd)

In [29]:
hf_dataset

Dataset({
    features: ['prompt', 'completion', '__index_level_0__'],
    num_rows: 5000
})

In [30]:
hf_dataset_training

Dataset({
    features: ['prompt', 'completion', '__index_level_0__'],
    num_rows: 4000
})

In [31]:
hf_dataset_testing

Dataset({
    features: ['prompt', 'completion', '__index_level_0__'],
    num_rows: 1000
})

**Code below shows how we should use seqeval for F1 score**

In [32]:
# Load the seqeval metric and score two toy tag sequences against references.
seqeval = evaluate.load('seqeval')
predictions = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] #what we predict
references = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]#what is in the prompt
results = seqeval.compute(predictions=predictions, references=references)

In [33]:
# Split each example at the response template (supervised fine-tuning).
# Without this split the loss would look deceptively small, because the model would
# also be scored on the instruction text, which is the same for every example and
# therefore easy to learn.

def formatting_prompts_func(example):
    """Render each prompt/completion pair of a batch as one training text.

    :param example: batch mapping with parallel 'prompt' and 'completion' lists
    :return: list with one "### Question: ... ### Assistant: ..." string per pair
    """
    def render(idx):
        # One training text: question, assistant marker, completion, closing fence.
        return f"### Question: {example['prompt'][idx]}\n### Assistant: {example['completion'][idx]}   \n\n\n```\n\n\n"

    return [render(idx) for idx in range(len(example['prompt']))]

# Marker that separates the prompt from the completion in each training text.
# The leading "\n" gives the marker the same context it has inside the texts.
response_template_with_context = "\n### Assistant:"
response_template_ids = tokenizer.encode(response_template_with_context, add_special_tokens=False)[2:]  # Now we have it like in the dataset texts: `[2277, 29937, 4007, 22137, 29901]`
# The collator receives the marker's token ids; presumably it uses them to
# restrict the loss to the completion part (confirm against the TRL docs).
data_collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)

**Since our project will be graded on the F1 score, we need a custom loss function.**

https://huggingface.co/spaces/evaluate-metric/seqeval/resolve/cb29a68788f8d2ccf3f0b7050e3ecbf5202cf786/README.md

In [34]:
def train(model, tokenizer, training_dataset, testing_dataset,  output_dir):
    """Fine-tune ``model`` with LoRA adapters using TRL's SFTTrainer and save them.

    Relies on the notebook-level ``formatting_prompts_func`` and
    ``data_collator`` in addition to its arguments.

    :param model: quantized base model to attach the LoRA adapters to
    :param tokenizer: tokenizer handed to the trainer, so training texts are
        tokenized with the same settings used to build ``data_collator``
    :param training_dataset: dataset passed to the trainer as ``train_dataset``
    :param testing_dataset: currently unused, because evaluation is disabled
    :param output_dir: directory the trained adapter weights are saved to
        (trainer checkpoints and metrics go to the fixed "outputs" directory)
    """
    # Apply preprocessing to the model to prepare it by
    # 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
    model.gradient_checkpointing_enable()

    # 2 - Using the prepare_model_for_kbit_training method from PEFT
    model = prepare_model_for_kbit_training(model)

    # Get lora module names
    # TODO: change it to only allow lora to apply to attention layer instead of all linear layers
    modules = find_all_linear_names(model)

    # Create PEFT config for these modules and wrap the model to PEFT
    peft_config = create_peft_config(modules)
    model = get_peft_model(model, peft_config)

    # Print information about the percentage of trainable parameters
    print_trainable_parameters(model)
    per_device_train_batch_size = 1

    # Training parameters (in addition to Trainer class capabilities, SFTTrainer
    # also provides parameter-efficient (peft) and packing optimizations).
    trainer = SFTTrainer(
        model=model,
        # Pass the caller's tokenizer explicitly. Previously the argument was
        # ignored, leaving the trainer to pick its own tokenizer.
        # NOTE(review): newer TRL releases may name this argument differently;
        # confirm against the installed version.
        tokenizer=tokenizer,
        train_dataset=training_dataset,
        # Evaluation is disabled, so testing_dataset is not passed here.
        formatting_func=formatting_prompts_func,
        data_collator=data_collator,
        max_seq_length=4096,
        args=TrainingArguments(
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=1,
            warmup_steps=2,
            max_steps=20,
            learning_rate=2e-4,
            per_device_eval_batch_size=4,
            fp16=False,
            bf16=False,
            logging_steps=1,
            output_dir="outputs",
            optim="paged_adamw_8bit",
        ),
    )

    model.config.use_cache = False  # re-enable for inference to speed up predictions for similar inputs

    ### SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
    # Verifying the datatypes before training: element count per dtype and its share.
    dtypes = {}
    for _, p in model.named_parameters():
        dtype = p.dtype
        if dtype not in dtypes:
            dtypes[dtype] = 0
        dtypes[dtype] += p.numel()
    total = 0
    for k, v in dtypes.items():
        total += v
    for k, v in dtypes.items():
        print(k, v, v/total)

    do_train = True
    # Launch training
    print("Training...")

    if do_train:
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
        print(metrics)

    # Saving model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)


# Directory for the final LoRA adapter weights.
output_dir = "results/llama2/final_checkpoint"
# Fine-tune on the training split and save the adapters to output_dir.
train(model, tokenizer, hf_dataset_training, hf_dataset_testing, output_dir)

all params: 3,660,320,768 || trainable params: 159,907,840 || trainable%: 4.368683788535114


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

torch.float32 422318080 0.11537734170515189
torch.uint8 3238002688 0.8846226582948481
Training...


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,0.4485
2,0.4299
3,0.3928
4,0.2178
5,0.1324
6,0.2913
7,0.299
8,0.2256
9,0.2646
10,0.1554


***** train metrics *****
  epoch                    =       0.01
  total_flos               =  2248855GF
  train_loss               =     0.2207
  train_runtime            = 0:04:11.19
  train_samples_per_second =       0.08
  train_steps_per_second   =       0.08
{'train_runtime': 251.1916, 'train_samples_per_second': 0.08, 'train_steps_per_second': 0.08, 'total_flos': 2414690664529920.0, 'train_loss': 0.22070885486900807, 'epoch': 0.01}
Saving last checkpoint of the model...


In [35]:
# Training-loss spikes probably occur where there are annotation errors.
# To improve further, the samples that cause such training-loss spikes should be skipped.
# Check a mid-training checkpoint.

In [36]:
torch_testing

Unnamed: 0,prompt,completion
299,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""S..."
1720,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""a..."
4975,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""T..."
908,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""L..."
3449,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""N..."
...,...,...
2008,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""A..."
599,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Abteilung"", ""Aspect Value""..."
2742,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""Marke"", ""Aspect Value"": ""N..."
2369,Assistant is a expert JSON builder designed to...,"\n[{""Aspect Name"": ""No Tag"", ""Aspect Value"": ""..."


In [37]:
torch_testing['prompt'].iloc[0]

'Assistant is a expert JSON builder designed to assist with a named entity recognition task.\nAssistant is able to respond to the User and use tools using JSON strings that contain "Aspect Name" and "Aspect Value" parameters. Aspect Name parameters can have the following:\nAbteilung, Aktivität, Akzente, Anlass, Besonderheiten, Charakter, Charakter Familie, Dämpfungsgrad, Erscheinungsjahr, EU-Schuhgröße, Farbe, Futtermaterial, Gewebeart, Herstellernummer, Herstellungsland und -region, Innensohlenmaterial, Jahreszeit, Laufsohlenmaterial, Marke, Maßeinheit, Modell, Muster, Obermaterial, Produktart, Produktlinie, Schuhschaft-Typ, Schuhweite, Stil, Stollentyp, Thema, UK-Schuhgröße, US-Schuhgröße, Verschluss, and Zwischensohlen-Typ are the aspect names that it can have. In addition, it can have two other tags: "No Tag" and "Obscure"; however, this should not be shown in the response as it does not contain any meaningful data.\nFollowing are descriptions of the aspect names and examples:\nAbt

In [38]:
torch_testing['completion'].iloc[0]

'\n[{"Aspect Name": "Marke", "Aspect Value": "Skechers"},\n{"Aspect Name": "Abteilung", "Aspect Value": "Men"},\n{"Aspect Name": "Herstellungsland und -region", "Aspect Value": "USA"},\n{"Aspect Name": "Modell", "Aspect Value": "PARTON WILCON"},\n{"Aspect Name": "Stil", "Aspect Value": "Sneakers"},\n{"Aspect Name": "Abteilung", "Aspect Value": "Herren"},\n{"Aspect Name": "Produktart", "Aspect Value": "Schuhe"},\n{"Aspect Name": "Farbe", "Aspect Value": "Schwarz"}]```'

In [39]:
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
          )
          (k_proj): Linear4bit(
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, o

In [40]:
example_prompt=torch_testing['prompt'].iloc[0]

In [41]:
7521 in tokenizer(torch_testing['completion'].iloc[0][7:])['input_ids']

False

In [42]:
print(tokenizer(torch_testing['completion'].iloc[0][7:]))

{'input_ids': [321, 312, 4408, 1115, 376, 7083, 446, 613, 376, 2887, 1103, 7865, 1115, 376, 29903, 446, 305, 414, 10758, 13, 6377, 2887, 1103, 4408, 1115, 376, 4920, 18958, 613, 376, 2887, 1103, 7865, 1115, 376, 28154, 10758, 13, 6377, 2887, 1103, 4408, 1115, 376, 18650, 6236, 3085, 1049, 563, 448, 12803, 613, 376, 2887, 1103, 7865, 1115, 376, 27019, 10758, 13, 6377, 2887, 1103, 4408, 1115, 376, 2111, 514, 613, 376, 2887, 1103, 7865, 1115, 376, 26092, 1164, 399, 6227, 6007, 10758, 13, 6377, 2887, 1103, 4408, 1115, 376, 855, 309, 613, 376, 2887, 1103, 7865, 1115, 376, 29903, 484, 21079, 10758, 13, 6377, 2887, 1103, 4408, 1115, 376, 4920, 18958, 613, 376, 2887, 1103, 7865, 1115, 376, 18650, 1267, 10758, 13, 6377, 2887, 1103, 4408, 1115, 376, 23665, 1193, 442, 613, 376, 2887, 1103, 7865, 1115, 376, 4504, 29884, 354, 10758, 13, 6377, 2887, 1103, 4408, 1115, 376, 29943, 23536, 613, 376, 2887, 1103, 7865, 1115, 376, 4504, 4495, 29920, 29908, 6525, 28956, 2], 'attention_mask': [1, 1, 1, 1, 1,

In [43]:
# The Llama-2 tokenizer encodes ``` as either 28956 or 7521 (the encoding is not unique), so 7521 was tried first, did not work, and was replaced with 28956.
tokenizer.decode([28956])

'```'

In [44]:
tokenizer.decode([7521])

'```'

In [45]:
class StopOnTokens(StoppingCriteria):
    """Stopping criterion that ends generation once the output ends with a stop sequence.

    The stop sequences are read from the module-level ``stop_token_ids`` (a list of
    token-id lists) at call time, so that name must be defined before generation starts.
    Only the first sequence of the batch (``input_ids[0]``) is inspected.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the tail of ``input_ids[0]`` equals any configured stop sequence.

        Args:
            input_ids: Token ids generated so far; only row 0 is checked.
            scores: Unused here; accepted to match the stopping-criteria call signature.

        Returns:
            True if generation should stop, False otherwise.
        """
        generated = input_ids[0]
        for stop_ids in stop_token_ids:
            stop_len = len(stop_ids)
            # An empty stop sequence can never match, and a sequence shorter than the
            # stop sequence must not be broadcast-compared against it.
            if stop_len == 0 or generated.shape[0] < stop_len:
                continue
            # Build the stop tensor on the same device/dtype as the generated ids so the
            # comparison also works when generation runs on the GPU.
            stop_tensor = torch.as_tensor(stop_ids, dtype=generated.dtype, device=generated.device)
            if torch.equal(generated[-stop_len:], stop_tensor):
                return True
        return False
stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [46]:
# Token-id sequences that StopOnTokens compares against the tail of the generated ids;
# 28956 is one of the ids that the tokenizer decodes to '```'.
stop_token_ids=[[28956]]

In [47]:
# Code that did not work; the Hugging Face pipeline was the problem.
#from transformers import pipeline
#max_length = get_max_length(model)
#gen = pipeline('text-generation', stopping_criteria=stopping_criteria, model=model, tokenizer=tokenizer,max_length=max_length)
#result = gen(example_prompt)
#print(result[0]['generated_text'])

In [48]:
# Padding / right shift: needed when the input tensors have different lengths (input length), e.g. to run many predictions together (a way to improve inference time).

In [49]:
# Rewritten because the pipeline behaved oddly (first test attempt, using the training dataset) - works properly
#training_dataloader = DataLoader(hf_dataset_training, batch_size=2, shuffle=True)
#batch = next(iter(training_dataloader))
#prompt=batch['prompt']
#prompt_1=[f"### Question: {prompt[0]}\n### Assistant:"]
#input_ids = tokenizer(prompt_1, return_tensors="pt",add_special_tokens=False).input_ids
#return_tensors="pt": it outputs to the tensor format for us
#outputs = model.generate(input_ids, stopping_criteria=stopping_criteria, do_sample=False, max_length=3000)
#tokenizer.batch_decode(outputs, skip_special_tokens=False)

In [50]:
# (First test attempt using the testing dataset) - works properly
#test_dataloader = DataLoader(hf_dataset_testing, batch_size=2, shuffle=True)
#batch = next(iter(test_dataloader))
#prompt=batch['prompt']
#prompt_1=[f"### Question: {prompt[0]}\n### Assistant:"]
#input_ids = tokenizer(prompt_1, return_tensors="pt",add_special_tokens=False).input_ids
#outputs = model.generate(input_ids, stopping_criteria=stopping_criteria, do_sample=False, max_length=3000)
#tokenizer.batch_decode(outputs, skip_special_tokens=False)

In [51]:
# (First test attempt using the testing dataset, with max_length changed) - works properly
#test_dataloader = DataLoader(hf_dataset_testing, batch_size=2, shuffle=True)
#batch = next(iter(test_dataloader))
#prompt=batch['prompt']
#prompt_1=[f"### Question: {prompt[0]}\n### Assistant:"]
#input_ids = tokenizer(prompt_1, return_tensors="pt",add_special_tokens=False).input_ids
#outputs = model.generate(input_ids, stopping_criteria=stopping_criteria, do_sample=False, max_length=get_max_length(model))
#tokenizer.batch_decode(outputs, skip_special_tokens=False)

In [52]:
#base_model = AutoModelForCausalLM.from_pretrained(“base_model”, load_in_8bit=True, torch_dtype=torch.float16, device_map=“auto”)

#base_model = prepare_model_for_int8_training(base_model)

#peft_model = get_peft_model(base_model, peft_config)

#training_args = TrainingArguments()
#trainer = Trainer()
#trainer.train()

#peft_model.save_pretrained(lora_adapter, save_adapter=True, save_config=True)

#model_to_merge = PeftModel.from_pretrained(AutoModelForCausalLM.from_pretrained(base_model).to(“cuda”), lora_adapter)

#merged_model = model_to_merge.merge_and_unload()
#merged_model.save_pretrained(merged_model)

In [53]:
import re
def find_assistant_occurrence(main_string, substring):
  """Locate `substring` inside `main_string`.

  Returns the zero-based index of the first match, or -1 when there is no match.
  """
  first_match_index = main_string.find(substring)
  return first_match_index

In [55]:
import time
from torch.utils.data import DataLoader

# Trial inference: generate completions for a handful of test prompts and save the
# text from the response marker onwards to a CSV for post-processing.
MAX_TRIAL_BATCHES = 6          # stop after this many batches
RESPONSE_MARKER = "Assistant:"  # text searched for in the decoded output

max_length_model=get_max_length(model)
test_dataloader = DataLoader(hf_dataset_testing, batch_size=1, shuffle=True)
generated_texts=[]
i=0
# Pad on the left so every prompt ends right where generation begins.
tokenizer.padding_side = "left"
test_iterator=iter(test_dataloader)
# NOTE(review): the first batch is drawn and discarded, as in the original cell; confirm
# this is intentional. The default avoids StopIteration on an empty loader.
dummy=next(test_iterator, None)
for batch in test_iterator:
  start=time.time()
  prompts=batch['prompt']
  prompt_1=[f"### Question: {prompt}\n### Assistant:" for prompt in prompts]
  encoded = tokenizer(prompt_1, return_tensors="pt",add_special_tokens=False,padding='longest')
  # Pass the attention mask explicitly so padded positions are ignored when a batch
  # holds prompts of different lengths.
  outputs = model.generate(encoded.input_ids, attention_mask=encoded.attention_mask, stopping_criteria=stopping_criteria, do_sample=False, max_length=max_length_model, pad_token_id=1)
  decoded_prompt=tokenizer.batch_decode(outputs, skip_special_tokens=False)
  print(i)
  end=time.time()
  time_taken=end-start
  print("time taken", time_taken)
  i += 1
  # Keep every sequence of the batch, not only the first one.
  for decoded_text in decoded_prompt:
    assistant_index=find_assistant_occurrence(decoded_text, RESPONSE_MARKER)
    # find() returns -1 when the marker is missing; keep the whole text in that case
    # instead of slicing down to the last character.
    generated_texts.append(decoded_text[assistant_index:] if assistant_index >= 0 else decoded_text)
  if i == MAX_TRIAL_BATCHES:
    break
df_generated_texts = pd.DataFrame({'generated_text': generated_texts})
df_generated_texts.to_csv('/content/gdrive/My Drive/Colab Notebooks/eBay_Competition/test_dataset_trial.csv', index=False, encoding='utf-8')

Found max lenth: 4096
tensor([[  835,   894, 29901,  ...,  4007, 22137, 29901]])




0
time taken 31.51076912879944
tensor([[  835,   894, 29901,  ...,  4007, 22137, 29901]])
1
time taken 32.942347288131714
tensor([[  835,   894, 29901,  ...,  4007, 22137, 29901]])
2
time taken 28.426798105239868
tensor([[  835,   894, 29901,  ...,  4007, 22137, 29901]])
3
time taken 21.860998153686523
tensor([[  835,   894, 29901,  ...,  4007, 22137, 29901]])
4
time taken 28.58710765838623
tensor([[  835,   894, 29901,  ...,  4007, 22137, 29901]])
5
time taken 23.709275722503662


In [None]:
#Just try inference for first 6 rows of the test dataset for post-processing purposes
#Batch size of 2 Error-not working
# Disabled cell: kept as a string literal so it is not executed. The closing delimiter
# must be a triple quote, otherwise the cell is a SyntaxError.
"""from torch.utils.data import DataLoader
max_length_model=get_max_length(model)
test_dataloader = DataLoader(hf_dataset_testing, batch_size=2, shuffle=True)
generated_texts=[]
i=0
df_generated_texts = pd.DataFrame(columns=['generated_text'])
tokenizer.padding_side = "left"
test_iterator=iter(test_dataloader)
dummy=next(test_iterator)
for batch in test_iterator:
  prompts=batch['prompt']
  prompt_1=[f"### Question: {prompt}\n### Assistant:" for prompt in prompts]
  input_ids = tokenizer(prompt_1, return_tensors="pt",add_special_tokens=False,padding='longest').input_ids
  print(input_ids)
  outputs = model.generate(input_ids, stopping_criteria=stopping_criteria, do_sample=False, max_length=max_length_model, pad_token_id=1)
  decoded_prompt=tokenizer.batch_decode(outputs, skip_special_tokens=False)
  assistant_index=find_assistant_occurrence(decoded_prompt[0],"Assistant:")
  print(i)
  i += 1
  generated_texts.append(decoded_prompt[0][assistant_index:])
  if i == 6:
    break
df_generated_texts = pd.DataFrame({'generated_text': generated_texts})
df_generated_texts.to_csv('/content/gdrive/My Drive/Colab Notebooks/eBay_Competition/test_dataset_trial.csv', index=False, encoding='utf-8')"""


In [None]:
#Used for deleting gen for freeing up space
#del gen

In [None]:
"""import re
generated_texts=[]
gen = pipeline('text-generation', model=model, tokenizer=tokenizer, stopping_criteria=stopping_criteria, max_length=max_length)
for i in torch_testing['prompt']:
  result = gen(i)
  generated_texts.append(result)
df_generated_texts = pd.DataFrame({'generated_text':generated_texts})
df_generated_texts.to_csv('output_file.csv', index=False, encoding='utf-8')"""

In [None]:
#retrain with the testing set
"""train(model, tokenizer, hf_dataset_testing, hf_dataset_testing, output_dir)"""

In [None]:
"""The Quiz Data consists of records 5001 to 30000 of the listing data, inclusively.
quiz = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/eBay_Competition/Listing_Titles.tsv', sep='\t',quoting=3)
quiz_data=quiz.iloc[5001:30001]
modified_quiz = modified_dataframe(quiz_data)
torch_data_quiz = prep_training_data(modified_quiz)
torch_pd_quiz=pd.DataFrame(torch_data_quiz).rename(columns={0:'prompt', 1:'completion'})
hf_dataset_quiz=Dataset.from_pandas(torch_pd_quiz)"""

In [None]:
"""generated_texts_quiz=[]
prompt_quiz=[]
record_quiz=[]
gen = pipeline('text-generation', model=model, tokenizer=tokenizer, stopping_criteria=stopping_criteria, max_length=max_length)
for i in torch_data_quiz['prompt']:
  result = gen(i)
  prefix_match = re.search(r"User: The listing title was", i)
  record_number=torch_data_quiz.index
  prompt_quiz.append(prefix_match)
  generated_texts_quiz.append(result[0]['generated_text'])
  record_quiz.append(record_number)
new_generated_aspect_name=[]
new_generated_aspect_value=[]
new_generated_record_number=[]
for i,j in zip(generated_texts_quiz, record_quiz):
  aspect_name=i.get("Aspect Name", [])
  aspect_values=i.get("Aspect Value", [])
  new_generated_aspect_name.append(aspect_name)
  new_generated_aspect_value.append(aspect_values)
  new_generated_record_number.append(j*len(aspect_name))
df_generated_texts_quiz = pd.DataFrame({'record_number':new_generated_record_number, 'Aspect Name':new_generated_aspect_name,'Aspect Value':new_generated_aspect_value})
result_df_quiz = df_generated_texts_quiz[df_generated_texts_quiz['Aspect Name'] != 'No Tag']
result_df_quiz.to_csv('output_file.csv', index=False,, encoding='utf-8')"""

In [None]:
"""def train_overall(model, tokenizer, training_dataset, testing_dataset,  output_dir):
    # Apply preprocessing to the model to prepare it by
    # 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
    model.gradient_checkpointing_enable()

    # 2 - Using the prepare_model_for_kbit_training method from PEFT
    model = prepare_model_for_kbit_training(model)

    # Get lora module names
    modules = find_all_linear_names(model)

    # Create PEFT config for these modules and wrap the model to PEFT
    peft_config = create_peft_config(modules)
    model = get_peft_model(model, peft_config)

    # Print information about the percentage of trainable parameters
    print_trainable_parameters(model)
    #Separating by the response template(Supervised finetuning- if we don't separate, it will seem like our loss is so small because we are predicting the whole instruction+output and instruction is easy to learn bc its same throughout)
    response_template = "Assistant:"
    collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

    # Training parameters (In addition to Trainer class capabilities ,SFTTrainer also providing parameter-efficient (peft ) and packing optimizations.)
    trainer = SFTTrainer(
        model=model,
        train_dataset=training_dataset,
        eval_dataset=testing_dataset,
        data_collator=collator,
        args=TrainingArguments(
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,
            warmup_steps=2,
            max_steps=20,
            learning_rate=2e-4,
            evaluation_strategy="steps",
            per_device_eval_batch_size=4,
            eval_steps=5,
            fp16=True,
            logging_steps=1,
            output_dir="outputs",
            optim="paged_adamw_8bit",
            #remove_unused_columns=False,
        ),
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
    )

    model.config.use_cache = False  # re-enable for inference to speed up predictions for similar inputs

    ### SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
    # Verifying the datatypes before training

    dtypes = {}
    for _, p in model.named_parameters():
        dtype = p.dtype
        if dtype not in dtypes: dtypes[dtype] = 0
        dtypes[dtype] += p.numel()
    total = 0
    for k, v in dtypes.items(): total+= v
    for k, v in dtypes.items():
        print(k, v, v/total)

    do_train = True

    # Launch training
    print("Training...")

    if do_train:
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
        print(metrics)

    ###

    # Saving model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)

    # Free memory for merging weights
    del model
    del trainer
    torch.cuda.empty_cache()


output_dir = "results/llama2/final_checkpoint"
train(model, tokenizer, hf_dataset_testing, hf_dataset_testing, output_dir)"""

In [None]:
# Disabled cell: kept as a string literal so it is not executed; the string must be closed.
"""torch.cuda.empty_cache()"""

In [None]:
"""model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
model = model.merge_and_unload()

output_merged_dir = "results/llama2/final_merged_checkpoint"
os.makedirs(output_merged_dir, exist_ok=True)
model.save_pretrained(output_merged_dir, safe_serialization=True)

# save tokenizer for easy inference
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(output_merged_dir)"""