## Installing Necessary Libraries

In [1]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb

[0m

In [2]:
!pwd

/home/katonic/e-commerce-usecases/Recommendation_sys_llama2


In [15]:
import GPUtil
GPUtil.showUtilization()

| ID | GPU | MEM |
------------------


# Dataset details
Instacart Data can be downloaded from [here](https://www.kaggle.com/competitions/instacart-market-basket-analysis/data). We just need product & department csv files

In [7]:
import pandas as pd
df_product = pd.read_csv("/home/katonic/e-commerce-usecases/Recommendation_sys_llama2/data/products.csv")
df_dept = pd.read_csv('/home/katonic/e-commerce-usecases/Recommendation_sys_llama2/data/departments.csv')

In [8]:
df_joined = pd.merge(df_product, df_dept, on = ['department_id'])
df_joined['text'] = df_joined.apply(lambda row: row['product_name'] + " ->: " + row['department'], axis = 1)

In [10]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df_joined, test_size=0.2, random_state=42)

In [11]:
train_df.head(10)

Unnamed: 0,product_id,product_name,aisle_id,department_id,department,text
7361,10376,Organic Caraway Seeds,104,13,pantry,Organic Caraway Seeds ->: pantry
32534,40131,Original Laundry Detergent,75,17,household,Original Laundry Detergent ->: household
1457,11913,Shelled Pistachios,117,19,snacks,Shelled Pistachios ->: snacks
5201,41392,Harvest Berry Chewy Granola Bars,3,19,snacks,Harvest Berry Chewy Granola Bars ->: snacks
38539,28627,Veganic Sprouted Red Fife Raisin Bran,121,14,breakfast,Veganic Sprouted Red Fife Raisin Bran ->: brea...
18377,29407,Root Vegetable Cakes,42,1,frozen,Root Vegetable Cakes ->: frozen
28627,29571,Alta Dena 1% Milk,84,16,dairy eggs,Alta Dena 1% Milk ->: dairy eggs
30283,3589,Laundry Detergent Pods,75,17,household,Laundry Detergent Pods ->: household
32030,31685,"Daily Shower Cleaner Scrub Free Refill, Fresh ...",114,17,household,"Daily Shower Cleaner Scrub Free Refill, Fresh ..."
28655,30119,Organic French Style Meyer Lemon Yogurt,120,16,dairy eggs,Organic French Style Meyer Lemon Yogurt ->: da...


In [12]:
test_df.head(10)

Unnamed: 0,product_id,product_name,aisle_id,department_id,department,text
33626,24154,Free & Clear Stage 4 Overnight Diapers,56,18,babies,Free & Clear Stage 4 Overnight Diapers ->: babies
18192,27348,"Beef pot roast with roasted potatoes, carrots,...",38,1,frozen,"Beef pot roast with roasted potatoes, carrots,..."
47099,27181,Coffee Liquer,124,5,alcohol,Coffee Liquer ->: alcohol
48183,20577,Bread Rolls,43,3,bakery,Bread Rolls ->: bakery
22197,16472,French Milled Oval Almond Gourmande Soap,25,11,personal care,French Milled Oval Almond Gourmande Soap ->: p...
31573,24121,Dust Pan,114,17,household,Dust Pan ->: household
45362,5477,Roasted Pine Nut Hommus,67,20,deli,Roasted Pine Nut Hommus ->: deli
14131,27921,Cranberry Raspberry Juice Cocktail,98,7,beverages,Cranberry Raspberry Juice Cocktail ->: beverages
26903,4786,Sweet Cream Butter Salted,36,16,dairy eggs,Sweet Cream Butter Salted ->: dairy eggs
39417,8796,Traditional Chicken Barley Soup,69,15,canned goods,Traditional Chicken Barley Soup ->: canned goods


In [13]:
from datasets import Dataset,DatasetDict
train_dataset_dict = DatasetDict({
    "train": Dataset.from_pandas(train_df),
})

  from .autonotebook import tqdm as notebook_tqdm


## Loading the model

In [14]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

model_name = "TinyPixel/Llama-2-7B-bf16-sharded"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True
)
model.config.use_cache = False

Downloading (…)lve/main/config.json: 100%|██████████| 626/626 [00:00<00:00, 104kB/s]
  warn("The installed version of bitsandbytes was compiled without GPU support. "


/opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


RuntimeError: No GPU found. A GPU is needed for quantization.

Let's also load the tokenizer below

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)okenizer_config.json:   0%|          | 0.00/676 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

**Let's check what the base model predicts before finetuning. :)**

In [None]:
import transformers
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)


sequences = pipeline(
   ["“Free & Clear Stage 4 Overnight Diapers” ->:","Bread Rolls ->:","French Milled Oval Almond Gourmande Soap ->:"],
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
for seq in sequences:
    print(f"Result: {seq[0]['generated_text']}")

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


Result: “Free & Clear Stage 4 Overnight Diapers” ->: http://www. Einzeln: 32,56 â‚¬.
The price of the “Free & Clear Overnight Pull-Ups” is 34.48 â‚¬, the price of the “Free & Clear Stage 2 Overnight Pull-Ups” is 30.26 â‚¬.
The offer is only valid until 23.07.2005 and is available at the following online shops:
http://www.aldi-sued.de
http://www.aldi-nord.de
http://www.aldi-sued.at
The offer is available at the following webshop
http://www.aldi-sued.at
The following webshops are offering the product “Free & Clear Overnight Pull-Ups”
Result: Bread Rolls ->:
 kwietnia 09, 2015 at 7:21 am
You’re so interesting! I do not suppose I have read through a
single thing like that before. So good to find another person with a few unique thoughts on this issue.
Serwis Komputerowy Lublin
april 18, 2015 at 3:06 am
It’s remarkable to go to seveг a great site
that offers helpful information about ѕex.
Sweetie 3D
aprill 18, 2015 at 4:02 pm
What i do not understand is in truth how you’re no longer
actuall

Below we will load the configuration file in order to create the LoRA model. According to QLoRA paper, it is important to consider all linear layers in the transformer block for maximum performance.

In [None]:
from peft import LoraConfig

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","v_proj"]
)

## Loading the trainer

Here we will use the [`SFTTrainer` from TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer) that gives a wrapper around transformers `Trainer` to easily fine-tune models on instruction based datasets using PEFT adapters. Let's first load the training arguments below.

In [None]:
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 10
logging_steps = 1
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 120
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

Then finally pass everthing to the trainer

In [None]:
from trl import SFTTrainer

max_seq_length = 512

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_dict['train'],
    # train_dataset=data['train'],
    peft_config=peft_config,
    dataset_text_field="text",
    # dataset_text_field="prediction",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)



Map:   0%|          | 0/39750 [00:00<?, ? examples/s]

We will also pre-process the model by upcasting the layer norms in float 32 for more stable training

In [None]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

## Train the model

Now let's train the model! Simply call `trainer.train()`

In [None]:
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.0291
2,3.0896
3,4.3592
4,6.2712
5,4.2501
6,3.0927
7,4.5173
8,3.9694
9,4.4714
10,5.3573


TrainOutput(global_step=120, training_loss=2.6939144556721053, metrics={'train_runtime': 410.6557, 'train_samples_per_second': 4.675, 'train_steps_per_second': 0.292, 'total_flos': 627303342243840.0, 'train_loss': 2.6939144556721053, 'epoch': 0.05})

In [None]:
lst_test_data = list(test_df['text'])

In [None]:
len(lst_test_data)

9938

In [None]:
sample_size = 25
lst_test_data_short = lst_test_data[:sample_size]

In [None]:
import transformers

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # torch_dtype=torch.bfloat16,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
)

sequences = pipeline(
    lst_test_data_short,
    max_length=100,  #200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

for ix,seq in enumerate(sequences):
    print(ix,seq[0]['generated_text'])

0 Free & Clear Stage 4 Overnight Diapers ->: babies: diapers & accessories diapers ->: baby needs diapers & pads free clear diapers ->: baby needs diapers free & clear stage 4 overnight diapers: diapers ->: baby needs diapers diapers & accessories ->: baby needs diapers free & clear stage 4 overnight diapers: diapers ->: personal care baby care ->: diapers & accessories dia
1 Beef pot roast with roasted potatoes, carrots, sweet onions, green beans, and a rich gravy Beef Pot Roast ->: frozen goods ->: frozen meals ->: meat, chicken & seafood ->: beef & beef products ->: entree ->: entree ->: entree ->: entrees: beef ->: beef entree ->: meat ->: beef ->: meat, chicken
2 Coffee Liquer ->: alcoholic beverages spirits ->: beverages beer & hard cider ->: beer coffee liqueur ->: beverages liquor ->: beverages beer & hard cider coffee liquor ->: beverages beer coffee liqueur original ->: beverages liquor ->: beverages beer & hard cider coffee liqueur original alcoholic beverages spirits drinks

In [None]:
def correct_answer(ans):
  return (ans.split("->:")[1]).strip()

answers = []
for ix,seq in enumerate(sequences):
    # print(ix,seq[0]['generated_text'])
    answers.append(correct_answer(seq[0]['generated_text']))

answers

['babies: diapers & accessories diapers',
 'frozen goods',
 'alcoholic beverages spirits',
 'bakery',
 'personal care soap & bath',
 'household: pantry: cleaning: broom & mop: broom & broom brushes: hand broom',
 'deli meats & cheese',
 'beverages alcohol juice cranberry raspberry juice cocktail',
 'dairy eggs butter & cheese eggs',
 'canned goods',
 'dairy eggs & dairy eggs',
 'snacks chocolate & candy',
 'dry goods pasta, rice, beans and grains pasta',
 'personal care toiletries personal care toiletries liquid hand wash & dish wash',
 'beverages',
 'international foods dry goods spices & seasonings spices five spices',
 'breakfast cereals',
 'dairy eggs & cheese frozen dairy',
 'meat seafood frozen',
 'household essentials candles & accessories scented votive wax tablets',
 'frozen food',
 'dairy eggs dairy egg products soy products',
 'breakfast foods',
 'personal care',
 'meat seafood fish seafood']

In [None]:
df_evaluate = test_df.iloc[:sample_size][['product_name','department']]

df_evaluate = df_evaluate.reset_index(drop=True)

df_evaluate['department_predicted'] = answers

df_evaluate

Unnamed: 0,product_name,department,department_predicted
0,Free & Clear Stage 4 Overnight Diapers,babies,babies: diapers & accessories diapers
1,"Beef pot roast with roasted potatoes, carrots,...",frozen,frozen goods
2,Coffee Liquer,alcohol,alcoholic beverages spirits
3,Bread Rolls,bakery,bakery
4,French Milled Oval Almond Gourmande Soap,personal care,personal care soap & bath
5,Dust Pan,household,household: pantry: cleaning: broom & mop: broo...
6,Roasted Pine Nut Hommus,deli,deli meats & cheese
7,Cranberry Raspberry Juice Cocktail,beverages,beverages alcohol juice cranberry raspberry ju...
8,Sweet Cream Butter Salted,dairy eggs,dairy eggs butter & cheese eggs
9,Traditional Chicken Barley Soup,canned goods,canned goods
