In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,TrainingArguments
from peft import LoraConfig
import torch
from datasets import load_dataset
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Target device for inference: first CUDA device when USE_GPU is set, CPU otherwise.
USE_GPU = True
device = "cuda:0" if USE_GPU else "cpu"

# Hardware-dependent knobs. Adjust them to match the GPU you are running on.
#
# Free T4 instance:
#   QUANTIZE_4BIT = True
#   USE_GRAD_CHECKPOINTING = True
#   TRAIN_BATCH_SIZE = 2
#   TRAIN_MAX_SEQ_LENGTH = 512
#   USE_FLASH_ATTENTION = False
#   GRAD_ACC_STEPS = 16

# Equivalent A100 setting (active).
QUANTIZE_4BIT = True
USE_GRAD_CHECKPOINTING = True
USE_FLASH_ATTENTION = True
TRAIN_BATCH_SIZE = 16
TRAIN_MAX_SEQ_LENGTH = 512
GRAD_ACC_STEPS = 2

In [3]:
MODEL_NAME = "CohereForAI/aya-expanse-8b"
models_path = "/data1/debajyoti/test/llms/models/"

# Request Flash Attention 2 only when it is enabled in the config cell; otherwise pass None.
attn_implementation = "flash_attention_2" if USE_FLASH_ATTENTION else None

# Load the causal LM in bfloat16, caching the weights under `models_path`.
# NOTE(review): QUANTIZE_4BIT from the config cell is not consulted here, so no
# quantization config is passed to from_pretrained.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    cache_dir=models_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation=attn_implementation,
)

# Load the tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.32s/it]


In [4]:
def get_message_format(prompts):
  """Wrap every prompt in a two-message chat: the fixed system instruction, then the prompt.

  Args:
    prompts: iterable of user-turn contents.

  Returns:
    A list with one ``[system_message, user_message]`` list per prompt, in input order.
  """
  # Whitespace (line breaks, 12-space continuation indent) is spelled out explicitly
  # because it is part of the text the model receives.
  system_instruction = (
      "You are a sarcasm recognition assistant judging if an 'Input' is sarcastic or not. No need to consider any prior context.\n"
      "            If the 'Input' is sarcastic, you need to output your final 'Output' as 'Sarcasm'. \n"
      "            If the 'Input' is non-sarcastic, you need to output your final 'Output' as 'Non-sarcasm'."
  )

  return [
      [
          {"role": "system", "content": system_instruction},
          {"role": "user", "content": prompt},
      ]
      for prompt in prompts
  ]

def generate_aya(
      model,
      prompts,
      temperature=0.75,
      top_p=1.0,
      top_k=0,
      max_new_tokens=1024
    ):
  """Sample one completion per prompt from `model` and return only the newly generated text.

  Each prompt is wrapped with the system instruction by `get_message_format`,
  rendered and tokenized with the module-level `tokenizer`'s chat template, and
  passed to `model.generate` with sampling enabled.

  Args:
    model: causal LM exposing `.device` and `.generate`.
    prompts: list of user-turn strings; an empty list yields an empty result.
    temperature: sampling temperature forwarded to `generate`.
    top_p: nucleus-sampling threshold forwarded to `generate`.
    top_k: top-k cutoff forwarded to `generate`.
    max_new_tokens: upper bound on generated tokens per prompt.

  Returns:
    List of decoded completions (special tokens stripped), one per prompt, in order.
  """
  # Nothing to tokenize or generate for an empty batch.
  if len(prompts) == 0:
    return []

  messages = get_message_format(prompts)

  # return_dict=True also yields the attention mask, so padded positions in a
  # multi-prompt batch are masked out during generation instead of being attended to.
  encoded = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        padding=True,
        return_tensors="pt",
        return_dict=True,
      )
  encoded = encoded.to(model.device)
  prompt_padded_len = encoded["input_ids"].shape[1]

  gen_tokens = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        max_new_tokens=max_new_tokens,
        do_sample=True,
      )

  # Keep only the generated continuation. Cutting at the padded prompt width
  # assumes prompts are left-padded -- TODO confirm tokenizer.padding_side.
  gen_tokens = [
      sequence[prompt_padded_len:] for sequence in gen_tokens
    ]

  return tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)

In [5]:
# Smoke-test generation on a single code-mixed example before running the full evaluation.
# The trailing space after the quoted input and the indent before "Output:" are part of the prompt.
prompts = [
    'Input: "Chalo protocol pata chal gaya. Vice President agar Muslim na hote toh shayad kabhi pata hi nahi" \n    Output: '
]

generations = generate_aya(model, prompts)

# Print each prompt next to the model's response, one field per line.
for prompt_text, response_text in zip(prompts, generations):
  fields = ("PROMPT", prompt_text, "RESPONSE", response_text, "\n")
  print(*fields, sep="\n")

PROMPT
Input: "Chalo protocol pata chal gaya. Vice President agar Muslim na hote toh shayad kabhi pata hi nahi" 
    Output: 
RESPONSE
Output: Sarcasm

The statement implies that if the Vice President were not Muslim, the protocol issue would have never been identified or addressed. This suggests a sarcastic tone, implying that the speaker is making a joke or pointing out a prejudiced or ignorant comment.




In [6]:
import pandas as pd
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Rebind `device` as a torch.device: CUDA when a GPU is visible to torch, CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [7]:
# Read the train / validation / test CSV splits of the sarcasm dataset, in that order.
train_df, val_df, test_df = map(
    pd.read_csv,
    (
        "/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/splits/sarcasm/train.csv",
        "/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/splits/sarcasm/val.csv",
        "/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/splits/sarcasm/test.csv",
    ),
)

# Show the test split that the evaluation below iterates over.
test_df

Unnamed: 0,Sentence,Tag
0,"Mera bhanja mujhe ""papa ka sala"" bulata hai......",0
1,Tu sale TRIPLE TALAQ nd HALALA Ke baad ki sant...,0
2,aur aap a0ni politics karne me vyast hai.. #Bi...,0
3,Theek hai waha at least Janki ma ka mandir hai...,0
4,Sir mujhe cricket acdemy join karna hai mai ka...,0
...,...,...
520,aaj bjp aur chadhi gang aise khus ho rahe hai ...,0
521,@SGanguly99 sir aapnio jodi aamar saath na den...,0
522,#Irony: Pran ka pran chala gaya.. RIP Sher Khan!,1
523,Ye bhi mat bhulo ki triple talaq ko ban lagane...,0


In [13]:
# Few-shot demonstration set: one row per example with `Sentence` and `Tag` columns.
df = pd.read_csv(
    filepath_or_buffer="/data1/debajyoti/code-mix-humor-sarcasm-detection/outputs/sarcasm/few_shots/iacv2_eng_llama.csv"
)
df

Unnamed: 0,Sentence,Tag
0,Thats where you are wrong ..again.\r\nAsk your...,Sarcasm
1,Right so...1) God will kill you before he give...,Non-sarcasm
2,And lets not neglect the fact that he was teto...,Sarcasm
3,"Because, not once, did I have feel any sort of...",Non-sarcasm
4,A tenth grade education? Why am I not surprise...,Sarcasm
5,I will stop quoting here. Don't want people to...,Non-sarcasm
6,oh yes liberal: anyone who disagrees with vor ...,Sarcasm
7,jp: you keep spouting the same inaccuracy. i h...,Non-sarcasm


In [14]:
# Turn every few-shot row into a chat exchange: the sentence as the user turn,
# immediately followed by its tag as the assistant turn.
formatted_few_shot_examples = [
    message
    for _, example in df.iterrows()
    for message in (
        {'role': 'user', 'content': example['Sentence']},
        {'role': 'assistant', 'content': example['Tag']},
    )
]

formatted_few_shot_examples

[{'role': 'user',
  'content': 'Thats where you are wrong ..again.\r\nAsk your friend poet what jebus would do.I havent a clue.\r\nYou seem to be frustrated though......is it because you cannot find evidence that I defended ID but dont have the nuts to admit it?'},
 {'role': 'assistant', 'content': 'Sarcasm'},
 {'role': 'user',
  'content': "Right so...1) God will kill you before he gives you a chance to repent? Thus sending you straight to hell. I like that!2) God seems to make life unfair for most everyone. Hey, even in America. Consider in America, and extrememly religious state, you still have the highest homocide rates out of ALL the western countries. (I'm pretty sure of this.)3) Left behind? So, wait. The riegtous go the heaven, and everyone else remains leading a normal, religion free life on earth? Hmmm...4) Hebrews?5) Can it help? The gospels tell a story about a carpenter who lived 2000 years ago, and may or may not have been the son of god. Yeah, I can see how that can help

In [15]:
# Render the few-shot rows as plain text, one 'Input'/'Response' pair per example,
# each pair followed by a blank line. This replaces the chat-style list built above.
formatted_few_shot_examples = "".join(
    f"'Input': \"{example['Sentence']}\"\n'Response':\"{example['Tag']}\"\n\n"
    for _, example in df.iterrows()
)

# Display the result
print(formatted_few_shot_examples)

'Input': "Thats where you are wrong ..again.
Ask your friend poet what jebus would do.I havent a clue.
You seem to be frustrated though......is it because you cannot find evidence that I defended ID but dont have the nuts to admit it?"
'Response':"Sarcasm"

'Input': "Right so...1) God will kill you before he gives you a chance to repent? Thus sending you straight to hell. I like that!2) God seems to make life unfair for most everyone. Hey, even in America. Consider in America, and extrememly religious state, you still have the highest homocide rates out of ALL the western countries. (I'm pretty sure of this.)3) Left behind? So, wait. The riegtous go the heaven, and everyone else remains leading a normal, religion free life on earth? Hmmm...4) Hebrews?5) Can it help? The gospels tell a story about a carpenter who lived 2000 years ago, and may or may not have been the son of god. Yeah, I can see how that can help. Why can't I worship the Flying Spaghetti Monster?"
'Response':"Non-sarcasm

### Create prompt, parse response, and generate response

In [16]:
from transformers import pipeline
import re
import json

# Define the function to create the prompt
def create_prompt(input, num_examples):
    """Build the few-shot user prompt for one sentence.

    The prompt is the module-level `formatted_few_shot_examples` text followed by
    the query, written in the same layout as the examples
    (``'Input': "<sentence>"`` then ``'Response':``) so the model sees one
    consistent pattern to continue. Source-code indentation and the trailing
    newline are kept out of the prompt.

    Args:
        input: the sentence to classify (name kept for caller compatibility,
            although it shadows the builtin).
        num_examples: accepted for backward compatibility but not used; the
            few-shot text is inserted as-is.

    Returns:
        The prompt string, ending in ``'Response':``.
    """
    # Same quoting and layout as each block in formatted_few_shot_examples.
    query_block = f"'Input': \"{input}\"\n'Response':"
    return f"{formatted_few_shot_examples}{query_block}"

def parse_response(response):
    """Map a generated response to a label string.

    Args:
        response: text produced by the model. Anything that is not a non-empty
            string (e.g. ``None`` or a NaN read back from a CSV) is treated as
            "no label" instead of raising.

    Returns:
        ``"0"`` if the text contains ``Non-sarcasm``, ``"1"`` if it otherwise
        contains ``Sarcasm``, and ``None`` when neither label appears.
        Matching is case-sensitive, and ``Non-sarcasm`` takes precedence when
        both labels occur.
    """
    if not isinstance(response, str) or not response:
        return None

    if "Non-sarcasm" in response:
        return "0"
    if "Sarcasm" in response:
        return "1"

    # No label found.
    return None


In [17]:
def process_data(output_filepath):
    """Run few-shot sarcasm classification over every row of the module-level `test_df`.

    For each row a prompt is built with `create_prompt`, a single response is
    generated with `generate_aya`, and the response is mapped to a label with
    `parse_response`.

    Args:
        output_filepath: NOTE(review): accepted but not read by this function;
            nothing is written to disk.

    Returns:
        A DataFrame with one row per input sentence and the columns
        `Sentence`, `Label` (the row's `Tag`), `Response` (raw generation)
        and `Gen_label` (parsed label).
    """
    num_examples = 15  # number of shots in few-shot; forwarded to create_prompt

    records = []
    for _, example in tqdm(list(test_df.iterrows())):
        few_shot_prompt = create_prompt(example['Sentence'], num_examples)
        # One prompt in, one generation out.
        response_text = generate_aya(model, [few_shot_prompt])[0]
        predicted_label = parse_response(response_text)

        records.append({
            'Sentence': example['Sentence'],
            'Label': example['Tag'],
            'Response': response_text,
            'Gen_label': predicted_label,
        })

    return pd.DataFrame(records)

# Path handed to process_data. Replace with your actual path.
# NOTE(review): process_data does not read this argument, so nothing is saved there.
output_filepath = '/data1/debajyoti/test/llms/data/humor'

# Run few-shot inference over the test split and keep the per-sentence results.
result = process_data(output_filepath=output_filepath)

100%|██████████| 525/525 [31:17<00:00,  3.58s/it]


In [18]:
# Inspect the per-sentence responses and parsed labels returned by process_data.
result

Unnamed: 0,Sentence,Label,Response,Gen_label
0,"Mera bhanja mujhe ""papa ka sala"" bulata hai......",0,To determine whether the 'Input' is sarcastic ...,1
1,Tu sale TRIPLE TALAQ nd HALALA Ke baad ki sant...,0,To determine whether an 'Input' is sarcastic o...,0
2,aur aap a0ni politics karne me vyast hai.. #Bi...,0,"Based on the context and tone, the final 'Resp...",1
3,Theek hai waha at least Janki ma ka mandir hai...,0,"Based on the context and tone, the final 'Inpu...",1
4,Sir mujhe cricket acdemy join karna hai mai ka...,0,"'Output': ""Sarcasm""\n\nThe input is in Hindi a...",1
...,...,...,...,...
520,aaj bjp aur chadhi gang aise khus ho rahe hai ...,0,To determine whether an 'Input' is sarcastic o...,1
521,@SGanguly99 sir aapnio jodi aamar saath na den...,0,Based on the patterns and tone of the previous...,1
522,#Irony: Pran ka pran chala gaya.. RIP Sher Khan!,1,To determine whether an 'Input' is sarcastic o...,1
523,Ye bhi mat bhulo ki triple talaq ko ban lagane...,0,"Based on the provided inputs and responses, he...",1
