In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,TrainingArguments
from peft import LoraConfig
import torch
from datasets import load_dataset
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
USE_GPU = True
if USE_GPU:
    device = "cuda:0"
else:
    device = "cpu"

# you may want to change the following parameters depending on your GPU configuration

# free T4 instance
# QUANTIZE_4BIT = True
# USE_GRAD_CHECKPOINTING = True
# TRAIN_BATCH_SIZE = 2
# TRAIN_MAX_SEQ_LENGTH = 512
# USE_FLASH_ATTENTION = False
# GRAD_ACC_STEPS = 16

# equivalent A100 setting
QUANTIZE_4BIT = True
USE_GRAD_CHECKPOINTING = True
TRAIN_BATCH_SIZE = 16
TRAIN_MAX_SEQ_LENGTH = 512
USE_FLASH_ATTENTION = True
GRAD_ACC_STEPS = 2

In [3]:
MODEL_NAME = "CohereForAI/aya-expanse-8b"
models_path = "/data1/debajyoti/test/llms/models/"

attn_implementation = None
if USE_FLASH_ATTENTION:
  attn_implementation="flash_attention_2"

model = AutoModelForCausalLM.from_pretrained(
          MODEL_NAME,
          attn_implementation=attn_implementation,
          torch_dtype=torch.bfloat16,
          cache_dir=models_path,
          device_map="auto"
        )
# model = model.to(device)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.33s/it]


In [30]:
def get_message_format(prompts):
  messages = []

  for p in prompts:
    messages.append(
          [{"role": "system", "content": """You are a humor recognition assistant judging if an 'Input' is humorous or not. No need to consider any prior context.
            If the 'Input' is humorous, you need to output your final 'Output' as 'Humor'. 
            If the 'Input' is non-humorous, you need to output your final 'Output' as 'Non-humor'."""},
          {"role": "user", "content": p}]
      )

  return messages

def generate_aya(
      model,
      prompts,
      temperature=0.75,
      top_p=1.0,
      top_k=0,
      max_new_tokens=1024
    ):

  # print(prompts)
  messages = get_message_format(prompts)
  # print(messages)

  input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        padding=True,
        return_tensors="pt",
      )
  input_ids = input_ids.to(model.device)
  prompt_padded_len = len(input_ids[0])

  gen_tokens = model.generate(
        input_ids,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        max_new_tokens=max_new_tokens,
        do_sample=True,
      )

  # get only generated tokens
  gen_tokens = [
      gt[prompt_padded_len:] for gt in gen_tokens
    ]

  gen_text = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
  return gen_text

In [27]:
# Test generations on langauges in Aya 23 set
prompts = [
    """Input: "Chalo protocol pata chal gaya. Vice President agar Muslim na hote toh shayad kabhi pata hi nahi" 
    Output: """
]

generations = generate_aya(model, prompts)

for p, g in zip(prompts, generations):
  print(
      "PROMPT", p ,"RESPONSE", g, "\n", sep="\n"
    )

['Input: "Chalo protocol pata chal gaya. Vice President agar Muslim na hote toh shayad kabhi pata hi nahi" \n    Output: ']
[[{'role': 'system', 'content': "You are a humor recognition assistant judging if an 'Input' is humorous or not. No need to consider any prior context.\n            If the 'Input' is humorous, you need to output your final 'Output' as 'Humor'. \n            If the 'Input' is non-humorous, you need to output your final 'Output' as 'Non-humor'."}, {'role': 'user', 'content': 'Input: "Chalo protocol pata chal gaya. Vice President agar Muslim na hote toh shayad kabhi pata hi nahi" \n    Output: '}]]
PROMPT
Input: "Chalo protocol pata chal gaya. Vice President agar Muslim na hote toh shayad kabhi pata hi nahi" 
    Output: 
RESPONSE
Output: Humor

The statement plays on the idea of a hypothetical scenario where the Vice President's actions or decisions might have been different if they were not Muslim, implying a bias or lack of understanding of their own role due to r

In [6]:
import pandas as pd
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
# train_df_eng = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/1Humour_English(NEW).csv")
# train_df_eng = train_df_eng[:2000]
# train_df_hin = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/1Humour_English(NEW).csv")
# train_df_hin = train_df_hin[:2000]
# train_df = pd.concat([train_df_eng, train_df_hin], ignore_index=True)
# train_df = train_df_hin
train_df = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/splits/humor/train.csv")
val_df = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/splits/humor/val.csv")
test_df = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/splits/humor/test.csv")

test_df

Unnamed: 0,Sentence,Tag
0,Is 2 takiye ki naukri main mera lakhon ka worl...,0
1,"""Naam : Viv Richards\nBaap ka naam : Master Di...",1
2,Sakshi Maharaj is BJP's Digvijaya Singh.,1
3,Aaj @British_Airways ki toh lag gayi bhai.\nHa...,1
4,Ghanta development hoga! Saare paise to electi...,1
...,...,...
291,"Hum to Aam Aadmi hain ji, woh Megalomaniac hai...",1
292,Neend nahi aati hai raaton mein?,0
293,Ghar mein Bipasha Basu ki photo rakhne se bhoo...,1
294,Aur dikhao aur dikhaopic.twitter.com/Ij4R6xNg9V,0


In [8]:
df = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/outputs/humor/few_shots/cm.csv")
df

Unnamed: 0,Sentence,Tag
0,TL pe koi adarsh liberal jisko patakho se dikk...,Humor
1,Chyamaila Hou de kharcha #Gudipadwapic.twitter...,Non-humor
2,AAP walon ke ye haal hai ki agar Modi pe joke ...,Humor
3,1k rt tweets ko bhi aaj kal 5-7 rts milte hai....,Non-humor
4,Ab jake mera tweet padha bhai ne. Sala itni de...,Humor
5,Baa ji @BDUTT main to bolta hoon Indrani ki ja...,Non-humor
6,Bhai @imVkohli ye century pe century lagakar a...,Humor
7,"Kabse iss pyaase jhagadte TL par, sense ki ek ...",Non-humor


In [9]:
# Convert dataframe to specified format
formatted_few_shot_examples = []
for _, row in df.iterrows():
    formatted_few_shot_examples.append({'role': 'user', 'content': row['Sentence']})
    formatted_few_shot_examples.append({'role': 'assistant', 'content': row['Tag']})
    
formatted_few_shot_examples

[{'role': 'user',
  'content': 'TL pe koi adarsh liberal jisko patakho se dikkat ho.. Ye bomb apni gaand me bhar le.. Batti mein laga doongapic.twitter.com/nva3UI1BHD'},
 {'role': 'assistant', 'content': 'Humor'},
 {'role': 'user',
  'content': 'Chyamaila Hou de kharcha #Gudipadwapic.twitter.com/nNKEg1O7Ek'},
 {'role': 'assistant', 'content': 'Non-humor'},
 {'role': 'user',
  'content': 'AAP walon ke ye haal hai ki agar Modi pe joke maro to bhi unhe tarif hi lagti hai.'},
 {'role': 'assistant', 'content': 'Humor'},
 {'role': 'user',
  'content': '1k rt tweets ko bhi aaj kal 5-7 rts milte hai. Kya zamana aa gaya hai.'},
 {'role': 'assistant', 'content': 'Non-humor'},
 {'role': 'user',
  'content': 'Ab jake mera tweet padha bhai ne. Sala itni der se anushka sharma ki TL pe jasoosi kar raha tha !!'},
 {'role': 'assistant', 'content': 'Humor'},
 {'role': 'user',
  'content': 'Baa ji @BDUTT main to bolta hoon Indrani ki jagah Siddhrtha par muder ka case chalaya jaye. Kya bolti ho?'},
 {'rol

In [20]:
formatted_few_shot_examples = ""
for index, row in df.iterrows():
    if index == 3: # or index == 4:
        formatted_few_shot_examples += f"""'Input': "{row['Sentence']}"\n'Response':"{row['Tag']}"\n\n"""
    # formatted_few_shot_examples += f"""'Input': "{row['Sentence']}"\n'Response':"{row['Tag']}"\n\n"""
    # if index == 1:
    #     break

# Display the result
print(formatted_few_shot_examples)

'Input': "1k rt tweets ko bhi aaj kal 5-7 rts milte hai. Kya zamana aa gaya hai."
'Response':"Non-humor"




### Create prompt, parse response and  generate response

In [21]:
from transformers import pipeline
import re
import json

# Define the function to create the prompt
def create_prompt(input, num_examples):

    # System prompt 
    # system_prompt = """You are a humor recognition assistant judging if an 'Input' is humor or not. 
    #     If the 'Input' is humorous, you need to give your final 'Output' as "Humor". 
    #     If the 'Input' is non-humorous, you need to give your final 'Output' as "Non-humor". Dont output anything else. """

    # messages = [system_prompt + formatted_few_shot_examples, f"\'Input\':'{input}'\n\'Response\':"]
    messages = f"""
                {formatted_few_shot_examples}
                \'Input\':'{input}'
                \'Response\':
    """
    # messages = [system_prompt, input]

    return messages

def parse_response(response):
    """
    Extracts the classification label and explanation from the model-generated response,
    starting from the line containing the final "Answer:".

    Args:
        response (str): The response generated by the model.

    Returns:
        tuple: A tuple containing the classification label ("Humor" or "Non-humor") 
               and the explanation text.
    """
    # print(response)
    if len(response) > 0:        
        if "Non-humor" in response:
            return "0"
        elif "Humor" in response:
            return "1"
        else:# If no label is found, return None
            return None


In [59]:
def process_data(output_filepath):
    # df = pd.read_csv(data_filepath)
    df = test_df
    # print(df)
    # df = df[:5]
    
    
    results = []
    num_examples = 15 # number of shots in few-shot
    # Loop over the DataFrame and pass the text to generate response
    for index, row in tqdm(list(df.iterrows())):
        # print(row)
        prompt = create_prompt(row['Sentence'], num_examples)
        print(prompt)
        generations = generate_aya(model, [prompt])
        print(generations[0])
        generated_label = parse_response(generations[0])
        # print(generated_label)

        # # veracity, explanation = parse_response(model_response)
        
        results.append({
            'Sentence': row['Sentence'],
            'Label': row['Tag'],
            'Response': generations[0],
            'Gen_label': generated_label,
            # 'Explanation': explanation
        })
        # break
    
    results = pd.DataFrame(results)
    
    return results

# Example file paths, replace with your actual paths
output_filepath = '/data1/debajyoti/test/llms/data/humor'

# Run the data processing
result = process_data(output_filepath)

  0%|          | 0/296 [00:00<?, ?it/s]


                'Input': "1k rt tweets ko bhi aaj kal 5-7 rts milte hai. Kya zamana aa gaya hai."
'Response':"Non-humor"


                'Input':'Is 2 takiye ki naukri main mera lakhon ka world cup jaaye'
                'Response':
    


  0%|          | 0/296 [00:05<?, ?it/s]

'Humor'

दूसरे इनपुट में "इस 2 ताकीय की नौकरी में मेरा लाखों का वर्ल्ड कप जाए" वाक्यांश एक हास्यप्रद तुलना पेश करता है, जहां किसी का सपना या दावा बहुत बड़ा और असंभव लगता है, जैसे कि एक साधारण 2 ताकीय (यानी छोटी नौकरी) से लाखों का वर्ल्ड कप (एक बड़ी उपलब्धि) जीतना। इस तरह की तुलना और अतिशयोक्ति से हास्य उत्पन्न होता है।





In [60]:
result

Unnamed: 0,Sentence,Label,Response,Gen_label
0,Is 2 takiye ki naukri main mera lakhon ka worl...,0,"'Humor'\n\nदूसरे इनपुट में ""इस 2 ताकीय की नौकर...",1


In [61]:
result['Gen_label'].value_counts()

Gen_label
1    1
Name: count, dtype: int64

In [67]:
from pydantic import BaseModel, validator
import re

class ModelResponse(BaseModel):
    instruction: str
    input_text: str
    output_label: str

    # @validator('instruction', pre=True, always=True)
    # def extract_instruction(cls, value, values):
    #     # Extract the Instruction part
    #     match = re.search(r'### Instruction:\n(.+?)\n\n### Input:', value, re.DOTALL)
    #     return match.group(1).strip() if match else ""

    # @validator('input_text', pre=True, always=True)
    # def extract_input_text(cls, value, values):
    #     # Extract the Input part
    #     match = re.search(r'### Input:\n(.+?)\n\n### Response:', value, re.DOTALL)
    #     return match.group(1).strip() if match else ""

    @validator('output_label', pre=True, always=True)
    def extract_output_label(cls, value, values):
        # Extract the Response part
        match = re.search(r"^\s*['\"](.*?)['\"]", value)
        return match.group(1).strip() if match else ""

# Example usage
response_text = ("""'Humor' दूसरे इनपुट में "2 ताकीय की नौकरी में मेरा लाखों का वर्ल्ड कप जाए" वाक्यांश एक हास्यास्पद तुलना पेश करता है, जो एक साधारण नौकरी की तुलना एक बड़े खेल प्रतियोगिता जैसे वर्ल्ड कप से करता है। यह व्यंग्यात्मक और हास्यप्रद है, जिससे इसे हास्य के रूप में पहचाना जा सकता है।"""
)

# Create ModelResponse object
parsed_response = ModelResponse(instruction=response_text, input_text=response_text, output_label=response_text)

# Access parsed fields
# print("Instruction:", parsed_response.instruction)
# print("Input Text:", parsed_response.input_text)
print("Output Label:", parsed_response.output_label)

Output Label: Humor


/tmp/ipykernel_1174741/1620833736.py:21: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.9/migration/
  @validator('output_label', pre=True, always=True)


In [65]:
import re

def extract_label(value):
    # Adjust regex to allow optional leading whitespace
    match = re.search(r"^\s*['\"](.*?)['\"]", value)
    return match.group(1).strip() if match else ""

# Example input
value = """    'Humor' दूसरे इनपुट में "2 ताकीय की नौकरी में मेरा लाखों का वर्ल्ड कप जाए" वाक्यांश एक हास्यास्पद तुलना पेश करता है, जो एक साधारण नौकरी की तुलना एक बड़े खेल प्रतियोगिता जैसे वर्ल्ड कप से करता है। यह व्यंग्यात्मक और हास्यप्रद है, जिससे इसे हास्य के रूप में पहचाना जा सकता है।"""

label = extract_label(value)
print(label)

Humor


In [68]:
for index, row in tqdm(list(result.iterrows())):
    parsed_response = ModelResponse(instruction=row['Response'][0], input_text=row['Response'][0], output_label=row['Response'][0])

    # Access parsed fields
    print("Output Label:", parsed_response.output_label)
    # result['Gen_label'][index] = parsed_response.output_label
    # print(result['Gen_label'][index])

100%|██████████| 1/1 [00:00<00:00, 2970.47it/s]

Output Label: 





In [69]:
result['Gen_label'].value_counts()

Gen_label
1    1
Name: count, dtype: int64

In [None]:
y_true = list(map(int, new_result['Label']))
y_pred = list(map(int, new_result['Gen_label']))
print(classification_report(y_true, y_pred, digits=6))