### Loading data

In [1]:
import pandas as pd
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# humor_df = pd.read_csv("/data1/debajyoti/test/llms/data/1Humour_Codemix.csv") # code-mixed humor dataset
# sarcasm_df = pd.read_csv("/data1/debajyoti/test/llms/data/1Sarcasm_Codemix.csv") # code-mixed sarcasm dataset

# def stratified_split(df):   # dataframe (df) is required as argument
#     X, y = list(df['Sentence']), list(df['Tag'])    # X: data, y: labels
    
#     # 80(train), 10(val), 10(test) split of the data
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.20, stratify=y, random_state=42)
#     X_val, X_test, y_val, y_test = train_test_split(
#         X_test, y_test, test_size=0.50, stratify=y_test, random_state=42)
#     cols = {"Sentence" : X_train, "Tag" : y_train}
#     train_df = pd.DataFrame(cols)   # train dataframe
#     cols = {"Sentence" : X_val, "Tag" : y_val}
#     val_df = pd.DataFrame(cols)     # validation dataframe
#     cols = {"Sentence" : X_test, "Tag" : y_test}
#     test_df = pd.DataFrame(cols)    # test dataframe
    
#     return train_df, val_df, test_df

In [2]:
# train_df, val_df, test_df = stratified_split(sarcasm_df)  # function call

# # save the stratified-splits in csv
# train_df.to_csv('/data1/debajyoti/test/llms/data/sarcasm/train.csv', index=False)
# val_df.to_csv('/data1/debajyoti/test/llms/data/sarcasm/val.csv', index=False)
# test_df.to_csv('/data1/debajyoti/test/llms/data/sarcasm/test.csv', index=False)

In [3]:
# train_df_eng = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/1Humour_English(NEW).csv")
# train_df_eng = train_df_eng[:2000]
# train_df_hin = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/1Humour_English(NEW).csv")
# train_df_hin = train_df_hin[:2000]
# train_df = pd.concat([train_df_eng, train_df_hin], ignore_index=True)
# train_df = train_df_hin
train_df = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/splits/humor/train.csv")
val_df = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/splits/humor/val.csv")
test_df = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/splits/humor/test.csv")

test_df

Unnamed: 0,Sentence,Tag
0,Is 2 takiye ki naukri main mera lakhon ka worl...,0
1,"""Naam : Viv Richards\nBaap ka naam : Master Di...",1
2,Sakshi Maharaj is BJP's Digvijaya Singh.,1
3,Aaj @British_Airways ki toh lag gayi bhai.\nHa...,1
4,Ghanta development hoga! Saare paise to electi...,1
...,...,...
291,"Hum to Aam Aadmi hain ji, woh Megalomaniac hai...",1
292,Neend nahi aati hai raaton mein?,0
293,Ghar mein Bipasha Basu ki photo rakhne se bhoo...,1
294,Aur dikhao aur dikhaopic.twitter.com/Ij4R6xNg9V,0


In [4]:
train_df

Unnamed: 0,Sentence,Tag
0,"Jyotiraditya Scindia is like ""Rassi jal gayee ...",1
1,Ishant Sharma ko bahut late utaara.,1
2,.@twinitisha neeche plug nikla hua hai..,0
3,Aaj agar India final me hota to kam se kam New...,0
4,3 stages of life of Mechanical Engineer:\n\n1)...,1
...,...,...
2355,Nitish & Lalu scared of Modi so much that agar...,1
2356,.@ShirishKunder Sabzi bana ke rakhna nahi to F...,1
2357,"""BARKHA se bacha lu tujhe seene se laga lu, aa...",0
2358,"""Bol raha hai mere saath debate karega.. Muh k...",0


In [5]:
train_df['Tag'] = train_df['Tag'].map({0: 'Non-humor', 1: 'Humor'})
train_df = train_df.sample(frac=1).reset_index(drop=True)
train_df

Unnamed: 0,Sentence,Tag
0,.@BJP4India se koi umeed na rakhe.. In chutiyo...,Humor
1,Mere ghar ka khaana sabse best hai. Food blogg...,Humor
2,Ab dekhna ye hai @aloknath ji Tara ka kanyadaa...,Non-humor
3,Bhai kuch kal ke liye bhi chhod de..\n\nRT @Ec...,Non-humor
4,Chetan 'English ka Mast Ram' Bhagat \n\n#Celeb...,Humor
...,...,...
2355,.@OfficeOfRG bhai Dilli se to 1AC me gaye ho.....,Non-humor
2356,.@SRKswarrior1 bas bata raha hu intolerance ky...,Non-humor
2357,Kal gaadi ke niche a gaya. Kismat achi thi ki ...,Humor
2358,Chitrahar mein bijlee udd gayi,Humor


### Load model

In [10]:
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

messages = [
    {"role": "system", "content": """You are a humor recognition assistant judging if an 'Input' is humorous or not. No need to consider any prior context.
    If the 'Input' is humorous, you need to output your final 'Output' as 'Humor'. 
    If the 'Input' is non-humorous, you need to output your final 'Output' as 'Non-humor'."""},
    {"role": "user", "content": "Chalo protocol pata chal gaya. Vice President agar Muslim na hote toh shayad kabhi pata hi nahi chaltapic.twitter.com/bOLAEk0g4E"},
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
)
print(outputs[0]["generated_text"][-1])

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{'role': 'assistant', 'content': 'Humor.'}


### Choosing few-shots for prompting

In [15]:
import pandas as pd
import torch
from sklearn.cluster import KMeans
from transformers import BertTokenizer, BertModel
import numpy as np

# Step 1: Load mBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased').to(device)

# Function to generate embeddings using mBERT with batching
def get_embeddings(texts, batch_size=32):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # Move embeddings to CPU and store them
        embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(embeddings)
    
    # Concatenate all batch embeddings
    return np.concatenate(all_embeddings, axis=0)

# Generate embeddings for the dataset with batching
embeddings = get_embeddings(train_df['Sentence'].tolist(), batch_size=32)

# Step 2: Clustering
num_clusters = 4  # Define number of clusters (adjust based on dataset size)
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(embeddings)

# Step 3: Add cluster labels to the DataFrame
train_df['cluster'] = clusters

# Step 4: Select the example closest to each cluster centroid for both labels
few_shot_examples = []

for cluster in range(num_clusters):
    cluster_data = train_df[train_df['cluster'] == cluster]
    
    # Compute distances to the centroid
    cluster_embeddings = embeddings[cluster_data.index]
    centroid = kmeans.cluster_centers_[cluster]
    distances = np.linalg.norm(cluster_embeddings - centroid, axis=1)
    
    # Select the example closest to the centroid for each label
    if not cluster_data[cluster_data['Tag'] == 'Humor'].empty:
        humor_idx = cluster_data[cluster_data['Tag'] == 'Humor'].index[np.argmin(distances[cluster_data['Tag'] == 'Humor'])]
        print(humor_idx)
        few_shot_examples.append(train_df.loc[humor_idx])
        
    if not cluster_data[cluster_data['Tag'] == 'Non-humor'].empty:
        non_humor_idx = cluster_data[cluster_data['Tag'] == 'Non-humor'].index[np.argmin(distances[cluster_data['Tag'] == 'Non-humor'])]
        few_shot_examples.append(train_df.loc[non_humor_idx])

# Convert the selected examples into a DataFrame
few_shot_data = pd.DataFrame(few_shot_examples)

# Display the selected few-shot examples
print(few_shot_data[['Sentence', 'Tag', 'cluster']])

2126
1021
462
1302
                                               Sentence        Tag  cluster
2126  TL pe koi adarsh liberal jisko patakho se dikk...      Humor        0
2212  Chyamaila Hou de kharcha #Gudipadwapic.twitter...  Non-humor        0
1021  AAP walon ke ye haal hai ki agar Modi pe joke ...      Humor        1
718   1k rt tweets ko bhi aaj kal 5-7 rts milte hai....  Non-humor        1
462   Ab jake mera tweet padha bhai ne. Sala itni de...      Humor        2
298   Baa ji @BDUTT main to bolta hoon Indrani ki ja...  Non-humor        2
1302  Bhai @imVkohli ye century pe century lagakar a...      Humor        3
1704  Kabse iss pyaase jhagadte TL par, sense ki ek ...  Non-humor        3


In [16]:
# Arrange few-shot examples in the desired format
formatted_few_shot_examples = []
for row in few_shot_examples:
    formatted_few_shot_examples.append({"role": "user", "content": row['Sentence']})
    formatted_few_shot_examples.append({"role": "assistant", "content": row['Tag']})

# Display the formatted few-shot examples
for example in formatted_few_shot_examples:
    print(example)

{'role': 'user', 'content': 'TL pe koi adarsh liberal jisko patakho se dikkat ho.. Ye bomb apni gaand me bhar le.. Batti mein laga doongapic.twitter.com/nva3UI1BHD'}
{'role': 'assistant', 'content': 'Humor'}
{'role': 'user', 'content': 'Chyamaila Hou de kharcha #Gudipadwapic.twitter.com/nNKEg1O7Ek'}
{'role': 'assistant', 'content': 'Non-humor'}
{'role': 'user', 'content': 'AAP walon ke ye haal hai ki agar Modi pe joke maro to bhi unhe tarif hi lagti hai.'}
{'role': 'assistant', 'content': 'Humor'}
{'role': 'user', 'content': '1k rt tweets ko bhi aaj kal 5-7 rts milte hai. Kya zamana aa gaya hai.'}
{'role': 'assistant', 'content': 'Non-humor'}
{'role': 'user', 'content': 'Ab jake mera tweet padha bhai ne. Sala itni der se anushka sharma ki TL pe jasoosi kar raha tha !!'}
{'role': 'assistant', 'content': 'Humor'}
{'role': 'user', 'content': 'Baa ji @BDUTT main to bolta hoon Indrani ki jagah Siddhrtha par muder ka case chalaya jaye. Kya bolti ho?'}
{'role': 'assistant', 'content': 'Non-hu

In [17]:
## Store the few shots in a .csv

import csv

# Initialize a list to hold Sentence-Tag pairs
sentence_tag_pairs = []

# Iterate through the few_shot_examples to pair user and assistant content
for i in range(0, len(formatted_few_shot_examples), 2):
    sentence = formatted_few_shot_examples[i]["content"]
    tag = formatted_few_shot_examples[i + 1]["content"]
    sentence_tag_pairs.append({"Sentence": sentence, "Tag": tag})

# Define the CSV file path
csv_file_path = "/data1/debajyoti/code-mix-humor-sarcasm-detection/outputs/humor/few_shots/cm.csv"

# Write the Sentence-Tag pairs to a CSV file
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=["Sentence", "Tag"])
    writer.writeheader()  # Write the header
    writer.writerows(sentence_tag_pairs)  # Write the rows

: 

In [6]:
df = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/outputs/humor/few_shots/cm.csv")
df

Unnamed: 0,Sentence,Tag
0,TL pe koi adarsh liberal jisko patakho se dikk...,Humor
1,Chyamaila Hou de kharcha #Gudipadwapic.twitter...,Non-humor
2,AAP walon ke ye haal hai ki agar Modi pe joke ...,Humor
3,1k rt tweets ko bhi aaj kal 5-7 rts milte hai....,Non-humor
4,Ab jake mera tweet padha bhai ne. Sala itni de...,Humor
5,Baa ji @BDUTT main to bolta hoon Indrani ki ja...,Non-humor
6,Bhai @imVkohli ye century pe century lagakar a...,Humor
7,"Kabse iss pyaase jhagadte TL par, sense ki ek ...",Non-humor


In [7]:
# Convert dataframe to specified format
formatted_few_shot_examples = []
for _, row in df.iterrows():
    formatted_few_shot_examples.append({'role': 'user', 'content': row['Sentence']})
    formatted_few_shot_examples.append({'role': 'assistant', 'content': row['Tag']})
    
formatted_few_shot_examples

[{'role': 'user',
  'content': 'TL pe koi adarsh liberal jisko patakho se dikkat ho.. Ye bomb apni gaand me bhar le.. Batti mein laga doongapic.twitter.com/nva3UI1BHD'},
 {'role': 'assistant', 'content': 'Humor'},
 {'role': 'user',
  'content': 'Chyamaila Hou de kharcha #Gudipadwapic.twitter.com/nNKEg1O7Ek'},
 {'role': 'assistant', 'content': 'Non-humor'},
 {'role': 'user',
  'content': 'AAP walon ke ye haal hai ki agar Modi pe joke maro to bhi unhe tarif hi lagti hai.'},
 {'role': 'assistant', 'content': 'Humor'},
 {'role': 'user',
  'content': '1k rt tweets ko bhi aaj kal 5-7 rts milte hai. Kya zamana aa gaya hai.'},
 {'role': 'assistant', 'content': 'Non-humor'},
 {'role': 'user',
  'content': 'Ab jake mera tweet padha bhai ne. Sala itni der se anushka sharma ki TL pe jasoosi kar raha tha !!'},
 {'role': 'assistant', 'content': 'Humor'},
 {'role': 'user',
  'content': 'Baa ji @BDUTT main to bolta hoon Indrani ki jagah Siddhrtha par muder ka case chalaya jaye. Kya bolti ho?'},
 {'rol

### Create prompt, parse response and  generate response

In [11]:
from transformers import pipeline
import re
import json

# Define the function to create the prompt
def create_prompt(input, num_examples):

    # System prompt 
    system_prompt = {
        "role": "system",
        "content": """You are a humor recognition assistant judging if an 'Input' is sarcastic or not.
        If the 'Input' is sarcastic, you need to give your final 'Output' as 'Humor'. 
        If the 'Input' is non-sarcastic, you need to give your final 'Output' as 'Non-humor'."""
    }

    # # System prompt 
    # system_prompt = {
    #     "role": "system",
    #     "content": """Classify as 'Humor' or 'Non-humor'."""
    # }
    
    
    # # Few-shot examples 
    # few_shot_examples = [
    #     {"role": "user", "content": "Why did the scarecrow win an award? Because he was outstanding in his field."},
    #     {"role": "assistant", "content": "Humor"},
    #     {"role": "user", "content": "This is a straightforward business email with no jokes."},
    #     {"role": "assistant", "content": "Non-humor"},
    # ]

    messages = [system_prompt] + formatted_few_shot_examples + [{"role": "user", "content": input}]

    return messages

def parse_response(response):
    """
    Extracts the classification label and explanation from the model-generated response,
    starting from the line containing the final "Answer:".

    Args:
        response (str): The response generated by the model.

    Returns:
        tuple: A tuple containing the classification label ("Humor" or "Non-humor") 
               and the explanation text.
    """
    # print(response)
    if len(response) > 0:        
        if "Non-humor" in response:
            return "0"
        elif "Humor" in response:
            return "1"
        else:# If no label is found, return None
            return None

def generate_responses(prompt, pipeline):

    response = pipeline(prompt, max_new_tokens=256)[0]["generated_text"][-1]['content']
    return response

In [12]:
def process_data(output_filepath):
    # df = pd.read_csv(data_filepath)
    df = test_df
    # df = df[:5]
    
    pipeline = transformers.pipeline(
        "text-generation",
        model=model_id,
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
    )
    
    results = []
    num_examples = 15 # number of shots in few-shot
    # Loop over the DataFrame and pass the text to generate response
    for index, row in tqdm(list(df.iterrows())):
        prompt = create_prompt(row['Sentence'], num_examples)
        model_response = generate_responses(prompt, pipeline)
        # print(model_response)
        generated_label = parse_response(model_response)

        # veracity, explanation = parse_response(model_response)
        
        results.append({
            'Sentence': row['Sentence'],
            'Label': row['Tag'],
            'Response': model_response,
            'Gen_label': generated_label,
            # 'Explanation': explanation
        })
        # break
    
    results = pd.DataFrame(results)
    
    return results

# Example file paths, replace with your actual paths
# data_filepath = '/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/splits/humor/test.csv'
output_filepath = '/data1/debajyoti/test/llms/data/humor'

# Run the data processing
result = process_data(output_filepath)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/296 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  0%|          | 1/296 [00:00<00:32,  8.98it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 2/296 [00:00<00:31,  9.25it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 3/296 [00:00<00:31,  9.26it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|▏         | 4/296 [00:00<00:31,  9.36it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 5/296 [00:00<00:30,  9.45it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 6/296 [00:00<00:30,  9.50it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 7/296 [00:00<00:30,  9.55it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|▎         | 8/296 [00:00<00:30,  9.55it/s]Setting `pad_toke

In [13]:
result

Unnamed: 0,Sentence,Label,Response,Gen_label
0,Is 2 takiye ki naukri main mera lakhon ka worl...,0,Humor,1
1,"""Naam : Viv Richards\nBaap ka naam : Master Di...",1,Humor,1
2,Sakshi Maharaj is BJP's Digvijaya Singh.,1,Humor,1
3,Aaj @British_Airways ki toh lag gayi bhai.\nHa...,1,Humor,1
4,Ghanta development hoga! Saare paise to electi...,1,Humor,1
...,...,...,...,...
291,"Hum to Aam Aadmi hain ji, woh Megalomaniac hai...",1,Humor,1
292,Neend nahi aati hai raaton mein?,0,Non-humor,0
293,Ghar mein Bipasha Basu ki photo rakhne se bhoo...,1,Humor,1
294,Aur dikhao aur dikhaopic.twitter.com/Ij4R6xNg9V,0,Humor,1
