### Loading data

In [1]:
import pandas as pd
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# humor_df = pd.read_csv("/data1/debajyoti/test/llms/data/1Humour_Codemix.csv") # code-mixed humor dataset
# sarcasm_df = pd.read_csv("/data1/debajyoti/test/llms/data/1Sarcasm_Codemix.csv") # code-mixed sarcasm dataset

# def stratified_split(df):   # dataframe (df) is required as argument
#     X, y = list(df['Sentence']), list(df['Tag'])    # X: data, y: labels
    
#     # 80(train), 10(val), 10(test) split of the data
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.20, stratify=y, random_state=42)
#     X_val, X_test, y_val, y_test = train_test_split(
#         X_test, y_test, test_size=0.50, stratify=y_test, random_state=42)
#     cols = {"Sentence" : X_train, "Tag" : y_train}
#     train_df = pd.DataFrame(cols)   # train dataframe
#     cols = {"Sentence" : X_val, "Tag" : y_val}
#     val_df = pd.DataFrame(cols)     # validation dataframe
#     cols = {"Sentence" : X_test, "Tag" : y_test}
#     test_df = pd.DataFrame(cols)    # test dataframe
    
#     return train_df, val_df, test_df

In [2]:
# train_df, val_df, test_df = stratified_split(sarcasm_df)  # function call

# # save the stratified-splits in csv
# train_df.to_csv('/data1/debajyoti/test/llms/data/sarcasm/train.csv', index=False)
# val_df.to_csv('/data1/debajyoti/test/llms/data/sarcasm/val.csv', index=False)
# test_df.to_csv('/data1/debajyoti/test/llms/data/sarcasm/test.csv', index=False)

In [28]:
train_df_eng = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/sarcasm_v2/Sarcasm_English_iacv2.csv")
train_df_eng = train_df_eng[:2000]
train_df_hin = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/sarcasm_v2/Sarcasm_Hindi_iacv2.csv")
train_df_hin = train_df_hin[:2000]
train_df = pd.concat([train_df_eng, train_df_hin], ignore_index=True)
# train_df = train_df_hin
# train_df = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/1Sarcasm_Codemix.csv")
val_df = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/splits/sarcasm/val.csv")
test_df = pd.read_csv("/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/splits/sarcasm/test.csv")

test_df

Unnamed: 0,Sentence,Tag
0,"Mera bhanja mujhe ""papa ka sala"" bulata hai......",0
1,Tu sale TRIPLE TALAQ nd HALALA Ke baad ki sant...,0
2,aur aap a0ni politics karne me vyast hai.. #Bi...,0
3,Theek hai waha at least Janki ma ka mandir hai...,0
4,Sir mujhe cricket acdemy join karna hai mai ka...,0
...,...,...
520,aaj bjp aur chadhi gang aise khus ho rahe hai ...,0
521,@SGanguly99 sir aapnio jodi aamar saath na den...,0
522,#Irony: Pran ka pran chala gaya.. RIP Sher Khan!,1
523,Ye bhi mat bhulo ki triple talaq ko ban lagane...,0


In [29]:
train_df

Unnamed: 0,Sentence,Tag
0,"If that's true, then Freedom of Speech is doom...",0
1,Neener neener - is it time to go in from the p...,0
2,"Just like the plastic gun fear, the armour pie...",0
3,So geology is a religion because we weren't he...,0
4,Well done Monty. Mark that up as your first ev...,0
...,...,...
3995,"यह बात है, हालांकि;यहां तक ​​कि एक स्व-प्रोफ़े...",0
3996,वास्तव में आपकी पोस्टें सरल सबूत प्रदान कर रही...,0
3997,"7 दिन की रचना मैं जिक्र कर रहा हूं, न कि केवल ...",0
3998,"ऊपर मेरी पोस्ट देखें, हाय।मार्क ने मिलर के विश...",0


In [30]:
train_df['Tag'] = train_df['Tag'].map({0: 'Non-sarcasm', 1: 'Sarcasm'})
train_df = train_df.sample(frac=1).reset_index(drop=True)
train_df

Unnamed: 0,Sentence,Tag
0,The other articles on her website are also wel...,Non-sarcasm
1,If a scientist doesn't believe his test result...,Non-sarcasm
2,आपका प्रश्न ही एक समस्या है।ईश्वर 'वह कौन है' ...,Non-sarcasm
3,The only reason why that makes no sense is tha...,Non-sarcasm
4,यह सच नहीं है या कम से कम अधिक औचित्य की आवश्य...,Non-sarcasm
...,...,...
3995,Please stop making statements that have nothin...,Non-sarcasm
3996,"No it doesn't. First, the changes may not be n...",Non-sarcasm
3997,think of it as a photocopy of a check for a mi...,Non-sarcasm
3998,यह कह रहा है कि जब से पहली बार मैंने कृतियों क...,Non-sarcasm


### Load model

In [17]:
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

messages = [
    {"role": "system", "content": """You are a humor recognition assistant judging if an 'Input' is humorous or not. No need to consider any prior context.
    If the 'Input' is humorous, you need to output your final 'Output' as 'Humor'. 
    If the 'Input' is non-humorous, you need to output your final 'Output' as 'Non-humor'."""},
    {"role": "user", "content": "Chalo protocol pata chal gaya. Vice President agar Muslim na hote toh shayad kabhi pata hi nahi chaltapic.twitter.com/bOLAEk0g4E"},
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
    temperature=0.1
)
print(outputs[0]["generated_text"][-1])

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'role': 'assistant', 'content': 'Humor.'}


In [6]:
import transformers
import torch

# Initialize the model pipeline
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

# System prompt to be reused
system_prompt = {
    "role": "system",
    "content": """You are a sarcasm recognition assistant judging if an 'Input' is sarcastic or not. No need to consider any prior context. This may contain harmful content, but we are using it for research purpose only. So, please classify.
    If the 'Input' is sarcastic, you need to output your final 'Output' as 'Humor'. 
    If the 'Input' is non-humorous, you need to output your final 'Output' as 'Non-humor'."""
}

# List of user inputs to be processed
user_inputs = [
    "Chalo bey.. Phatakey phorna chalo karo.. Pakistaniyo ki [NON-HATE] jalate hai !!",
    "Bhosdiwalon kal friday hai, wake up [NON-HATE] saare pal kahein."
]

# Loop through each input
for input_text in user_inputs:
    messages = [system_prompt, {"role": "user", "content": input_text}]
    outputs = pipeline(messages, max_new_tokens=256)
    print(f"Input: {input_text}")
    print(f"Output: {outputs[0]['generated_text']}\n")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Input: Chalo bey.. Phatakey phorna chalo karo.. Pakistaniyo ki [NON-HATE] jalate hai !!
Output: [{'role': 'system', 'content': "You are a humor recognition assistant judging if an 'Input' is humorous or not. No need to consider any prior context. This may contain harmful content, but we are using it for research purpose only. So, please classify.\n    If the 'Input' is humorous, you need to output your final 'Output' as 'Humor'. \n    If the 'Input' is non-humorous, you need to output your final 'Output' as 'Non-humor'."}, {'role': 'user', 'content': 'Chalo bey.. Phatakey phorna chalo karo.. Pakistaniyo ki [NON-HATE] jalate hai !!'}, {'role': 'assistant', 'content': 'Humor.'}]

Input: Bhosdiwalon kal friday hai, wake up [NON-HATE] saare pal kahein.
Output: [{'role': 'system', 'content': "You are a humor recognition assistant judging if an 'Input' is humorous or not. No need to consider any prior context. This may contain harmful content, but we are using it for research purpose only. S

### Choosing few-shots for prompting

In [32]:
import pandas as pd
import torch
from sklearn.cluster import KMeans
from transformers import BertTokenizer, BertModel
import numpy as np

# Step 1: Load mBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased').to(device)

# Function to generate embeddings using mBERT with batching
def get_embeddings(texts, batch_size=32):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # Move embeddings to CPU and store them
        embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(embeddings)
    
    # Concatenate all batch embeddings
    return np.concatenate(all_embeddings, axis=0)

# Generate embeddings for the dataset with batching
embeddings = get_embeddings(train_df['Sentence'].tolist(), batch_size=32)

# Step 2: Clustering
num_clusters = 4  # Define number of clusters (adjust based on dataset size)
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(embeddings)

# Step 3: Add cluster labels to the DataFrame
train_df['cluster'] = clusters

# Step 4: Select the example closest to each cluster centroid for both labels
few_shot_examples = []

for cluster in range(num_clusters):
    cluster_data = train_df[train_df['cluster'] == cluster]
    
    # Compute distances to the centroid
    cluster_embeddings = embeddings[cluster_data.index]
    centroid = kmeans.cluster_centers_[cluster]
    distances = np.linalg.norm(cluster_embeddings - centroid, axis=1)
    print(cluster_data[cluster_data['Tag'] == 'Sarcasm'])
    
    # Select the example closest to the centroid for each label
    if not cluster_data[cluster_data['Tag'] == 'Sarcasm'].empty:
        humor_idx = cluster_data[cluster_data['Tag'] == 'Sarcasm'].index[np.argmin(distances[cluster_data['Tag'] == 'Sarcasm'])]
        print(humor_idx)
        few_shot_examples.append(train_df.loc[humor_idx])
        
    if not cluster_data[cluster_data['Tag'] == 'Non-sarcasm'].empty:
        non_humor_idx = cluster_data[cluster_data['Tag'] == 'Non-sarcasm'].index[np.argmin(distances[cluster_data['Tag'] == 'Non-sarcasm'])]
        few_shot_examples.append(train_df.loc[non_humor_idx])

# Convert the selected examples into a DataFrame
few_shot_data = pd.DataFrame(few_shot_examples)

# Display the selected few-shot examples
print(few_shot_data[['Sentence', 'Tag', 'cluster']])

Empty DataFrame
Columns: [Sentence, Tag, cluster]
Index: []
Empty DataFrame
Columns: [Sentence, Tag, cluster]
Index: []
Empty DataFrame
Columns: [Sentence, Tag, cluster]
Index: []
Empty DataFrame
Columns: [Sentence, Tag, cluster]
Index: []
                                               Sentence          Tag  cluster
1500  Because you seem to lose sight of that with yo...  Non-sarcasm        0
3759  आप जानते हैं कि अधिकांश सार्वजनिक हेल्थकेयर सि...  Non-sarcasm        1
996   I believe organized religion increases ignoran...  Non-sarcasm        2
1022  गलत- मुझे लगता है कि ऑलसेन उन लोगों का हवाला द...  Non-sarcasm        3


: 

In [14]:
import pandas as pd
import torch
from sklearn.cluster import KMeans
from transformers import BertTokenizer, BertModel
import numpy as np
import random

# Step 1: Load mBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased').to(device)

# Function to generate embeddings using mBERT with batching
def get_embeddings(texts, batch_size=32):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # Move embeddings to CPU and store them
        embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(embeddings)
    
    # Concatenate all batch embeddings
    return np.concatenate(all_embeddings, axis=0)

# Generate embeddings for the dataset with batching
embeddings = get_embeddings(train_df['Sentence'].tolist(), batch_size=32)

# Step 2: Clustering
num_clusters = 6  # Define number of clusters (adjust based on dataset size)
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(embeddings)

# Step 3: Add cluster labels to the DataFrame
train_df['cluster'] = clusters

# Step 4: Select the examples for both labels
sarcasm_samples = []
non_sarcasm_samples = []

for cluster in range(num_clusters):
    cluster_data = train_df[train_df['cluster'] == cluster]
    
    # Compute distances to the centroid
    cluster_embeddings = embeddings[cluster_data.index]
    centroid = kmeans.cluster_centers_[cluster]
    distances = np.linalg.norm(cluster_embeddings - centroid, axis=1)
    
    # Select the example closest to the centroid for each label
    if not cluster_data[cluster_data['Tag'] == 'Sarcasm'].empty:
        sarcasm_idx = cluster_data[cluster_data['Tag'] == 'Sarcasm'].index[np.argmin(distances[cluster_data['Tag'] == 'Sarcasm'])]
        sarcasm_samples.append(train_df.loc[sarcasm_idx])
        
    if not cluster_data[cluster_data['Tag'] == 'Non-sarcasm'].empty:
        non_sarcasm_idx = cluster_data[cluster_data['Tag'] == 'Non-sarcasm'].index[np.argmin(distances[cluster_data['Tag'] == 'Non-sarcasm'])]
        non_sarcasm_samples.append(train_df.loc[non_sarcasm_idx])

# Step 5: Select 6 non-sarcastic and 2 sarcastic examples
sarcasm_samples = pd.DataFrame(sarcasm_samples)
non_sarcasm_samples = pd.DataFrame(non_sarcasm_samples)

# Ensure there are enough samples to select
# num_sarcasm = min(2, len(sarcasm_samples))
# num_non_sarcasm = min(6, len(non_sarcasm_samples))
num_sarcasm = 2
num_non_sarcasm = 6

selected_sarcasm = sarcasm_samples.sample(n=num_sarcasm, random_state=42)
selected_non_sarcasm = non_sarcasm_samples.sample(n=num_non_sarcasm, random_state=42)

# Combine the selected examples into a list
few_shot_examples = selected_sarcasm.to_dict(orient='records') + selected_non_sarcasm.to_dict(orient='records')

# Randomly shuffle the few_shot_examples list
random.shuffle(few_shot_examples)

# Convert the selected examples into a DataFrame for display
few_shot_data = pd.DataFrame(few_shot_examples)

# Display the selected few-shot examples
print(few_shot_data[['Sentence', 'Tag', 'cluster']])

ValueError: a must be greater than 0 unless no samples are taken

In [8]:
# Arrange few-shot examples in the desired format
formatted_few_shot_examples = []
for row in few_shot_examples:
    formatted_few_shot_examples.append({"role": "user", "content": row['Sentence']})
    formatted_few_shot_examples.append({"role": "assistant", "content": row['Tag']})

# Display the formatted few-shot examples
for example in formatted_few_shot_examples:
    print(example)

{'role': 'user', 'content': 'आप स्पष्ट रूप से एक ओबामा समर्थक हैं, वे केवल ऐसे लोग हैं जो किसी के खिलाफ मुकदमों की धमकी देने की अपनी अपमानजनक रणनीति पर परेशान नहीं होंगे जो उसके खिलाफ प्रतिकूल विज्ञापन चलाता है, इस प्रकार यह दिखाते हैं कि वह पहले संशोधन का समर्थन नहीं करता है।'}
{'role': 'assistant', 'content': 'Non-sarcasm'}
{'role': 'user', 'content': 'बेशक सभी के पास अपने विचारों के लिए कुछ आधार है।इससे भी इनकार नहीं किया।हालांकि ऐसे तरीके हैं जो उस "आधार" का उपयोग करना निश्चित रूप से एक थोपा है और समलैंगिक अधिकारों की लड़ाई एक प्रमुख उदाहरण है।कई, कई, कई लोग हैं जिन्होंने समलैंगिक विवाह और अन्य समलैंगिक अधिकारों को रोकने/ब्लॉक करने के लिए काम किया है, जब वे, खुद, समलैंगिक नहीं हैं।यह केवल निर्णय पारित नहीं है - जो कि थोप रहा है।मैं यहाँ पूछ रहा हूँ कि आर्ची, आप, जेपी, आदि खुद ही हैं।'}
{'role': 'assistant', 'content': 'Non-sarcasm'}
{'role': 'user', 'content': 'गलत- मुझे लगता है कि ऑलसेन उन लोगों का हवाला देते हुए बहुत विशिष्ट है, जो उन्हें लगता है कि इस उदाहरण में निशान को खत्म कर

In [9]:
## Store the few shots in a .csv

import csv

# Initialize a list to hold Sentence-Tag pairs
sentence_tag_pairs = []

# Iterate through the few_shot_examples to pair user and assistant content
for i in range(0, len(formatted_few_shot_examples), 2):
    sentence = formatted_few_shot_examples[i]["content"]
    tag = formatted_few_shot_examples[i + 1]["content"]
    sentence_tag_pairs.append({"Sentence": sentence, "Tag": tag})

# Define the CSV file path
csv_file_path = "/data1/debajyoti/code-mix-humor-sarcasm-detection/outputs/sarcasm/few_shots/iacv2_hin_llama.csv"

# Write the Sentence-Tag pairs to a CSV file
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=["Sentence", "Tag"])
    writer.writeheader()  # Write the header
    writer.writerows(sentence_tag_pairs)  # Write the rows

### Create prompt, parse response and  generate response

In [10]:
from transformers import pipeline
import re
import json

# Define the function to create the prompt
def create_prompt(input, num_examples):

    # System prompt 
    system_prompt = {
        "role": "system",
        "content": """You are a sarcasm recognition assistant judging if an 'Input' is sarcastic or not.
        If the 'Input' is sarcastic, you need to give your final 'Output' as 'Sarcasm'. 
        If the 'Input' is non-sarcastic, you need to give your final 'Output' as 'Non-sarcasm'."""
    }

    # # System prompt 
    # system_prompt = {
    #     "role": "system",
    #     "content": """Classify as 'Humor' or 'Non-humor'."""
    # }
    
    
    # # Few-shot examples 
    # few_shot_examples = [
    #     {"role": "user", "content": "Why did the scarecrow win an award? Because he was outstanding in his field."},
    #     {"role": "assistant", "content": "Humor"},
    #     {"role": "user", "content": "This is a straightforward business email with no jokes."},
    #     {"role": "assistant", "content": "Non-humor"},
    # ]

    messages = [system_prompt] + formatted_few_shot_examples + [{"role": "user", "content": input}]

    return messages

def parse_response(response):
    # print(response)
    if len(response) > 0:        
        if "Non-sarcasm" in response:
            return "0"
        elif "Sarcasm" in response:
            return "1"
        else:# If no label is found, return None
            return None

def generate_responses(prompt, pipeline):

    response = pipeline(prompt, max_new_tokens=256)[0]["generated_text"][-1]['content']
    return response

In [11]:
def process_data(data_filepath, output_filepath):
    # df = pd.read_csv(data_filepath)
    df = test_df
    # df = df[:5]
    
    pipeline = transformers.pipeline(
        "text-generation",
        model=model_id,
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
    )
    
    results = []
    num_examples = 15 # number of shots in few-shot
    # Loop over the DataFrame and pass the text to generate response
    for index, row in tqdm(list(df.iterrows())):
        prompt = create_prompt(row['Sentence'], num_examples)
        model_response = generate_responses(prompt, pipeline)
        # print(model_response)
        generated_label = parse_response(model_response)

        # veracity, explanation = parse_response(model_response)
        
        results.append({
            'Sentence': row['Sentence'],
            'Label': row['Tag'],
            'Response': model_response,
            'Gen_label': generated_label,
            # 'Explanation': explanation
        })
        # break
    
    results = pd.DataFrame(results)
    
    return results

# Example file paths, replace with your actual paths
data_filepath = '/data1/debajyoti/code-mix-humor-sarcasm-detection/Dataset/splits/humor/test.csv'
output_filepath = '/data1/debajyoti/test/llms/data/humor'

# Run the data processing
result = process_data(data_filepath, output_filepath)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/525 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 1/525 [00:00<01:50,  4.73it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 2/525 [00:00<01:34,  5.53it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 3/525 [00:00<01:29,  5.86it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 4/525 [00:00<01:26,  6.01it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 5/525 [00:00<01:29,  5.79it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 6/525 [00:01<01:27,  5.94it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|▏         | 7/525 [00:01<01:25,  6.03it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 8/525 [00:01<01:24,  6.10it/s]S

In [12]:
result

Unnamed: 0,Sentence,Label,Response,Gen_label
0,"Mera bhanja mujhe ""papa ka sala"" bulata hai......",0,Sarcasm,1
1,Tu sale TRIPLE TALAQ nd HALALA Ke baad ki sant...,0,Sarcasm,1
2,aur aap a0ni politics karne me vyast hai.. #Bi...,0,Sarcasm,1
3,Theek hai waha at least Janki ma ka mandir hai...,0,Sarcasm,1
4,Sir mujhe cricket acdemy join karna hai mai ka...,0,Non-sarcasm,0
...,...,...,...,...
520,aaj bjp aur chadhi gang aise khus ho rahe hai ...,0,Sarcasm,1
521,@SGanguly99 sir aapnio jodi aamar saath na den...,0,Sarcasm,1
522,#Irony: Pran ka pran chala gaya.. RIP Sher Khan!,1,Sarcasm,1
523,Ye bhi mat bhulo ki triple talaq ko ban lagane...,0,Sarcasm,1
