In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import transformers
from transformers import AutoModelForSequenceClassification
from transformers import AutoModel

In [None]:
train_data = pd.read_csv("/kaggle/input/emnlp20230-task1/train.csv",encoding='utf8')
train_data.head(20)

In [None]:
val_data = pd.read_csv("/kaggle/input/emnlp20230-task1/dev.csv",encoding='utf8')
val_data.head(20)

In [None]:
# Direct Violence 	2
# Passive Violence	1
# Non-Violence	0

## Splitting Data into Violent and Non-Violent (for Model1)

In [None]:
# Violence	1
# Non-Violence	0

In [None]:
def split_main_data(label):
    """Collapse the 3-class label into the binary Model1 target.

    Any label greater than 0 (passive or direct violence) becomes 1;
    everything else becomes 0.
    """
    is_violent = label > 0
    return int(is_violent)

In [None]:
# Copy first: a plain assignment would only alias train_data, so relabelling
# would also overwrite the original 3-class labels held in train_data.
train_data_splitted = train_data.copy()
# Binary target for Model1: 1 = any violence, 0 = non-violence
train_data_splitted["label"] = train_data["label"].apply(split_main_data)
train_data_splitted.head(20)

In [None]:
# Copy first: a plain assignment would only alias val_data, so relabelling
# would also overwrite the original 3-class labels held in val_data.
val_data_splitted = val_data.copy()
# Binary target for Model1: 1 = any violence, 0 = non-violence
val_data_splitted["label"] = val_data["label"].apply(split_main_data)
val_data_splitted.head(20)

In [None]:
train_data_splitted.to_csv("train_data_splitted.csv",index=False)
val_data_splitted.to_csv("val_data_splitted.csv",index=False)

## Model1 [Violence(1) Vs. Non-Violence(0)]

In [None]:
# NOTE(review): pandas is already imported at the top of this notebook, so an
# upgrade here presumably only takes effect after a kernel restart; the version
# is also unpinned — consider pinning it in a setup cell.
!pip install --upgrade pandas

In [None]:
!pip install --upgrade datasets

In [None]:
from datasets import load_dataset

In [None]:
load_train_data_splitted = load_dataset('csv',data_files="/kaggle/working/train_data_splitted.csv")

In [None]:
load_train_data_splitted

In [None]:
load_val_data_splitted = load_dataset('csv',data_files="/kaggle/working/val_data_splitted.csv")
load_val_data_splitted

In [None]:
splitted_data = load_train_data_splitted

In [None]:
splitted_data['validation'] = load_val_data_splitted['train']

In [None]:
splitted_data

In [None]:
from transformers import AutoTokenizer
model_name = "bert-base-multilingual-cased"
tokenizer1 = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize(batch):
    """Encode the 'text' field of a batch with tokenizer1.

    Every sequence is padded to, and truncated at, 230 tokens so all
    encoded examples have the same length.
    """
    encode_options = dict(padding="max_length", max_length=230, truncation=True)
    return tokenizer1(batch['text'], **encode_options)

In [None]:
splitted_data_encoded  = splitted_data.map(tokenize,batched=True,batch_size=16)

In [None]:
splitted_data_encoded

In [None]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
num_labels=2
model1 = (AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=num_labels).to(device))

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and macro-averaged F1 for a prediction object.

    `pred` must expose `label_ids` (gold labels) and `predictions`
    (per-class scores); the predicted class is the argmax over the last axis.
    Returns a dict with the keys "Accuracy" and "Macro F1 Score".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    accuracy = accuracy_score(y_true, y_pred)
    return {"Accuracy": accuracy, "Macro F1 Score": macro_f1}

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
def model_init(trial):
    """Instantiate a fresh pretrained classifier for Model1.

    Returning the single global `model1` object would make every call hand
    back the same (possibly already fine-tuned) weights, so a repeated
    training run would continue from the previous one. Building a new model
    on each call keeps training runs independent.

    `trial` is accepted for the Trainer's `model_init` calling convention and
    is not used.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    ).to(device)

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration for Model1 (violent vs. non-violent).
batch_size = 16
# Steps needed to pass over the train split once at this batch size, i.e.
# roughly one log per epoch (assumes a single device — TODO confirm).
logging_steps = len(splitted_data_encoded["train"])//batch_size
finetuned_model_name1 = f"{model_name}-VITD-m1"
training_args1 = TrainingArguments(
    # The string "none" switches every reporting integration off; passing the
    # Python value None does not disable reporting.
    report_to="none",
    output_dir=finetuned_model_name1,
    num_train_epochs=5,
    learning_rate=1e-5,
    seed=42,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # No weight decay, per-epoch checkpointing or best-model reloading is
    # configured for Model1.
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=True,
    log_level="error",
)

In [None]:
from transformers import trainer

# Trainer for Model1: the model comes from model_init, data from the encoded
# binary (violent / non-violent) splits.
trainer1 = Trainer(
    args=training_args1,
    model_init=model_init,
    train_dataset=splitted_data_encoded['train'],
    eval_dataset=splitted_data_encoded['validation'],
    tokenizer=tokenizer1,
    compute_metrics=compute_metrics,
)

In [None]:
trainer1.train()

In [None]:
trainer1.push_to_hub()

In [None]:
preds_output = trainer1.predict(splitted_data_encoded['validation'])
preds_output.metrics

## Model2 [Direct-Violence(1) Vs. Passive-Violence(0)]

In [None]:
data = pd.read_csv("/kaggle/input/emnlp20230-task1/train.csv",encoding='utf8')

In [None]:
data.head(20)

In [None]:
violent_train_data = data[data.label!=0]

In [None]:
violent_train_data.head()

In [None]:
# Rebuild from the raw frame so this cell is idempotent: re-applying the
# 2 -> 1 / else -> 0 mapping to already-mapped labels would turn every label
# into 0. The explicit copy also avoids assigning into a filtered slice.
violent_train_data = data.loc[data["label"] != 0].copy()
# Model2 target: 1 = direct violence (original 2), 0 = passive violence (original 1)
violent_train_data["label"] = (violent_train_data["label"] == 2).astype(int)

In [None]:
violent_train_data.head()

In [None]:
data_val = pd.read_csv("/kaggle/input/emnlp20230-task1/dev.csv",encoding='utf8')
data_val.head(20)

In [None]:
len(data_val[data_val.label!=0])

In [None]:
violent_val_data = data_val[data_val.label!=0]
violent_val_data.head(20)

In [None]:
# Rebuild from the raw frame so this cell is idempotent: re-applying the
# 2 -> 1 / else -> 0 mapping to already-mapped labels would turn every label
# into 0. The explicit copy also avoids assigning into a filtered slice.
violent_val_data = data_val.loc[data_val["label"] != 0].copy()
# Model2 target: 1 = direct violence (original 2), 0 = passive violence (original 1)
violent_val_data["label"] = (violent_val_data["label"] == 2).astype(int)

In [None]:
violent_val_data.head(20)

In [None]:
violent_train_data.to_csv("violent_train_data.csv",index=False)
violent_val_data.to_csv("violent_val_data.csv",index=False)

In [None]:
from datasets import load_dataset

In [None]:
load_violent_train_data = load_dataset('csv',data_files="/kaggle/working/violent_train_data.csv")
load_violent_val_data = load_dataset('csv',data_files="/kaggle/working/violent_val_data.csv")

In [None]:
violent_data = load_violent_train_data
violent_data['validation'] = load_violent_val_data['train']

In [None]:
violent_data

In [None]:
from transformers import AutoTokenizer
model_name = "bert-base-multilingual-cased"
tokenizer2 = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Encode the 'text' field of a batch with tokenizer2.

    Every sequence is padded to, and truncated at, 230 tokens.
    NOTE: this re-binds the name `tokenize` that an earlier cell defined
    around tokenizer1; later cells calling `tokenize` get this version.
    """
    encode_options = dict(padding="max_length", max_length=230, truncation=True)
    return tokenizer2(batch['text'], **encode_options)

violent_data_encoded  = violent_data.map(tokenize,batched=True,batch_size=8)

In [None]:
violent_data_encoded

In [None]:
# Sanity check on the Model2 target: list the distinct label values present
# in each split after the relabelling to 0 / 1.
unique_train_labels = violent_data_encoded['train'].unique('label')
unique_validation_labels = violent_data_encoded['validation'].unique('label')

print("Unique labels in train split:", unique_train_labels)
print("Unique labels in validation split:", unique_validation_labels)

In [None]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
num_labels=2
model2 = (AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=num_labels).to(device))

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Return accuracy and macro F1 for a prediction object.

    Same contract as the metric function defined in the Model1 section:
    gold labels are read from `pred.label_ids`, predicted classes are the
    argmax of `pred.predictions` over the last axis.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    scores = {}
    macro_f1 = f1_score(gold, predicted, average="macro")
    scores["Accuracy"] = accuracy_score(gold, predicted)
    scores["Macro F1 Score"] = macro_f1
    return scores

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration for Model2 (direct vs. passive violence).
batch_size = 16
finetuned_model_name2 = f"{model_name}-VITD-m2"
# Steps needed to pass over the train split once at this batch size
logging_steps = len(violent_data_encoded["train"])//batch_size

training_args2 = TrainingArguments(
    # Output / publishing
    output_dir=finetuned_model_name2,
    push_to_hub=True,
    # NOTE(review): None is not the same as the string "none"; if the intent
    # is to disable reporting integrations, confirm and pass "none" instead.
    report_to=None,
    # Optimisation
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint every epoch, then reload the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Logging
    logging_steps=logging_steps,
    log_level="error",
    disable_tqdm=False,
)

In [None]:
import os
# NOTE(review): the Model1 section sets WANDB_DISABLED to "true"; this cell
# flips it to "false" and runs after training_args2 has already been built —
# confirm whether Weights & Biases logging is actually wanted for Model2.
os.environ["WANDB_DISABLED"] = "false"

In [None]:
from transformers import trainer

# Trainer for Model2: fine-tunes model2 on the violent-only splits
# (direct vs. passive violence).
trainer2 = Trainer(
    args=training_args2,
    model=model2,
    train_dataset=violent_data_encoded['train'],
    eval_dataset=violent_data_encoded['validation'],
    tokenizer=tokenizer2,
    compute_metrics=compute_metrics,
)

In [None]:
trainer2.train()

In [None]:
trainer2.push_to_hub()

In [None]:
preds_output = trainer2.predict(violent_data_encoded['validation'])
preds_output.metrics

In [None]:
preds_output

## Test Prediction

In [None]:
test_data_df = pd.read_csv("/kaggle/input/emnlp20230-task1/test.csv",encoding='utf8')

In [None]:
test_data_df.head()

In [None]:
test_data = load_dataset('csv',data_files="/kaggle/input/emnlp20230-task1/test.csv")

### Using Model1 [Predicting between Violence(1) and Non-Violence(0)]

In [None]:
test_data_encoded = test_data.map(tokenize,batched=True,batch_size=8)

In [None]:
test_data_encoded

In [None]:
test_output = trainer1.predict(test_data_encoded["train"])

In [None]:
test_output

In [None]:
m1_output = test_output.predictions.argmax(-1)

In [None]:
m1_output

In [None]:
m1_output = pd.DataFrame(m1_output, columns=['label'])

In [None]:
m1_output_with_text_label = pd.concat([test_data_df, m1_output], axis=1)

In [None]:
m1_output_with_text_label

In [None]:
m1_output_with_text_label.to_csv("m1_output_with_text_label.csv")

### Using Model2 to classify the texts that Model1 predicted as violent

In [None]:
violent_test_data = m1_output_with_text_label[m1_output_with_text_label['label']==1]

In [None]:
violent_test_data.head()

In [None]:
violent_test_data.to_csv("violent_test_data.csv",index=False)

In [None]:
violent_test_data_loaded = load_dataset('csv',data_files="/kaggle/working/violent_test_data.csv")

In [None]:
violent_test_data_loaded

In [None]:
violent_test_data_encoded  = violent_test_data_loaded.map(tokenize,batched=True,batch_size=8)

In [None]:
violent_test_data_encoded

In [None]:
m2_output = trainer2.predict(violent_test_data_encoded['train'])

In [None]:
# Turn Model2's prediction scores into class ids (argmax over the last axis).
# The frame reuses violent_test_data's index so that the axis=1 concat in the
# following cells pairs each prediction with its own row rather than with
# positions 0..n-1, and uses its own column name because violent_test_data
# already carries Model1's 'label' column.
m2_output3 = pd.DataFrame(
    m2_output.predictions.argmax(-1),
    index=violent_test_data.index,
    columns=["m2_label"],
)
m2_output3

In [None]:
violent_test_data

In [None]:
m2_output_with_text_label = pd.concat([violent_test_data, m2_output3], axis=1)

In [None]:
m2_output_with_text_label.shape

In [None]:
m2_output_with_text_label

In [None]:
m2_output_with_text_label.to_csv("m2_output_with_text_label.csv")

# API Call

### Using Model1 [Violence(1) vs non-violence(0)]

In [None]:
from transformers import AutoTokenizer, pipeline
import torch
import pandas as pd

# Load the published Model1 checkpoint and its tokenizer as a
# text-classification pipeline
model = pipeline("text-classification", model="ka05ar/banglabert-VITD-m1", tokenizer="ka05ar/banglabert-VITD-m1")

# Load the test set and pull out the raw texts
data = pd.read_csv("/kaggle/input/emnlp20230-task1/test.csv", encoding='utf8')
texts = data["text"].tolist()

# Classify every text. truncation=True is forwarded to the tokenizer so an
# over-long text is cut to the model's maximum length instead of raising.
with torch.no_grad():
    predictions = model(texts, truncation=True)

# Each prediction is a dict holding the predicted label and its score
predicted_labels = [pred['label'] for pred in predictions]
probs = [pred['score'] for pred in predictions]

# Attach the predictions to the test frame (it is written to CSV in a later cell)
data['predicted_label'] = predicted_labels
data['predicted_probability'] = probs
    


In [None]:
data

In [None]:
# Map the pipeline's string label to the binary id:
# 'LABEL_1' -> 1 (violent), anything else -> 0 (non-violent)
data['label'] = (data['predicted_label'] == 'LABEL_1').astype(int)

In [None]:
data

In [None]:
data.to_csv("test_predicted_results_m1.csv", index=False)

### Using Model2 [Passive Violence(0-->1) vs Direct-violence(1-->2)]

In [None]:
data

In [None]:
violent_only_data = data.copy()

In [None]:
violent_only_data = violent_only_data[violent_only_data['label']==1]

In [None]:
violent_only_data

In [None]:
violent_only_data = violent_only_data.drop(["predicted_label","predicted_probability","label"],axis=1)

In [None]:
violent_only_data

In [None]:
from transformers import AutoTokenizer, pipeline
import torch
import pandas as pd

# Load the published Model2 checkpoint and its tokenizer as a
# text-classification pipeline
model = pipeline("text-classification", model="ka05ar/banglabert-VITD-m2", tokenizer="ka05ar/banglabert-VITD-m2")

# Only the rows kept in violent_only_data are passed to Model2
texts = violent_only_data["text"].tolist()

# Classify every text. truncation=True is forwarded to the tokenizer so an
# over-long text is cut to the model's maximum length instead of raising.
with torch.no_grad():
    predictions = model(texts, truncation=True)

# Each prediction is a dict holding the predicted label and its score
predicted_labels = [pred['label'] for pred in predictions]
probs = [pred['score'] for pred in predictions]

# Attach the predictions to the violent-only frame
violent_only_data['predicted_label'] = predicted_labels
violent_only_data['predicted_probability'] = probs

In [None]:
violent_only_data.head(20)

In [None]:
# Map Model2's string label onto the final 3-class ids:
# 'LABEL_1' -> 2 (direct violence), anything else -> 1 (passive violence)
violent_only_data['label'] = violent_only_data['predicted_label'].eq('LABEL_1').astype(int) + 1

In [None]:
violent_only_data.head(20)

In [None]:
violent_only_data = violent_only_data.drop(["predicted_label","predicted_probability"],axis=1)

In [None]:
violent_only_data.head(20)

In [None]:
# Write Model2's labels (1 = passive, 2 = direct) back into the full test frame.

# Only the rows present in violent_only_data are updated; the .loc assignment
# matches rows by index label, so each value lands on its original row.
changed_indices = violent_only_data.index
data.loc[changed_indices, 'label'] = violent_only_data['label']


In [None]:
data.head(20)

In [None]:
data.shape

In [None]:
data = data.drop(['predicted_label',"predicted_probability"],axis=1)

In [None]:
data.head(20)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     f1 = f1_score(labels, preds, average="macro")
# #     acc = accuracy_score(labels,preds)
#     return {"Macro F1 Score": f1}

In [None]:
preds = data['label']

In [None]:
original_output = pd.read_csv("/kaggle/input/emnlp-2023-task1-test/test_task1.csv",encoding='utf8')
labels = original_output['label']

In [None]:
macro_f1 = f1_score(labels, preds, average="macro")

In [None]:
macro_f1