In [None]:
!pip install transformers
!pip install torch


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Load the labelled comments CSV from the Kaggle input directory. Later cells
# read the 'comments', 'manual_labels' and 'gpt_labels' columns from this frame.
data = pd.read_csv('/kaggle/input/final-project/india_gpt_labelled.csv')



In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# NLTK's English stop-word list, stored as a set so the membership test in
# remove_stopwords() is a hash lookup. The NLTK 'stopwords' corpus must be available.
# NOTE(review): this list presumably contains negations such as "not"/"no";
# confirm that dropping them is intended before feeding the text to a classifier.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` with English stop words removed.

    The text is split with NLTK's `word_tokenize`; every token whose lowercase
    form is in the module-level `stop_words` set is dropped and the remaining
    tokens are joined with single spaces.

    Args:
        text: The comment to clean. Missing values (None or float NaN, as pandas
            produces for empty CSV cells) are accepted; any other non-string
            value is converted with `str()` first.

    Returns:
        The filtered text as a single string; '' for missing values.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    kept_tokens = [token for token in word_tokenize(text) if token.lower() not in stop_words]
    return ' '.join(kept_tokens)

In [None]:
# Clean every comment in row order so the result can be assigned straight back
# to `data` as a new column. Stop-word removal is the only cleaning step applied.
cleaned_comments = [
    remove_stopwords(comment)
    for comment in tqdm(data['comments'], total=len(data['comments']), desc='Cleaning')
]

In [None]:
# Store the cleaned text next to the original comments (same row order).
data['cleaned_comments']=cleaned_comments

In [None]:
# Shift both label columns up by one so class ids start at 0 (presumably the
# CSV stores -1/0/1 -- TODO confirm), as the 3-class model head expects.
# The shift is always computed from an untouched "*_raw" copy, so re-running
# this cell does not push the labels up a second time.
for label_col in ('manual_labels', 'gpt_labels'):
    raw_col = f'{label_col}_raw'
    if raw_col not in data.columns:
        data[raw_col] = data[label_col]
    data[label_col] = data[raw_col] + 1


In [None]:
# Re-inspect the frame now that the cleaned text column exists and the label
# columns have been shifted.
data.head()

In [None]:
# Split the data into training (80%) and validation (20%) sets.
# The fixed random_state makes the split identical on every run.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=69)

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on demand.

    Args:
        texts: Indexable collection of texts; each item is passed through `str()`.
        labels: Indexable collection of class labels; each item is passed through `int()`.
        tokenizer: Callable invoked as
            `tokenizer(text, truncation=True, padding='max_length',
            max_length=max_len, return_tensors='pt')` and returning a mapping
            with 'input_ids' and 'attention_mask' tensors.
        max_len: Length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from the length of `texts`."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the flattened token tensors and the long-typed label for sample `idx`."""
        raw_text = str(self.texts[idx])
        target = int(self.labels[idx])
        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        # The tokenizer output is collapsed to 1-D so the DataLoader can stack
        # samples into a batch.
        sample = {key: encoded[key].reshape(-1) for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

In [None]:
# Seed torch so the randomly initialised classification head and the shuffling
# of the training DataLoader are repeatable (same seed as the train/val split).
torch.manual_seed(69)

# Pretrained BERT tokenizer plus a BERT encoder with a fresh 3-way classification head.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Custom Dataset instances
# Every comment is padded / truncated to max_len tokens.
# NOTE(review): 500 tokens x batch size 32 is memory hungry; lower either if the GPU runs out of memory.
max_len = 500
train_dataset = CustomDataset(texts=train_data['cleaned_comments'].values, labels=train_data['manual_labels'].values, tokenizer=tokenizer, max_len=max_len)
val_dataset = CustomDataset(texts=val_data['cleaned_comments'].values, labels=val_data['manual_labels'].values, tokenizer=tokenizer, max_len=max_len)

# DataLoader instances
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Training parameters
epochs = 3
# Fine-tuning a pretrained transformer needs a small step size. The previous
# value of 0.01 destroys the pretrained weights within a few updates and
# usually collapses the predictions to a single class.
lr = 2e-5
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
# keeps the default the transformers implementation used.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.0)

In [None]:
# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Move the model's parameters to the chosen device.
model.to(device)
# Switch to training mode; as the last expression it also displays the model.
model.train()

In [None]:
# Fine-tune the classifier for `epochs` passes over the training set.
for epoch in range(epochs):
    # Re-enter training mode every epoch: the evaluation cell switches the model
    # to eval mode, so re-running this cell afterwards would otherwise train
    # with dropout disabled.
    model.train()
    running_loss = 0.0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        # Passing `labels` makes the model compute the classification loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean batch loss so divergence or a stalled run is visible.
    mean_loss = running_loss / max(len(train_loader), 1)
    print(f'Epoch {epoch + 1}/{epochs} - mean training loss: {mean_loss:.4f}')

In [None]:
# Evaluation mode: disables dropout so predictions are deterministic.
model.eval()
val_preds, val_labels = [], []

# No gradients are needed for inference.
with torch.no_grad():
    for batch in tqdm(val_loader, desc='Validation'):
        batch_logits = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits

        # The predicted class is the index of the largest logit in each row.
        val_preds.extend(batch_logits.argmax(dim=1).cpu().numpy())
        val_labels.extend(batch['label'].cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(val_labels, val_preds)
print(f'Validation Accuracy: {accuracy * 100:.2f}%')

In [None]:
# Show only the first few predictions; the full list is summarised by the
# count and per-class cells that follow.
val_preds[:20]

In [None]:
# Count how often each class was predicted on the validation set.
unique, counts = np.unique(val_preds, return_counts=True)
value_counts = {predicted_class: n_times for predicted_class, n_times in zip(unique, counts)}
print(value_counts)

In [None]:
# Put predictions and true labels side by side, one row per validation sample,
# then show a reproducible random sample of five rows.
pred_df = pd.DataFrame({'pred': val_preds}).assign(label=val_labels)
display(pred_df.sample(n=5, random_state=69))

In [None]:
# Flag each validation row whose prediction matches its true label.
pred_df['correct'] = pred_df['label'].eq(pred_df['pred'])

# Per true class: number of samples and percentage predicted correctly.
rows_by_label = pred_df.groupby('label')
class_size = rows_by_label.size()
class_accuracy = rows_by_label['correct'].mean().mul(100)

class_size_dict, class_accuracy_dict = class_size.to_dict(), class_accuracy.to_dict()
print("Class Size:", class_size_dict)
print("Class Accuracy:", class_accuracy_dict)