<a href="https://colab.research.google.com/github/kkrusere/youTube-comments-Analyzer/blob/main/SAnalysis_on_YT_comments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%shell
pip install bitsandbytes
pip install accelerate
pip install trl peft
pip install datasets
pip install rouge-score
pip install evaluate
pip install huggingface_hub


In [None]:
import re
import json
import random
import time

import evaluate
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, train_test_split

import torch
import torch.nn as nn
from datasets import Dataset
from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model

from torch.nn import CrossEntropyLoss
from tqdm import tqdm
import transformers
from transformers import (
                            AutoModelForCausalLM,
                            AutoTokenizer,
                            BartForConditionalGeneration,
                            BartTokenizer,
                            BitsAndBytesConfig,
                            EarlyStoppingCallback,
                            logging,
                            pipeline,
                            Trainer,
                            TrainingArguments,
)


import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
from google.colab import drive, userdata
from huggingface_hub import login

import os
import json
#mounting google drive
drive.mount('/content/drive')

########################################

#changing the working directory
os.chdir("/content/drive/MyDrive/NLP_Data")

!pwd

huggingface_token = userdata.get('Hugging_Face_Hub_API_TOKEN')

#logging into huggingface
login(huggingface_token, add_to_git_credential=True)

In [None]:
# Read the labelled train/validation examples and preview the first rows.
df = pd.read_csv("/content/drive/MyDrive/NLP_Data/train_valid_data.csv")
df.head()

In [None]:
# Show the first labelled example so the prompt fields and targets are visible.
# `.iloc[0]` selects by position, so this keeps working when the index does not
# start at 0 (e.g. after filtering or shuffling).
first_row = df.iloc[0]
print(
    f"""
        Channel Name: {first_row['channel_name']}
        Video Title: {first_row['video_title']}
        Description: {first_row['video_description']}
        Comment Text: {first_row['comment_text']}
        \n
        Sentiment: {first_row['Sentiment']}
        Explanation: {first_row['Explanation']}


    """
)

# Example output:
        # Channel Name: BBC
        # Video Title: Can Cuttlefish camouflage in a living room? | Richard Hammond's Miracles of Nature - BBC
        # Description: The final episode of Richard Hammond’s Miracles Of Nature. Richard is once again investigating the extraordinary super-powers of the animal kingdom. Cuttlefish survive by being able to blend into their surroundings through camouflage. Richard Hammond puts this to the test and experiments if the fish are able to camouflage in a tank set up like a living room.
        # Comment Text: The big white square on his back was impressive af even tho it wasn't fooling our human perception.


        # Sentiment: Positive
        # Explanation: The comment expresses admiration for the cuttlefish's camouflage abilities, despite it not being completely convincing to humans.


In [None]:
# Read the held-out test split and the labelled train/validation pool.
test_df = pd.read_csv("/content/drive/MyDrive/NLP_Data/test_data.csv")
train_valid_data = pd.read_csv("/content/drive/MyDrive/NLP_Data/train_valid_data.csv")

# Hold out 20% of the labelled pool for validation; the fixed seed keeps the
# split repeatable across runs.
train_df, val_df = train_test_split(
    train_valid_data,
    test_size=0.2,
    random_state=42,
)

# Pretrained BART checkpoint and its matching tokenizer.
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Prompt/target construction
def format_data(df, for_test=False):
    """Turn every row of `df` into an {"input", "output"} record.

    Args:
        df: DataFrame with `channel_name`, `video_title`, `video_description`
            and `comment_text` columns; `Sentiment` and `Explanation` are also
            read unless `for_test` is true.
        for_test: When true, the target is the fixed placeholder
            "Sentiment: , Explanation: " and the label columns are not read.

    Returns:
        A list with one dict per row, in row order.
    """
    records = []
    for _, row in df.iterrows():
        prompt = (
            f"Channel: {row['channel_name']}, "
            f"Title: {row['video_title']}, "
            f"Description: {row['video_description']}, "
            f"Comment Text: {row['comment_text']}"
        )
        if for_test:
            target = "Sentiment: , Explanation: "
        else:
            target = f"Sentiment: {row['Sentiment']}, Explanation: {row['Explanation']}"
        records.append({"input": prompt, "output": target})
    return records

# Build prompt/target records for each split; the test split gets placeholder targets.
formatted_train_data, formatted_val_data = (
    format_data(split) for split in (train_df, val_df)
)
formatted_test_data = format_data(test_df, for_test=True)

# Wrap each list of records as a Hugging Face Dataset.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_list(records)
    for records in (formatted_train_data, formatted_val_data, formatted_test_data)
)

# Tokenization
def tokenize_data(example):
    """Tokenize prompts and targets for sequence-to-sequence training.

    Args:
        example: Mapping with "input" and "output" entries, each either a
            single string or a list of strings (the batched `Dataset.map` form).

    Returns:
        The tokenizer output for the inputs (padded/truncated to 512 tokens)
        plus a "labels" entry holding the target token ids (padded/truncated
        to 128 tokens) with every padding position replaced by -100.
    """
    model_inputs = tokenizer(
        example["input"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    targets = tokenizer(
        text_target=example["output"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    # -100 is the index the cross-entropy loss ignores. Targets are padded to a
    # fixed 128 tokens, so leaving the pad id in place would make the loss (and
    # therefore eval_loss) mostly measure how well the model predicts padding.
    pad_id = tokenizer.pad_token_id
    label_ids = targets["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        masked_labels = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in label_ids
        ]
    else:
        # Single example: a flat id sequence.
        masked_labels = [-100 if token_id == pad_id else token_id for token_id in label_ids]

    model_inputs["labels"] = masked_labels
    return model_inputs

# Run the tokenizer over the train and validation splits in batches.
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(tokenize_data, batched=True) for split in (train_dataset, val_dataset)
)

# Configure LoRA: rank-16 adapters on the attention query/value projections.
lora_config = LoraConfig(
    # BART is an encoder-decoder model; declaring the task lets PEFT return its
    # sequence-to-sequence wrapper instead of the generic one and records the
    # task in the saved adapter config.
    task_type="SEQ_2_SEQ_LM",
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)
# Wrap the base model so that only the adapter weights are trained.
lora_model = get_peft_model(model, lora_config)

In [None]:
# Fine-tuning hyperparameters, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs go, and how often to log.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Optimisation schedule: linear decay after a 1000-step warmup.
    num_train_epochs=24,
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_steps=1000,
    weight_decay=0.01,
    # Batching: 4 examples per device, gradients accumulated over 8 steps,
    # with mixed-precision (fp16) training.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    fp16=True,
    # Evaluate every 500 steps, checkpoint every 1000, and restore the
    # checkpoint with the best validation loss when training ends.
    # NOTE(review): newer transformers releases name the first argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Early stopping: halt once the tracked metric has failed to improve for
# 3 consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# `compute_metrics` is not defined by any earlier cell, so referencing it
# directly raises NameError on a fresh kernel. Use it only when a definition
# exists; otherwise evaluation reports the loss alone, which is all that
# `metric_for_best_model="eval_loss"` and early stopping need.
metrics_fn = globals().get("compute_metrics")

# Trainer with early stopping
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=metrics_fn,
    callbacks=[early_stopping],
)

# Train the model
trainer.train()

In [None]:
# Run one evaluation pass over the Trainer's eval dataset and show the metrics dict.
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

In [None]:
# Write the LoRA-wrapped model and the tokenizer into the same local folder.
lora_model.save_pretrained("./SA-bart-fine-tuned-lora-model")
tokenizer.save_pretrained("./SA-bart-fine-tuned-lora-model")

In [None]:
# Publish the LoRA model and the tokenizer to a single Hub repository.
from huggingface_hub import notebook_login

# Prompt for Hub credentials before uploading.
notebook_login()

hub_repo_id = "kkrusere/SA-bart-fine-tuned-lora-model"
lora_model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

In [None]:
# Reload the saved artifacts from the local output folder for inference.
# NOTE(review): this folder is written by `lora_model.save_pretrained(...)`;
# confirm the installed transformers/peft versions resolve the base model
# when loading it through `BartForConditionalGeneration.from_pretrained`.
model = BartForConditionalGeneration.from_pretrained("./SA-bart-fine-tuned-lora-model")
tokenizer = BartTokenizer.from_pretrained("./SA-bart-fine-tuned-lora-model")

# Read the test split.
test_df = pd.read_csv("/content/drive/MyDrive/NLP_Data/test_data.csv")

# Prompt construction for inference
def format_test_data(df):
    """Build one {"input", "output"} record per row of `df` for inference.

    Args:
        df: DataFrame with `channel_name`, `video_title`, `video_description`
            and `comment_text` columns.

    Returns:
        A list of dicts in row order; "input" is the formatted prompt and
        "output" is always the empty string.
    """
    records = []
    for _, row in df.iterrows():
        prompt = (
            f"Channel: {row['channel_name']}, "
            f"Title: {row['video_title']}, "
            f"Description: {row['video_description']}, "
            f"Comment Text: {row['comment_text']}"
        )
        records.append({"input": prompt, "output": ""})
    return records

# Build the inference prompts and wrap them as a Hugging Face Dataset.
formatted_test_data = format_test_data(test_df)
test_dataset = Dataset.from_list(formatted_test_data)

# Input-only tokenization for the test split
def tokenize_data(example):
    """Tokenize the "input" field, padded/truncated to 512 tokens.

    NOTE(review): this reuses the name of the training-time `tokenize_data`
    defined in an earlier cell and replaces it from here on.

    Args:
        example: Mapping whose "input" entry is passed to the tokenizer.

    Returns:
        Whatever the module-level `tokenizer` returns for that input.
    """
    return tokenizer(
        example["input"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

# Add tokenizer columns to the test set in batches; the original "input"
# text column is kept alongside them.
tokenized_test_dataset = test_dataset.map(function=tokenize_data, batched=True)

# Inference
def infer(model, tokenizer, dataset, **generate_kwargs):
    """Generate one decoded prediction per example in `dataset`.

    Args:
        model: Model exposing `eval()` and `generate(...)`. If it has a
            `device` attribute, the tokenized inputs are moved there first.
        tokenizer: Callable tokenizer that also provides `decode(...)`.
        dataset: Iterable of mappings; only each example's "input" text is
            read and tokenized here (truncated to 512 tokens).
        **generate_kwargs: Optional settings forwarded unchanged to
            `model.generate` (for example a length limit or beam count).
            With none given, generation uses the model's own defaults.

    Returns:
        A list of decoded strings (special tokens removed), in dataset order.
    """
    model.eval()
    # Inputs must live on the same device as the model, otherwise generation
    # fails as soon as the model has been moved to a GPU.
    device = getattr(model, "device", None)

    predictions = []
    for example in dataset:
        inputs = tokenizer(
            example["input"],
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )
        if device is not None:
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        outputs = model.generate(**inputs, **generate_kwargs)
        predictions.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    return predictions

# Generate a prediction for every test example; `infer` tokenizes each
# example's "input" text itself.
predictions = infer(model=model, tokenizer=tokenizer, dataset=tokenized_test_dataset)




Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Display the generated text for the test example at position 8.
predictions[8]

"Sentiment: Negative, Explanation: The comment expresses negative feelings about Tom Hanks, suggesting a negative reaction to the film, suggesting that the actor's dislike of Hanks is the biggest crime of all. The comment has a negative tone, suggesting negative feelings toward Hanks and the film."