In [1]:

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split as tts
import pandas as pd

In [2]:
!pip install emoji
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets




In [3]:
data=pd.read_csv('/content/drive/MyDrive/Santi_dataset.csv')
from datasets import Dataset
df = data[['EngText', 'Eng_Sentiment']].dropna()

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

In [4]:
from datasets import DatasetDict

# Split the dataset into training and test sets
dataset = dataset.train_test_split(test_size=0.3)
dataset = DatasetDict({
    'train': dataset['train'],
    'test': dataset['test']
})


In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")


label_mapping = {-1: 0, 0: 1, 1: 2}  # change label to positive integer

def tokenize_and_format(examples):
    # Tokenize the text
    tokenized_inputs = tokenizer(examples['EngText'], padding='max_length', truncation=True, max_length=512)

    # Remap and include the labels
    if 'Eng_Sentiment' in examples:
        tokenized_inputs['labels'] = [label_mapping[int(label)] for label in examples['Eng_Sentiment']]

    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_format, batched=True)



Map:   0%|          | 0/748 [00:00<?, ? examples/s]

Map:   0%|          | 0/321 [00:00<?, ? examples/s]

In [6]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis", num_labels=3)


In [7]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)

trainer.train()



Step,Training Loss
10,0.9254
20,1.0509
30,0.8134
40,0.7151
50,0.7076
60,0.8319
70,0.7384
80,0.8647
90,0.7734
100,0.7398


TrainOutput(global_step=282, training_loss=0.6330167583539976, metrics={'train_runtime': 10084.4299, 'train_samples_per_second': 0.223, 'train_steps_per_second': 0.028, 'total_flos': 590426509381632.0, 'train_loss': 0.6330167583539976, 'epoch': 3.0})

In [9]:
trainer.evaluate()

{'eval_loss': 0.9061704277992249,
 'eval_runtime': 444.1693,
 'eval_samples_per_second': 0.723,
 'eval_steps_per_second': 0.014,
 'epoch': 3.0}

In [14]:
#Compute Accuracy
import numpy as np
from sklearn.metrics import accuracy_score

predictions = trainer.predict(tokenized_datasets["test"])
predicted_labels = np.argmax(predictions.predictions, axis=-1)

# Calculate accuracy
accuracy = accuracy_score(tokenized_datasets['test']['labels'], predicted_labels)
print("Accuracy:", accuracy)


Accuracy: 0.7009345794392523


Import the scraped Youtube Comment and Predict the sentiment

In [42]:
df=pd.read_csv('/content/drive/MyDrive/YoutubeComment_Part1.csv')
df



Unnamed: 0.1,Unnamed: 0,author,published_at,updated_at,like_count,text
0,0,@the-fantabulous-g,2023-12-16T04:21:56Z,2023-12-16T04:21:56Z,1,Why are there no Asian men in the trailer? For...
1,1,@Edzyboy,2023-12-15T17:12:27Z,2023-12-15T17:12:27Z,0,Good job the books are actually finished. We ...
2,2,@jhtrq1465,2023-12-15T14:27:09Z,2023-12-15T14:27:09Z,0,Why on Earth Netflix have to give this masterp...
3,3,@SistoActivitatemAtm,2023-12-15T07:29:33Z,2023-12-15T07:29:33Z,0,Is this just going to be anti China propaganda...
4,4,@jeramahia123,2023-12-14T09:19:17Z,2023-12-14T09:19:17Z,0,"For those confused, most of the weird stuff yo..."
...,...,...,...,...,...,...
2460,2460,@Agent_Cormac,2023-06-17T21:40:16Z,2023-06-17T21:40:16Z,25,"Great books, and intrigued to see how it all t..."
2461,2461,@animebolt6915,2023-06-17T21:39:29Z,2023-06-17T21:39:29Z,2,❤
2462,2462,@bl7828,2023-06-17T21:39:01Z,2023-06-17T21:55:28Z,119,Just read the first book it was incredible. Ho...
2463,2463,@professor1262,2023-06-17T21:38:53Z,2023-06-17T21:38:53Z,59,I remember seeing that this was coming to Netf...


In [43]:
from langdetect import detect, LangDetectException

def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

# Filter for English texts
df['is_english'] = df['text'].apply(is_english)
df_english = df[df['is_english']]


In [61]:
import re
import html

# Data Cleaning
def clean_text(text):
    # Convert HTML escapes like &amp; to their actual characters
    text = html.unescape(text)

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove non-ASCII characters (optional)
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Remove email addresses (optional)
    text = re.sub(r'\S+@\S+', '', text)

    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()

    return text

df_english['text'] = df_english['text'].apply(clean_text)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_english['text'] = df_english['text'].apply(clean_text)


In [83]:
max_length_chars = 256  # Set a character limit 

df_english['cleaned_text'] = df_english['text'].apply(lambda x: x[:max_length_chars])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_english['cleaned_text'] = df_english['text'].apply(lambda x: x[:max_length_chars])


In [84]:

tokenized_comment = tokenizer(
    list(df_english['cleaned_text']),
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors="pt"
)



from datasets import Dataset

# Convert to a Dataset object
comment_dataset = Dataset.from_dict({key: value for key, value in tokenized_comment.items()})




In [86]:
predictions = trainer.predict(comment_dataset)
predicted_labels = predictions.predictions.argmax(-1)


In [90]:
count = np.count_nonzero(predicted_labels == 2)  #Positive Comment
print(count)

892


In [91]:
count = np.count_nonzero(predicted_labels == 1) # Neutral Comment
print(count)

421


In [92]:
count = np.count_nonzero(predicted_labels == 0) # Negative Comment
print(count)

900


In [98]:


tokenizer = AutoTokenizer.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")

positive_sentences = []
positive_class_index = 2  # or the index that represents 'positive'

for idx, label in enumerate(predicted_labels):
    if label == positive_class_index:
        
        tokenized_input = comment_dataset[idx]['input_ids']
        # Decode the tokens to the original sentence
        sentence = tokenizer.decode(tokenized_input, skip_special_tokens=True)
        positive_sentences.append(sentence)

# Now positive_sentences contains all sentences predicted as positive

positive_sentences

['Good job the books are actually finished. We all know what happens when D&D have to carry on the story for themselves.',
 'Sweet jehovaI am so jealous of anyone who doesnt know whats coming with this story.Best three sci fi books Ive ever read as far as mind blowing imagination and existential horror',
 'just finished the book. Totally love it, from the sci fi, the philosophy, the messages, the ending. Hooe the netflix one can handle all of these aspects. I dont care if they change the casts to be multicultural. The story is about earth afterall. But I hop',
 "last night I finished the last book of the trilogy and it's the craziest story I've ever read. everything civilizations did, it never really mattered",
 'I approved obama and read, perfect',
 "I've already watched the Chinese made one, but I'm still gonna watch this one.Though Netflix is gonna have a hard time topping the guy they chose to play Da Shi",
 "I've just finished watching the Tencent production. It a great adaptation

In [100]:
negative_sentences = []

negative_class_index = 0 # 0 is the label for negative

for idx, label in enumerate(predicted_labels):
    if label == negative_class_index:
        # Retrieve the tokenized input
        tokenized_input = comment_dataset[idx]['input_ids']
        # Decode the tokens to the original sentence
        sentence = tokenizer.decode(tokenized_input, skip_special_tokens=True)
        negative_sentences.append(sentence)

# Now negative_sentences contains all sentences predicted as negative
negative_sentences

['Why are there no Asian men in the trailer? For a book adapted from China, you\'d expect a "diverse" cast to actually have more than 1 Asian female (who traditionally test better with audiences) star in the show. We missed out another chance to open the pipe',
 'Why on Earth Netflix have to give this masterpiece to the worst showrunners of the last 10 years?',
 'Is this just going to be anti China propaganda? Thanks guys who ruined GoT, gonna love ur political commentary',
 'For those confused, most of the weird stuff you see in the trailer is from a virtual reality game the characters are playing.',
 "So, one season, season finale cliffhanger, and then canceled? Yeah, count me out.Maybe when services stop canceling shows on cliffhanger endings, I'll start watching again. Until then I have better things to do than waste my time on something that will pro",
 "Love the books, but the adaptation is made by Netflix, so they'll probably screw it up",
 'We will not be watching, D&D writing 