# Prediction using the fine-tuned BERT model
https://huggingface.co/anonymous/bert-finetuned-chatGPT-discourse

Note: This notebook was used to run predictions on the testing dataset. The Twitter and Reddit datasets were predicted later, in the Bert_Training.ipynb notebook, right after training the BERT model.

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch
import pandas as pd

In [None]:
# Hugging Face Hub repository that holds the fine-tuned sequence-classification
# checkpoint; the tokenizer is loaded from the same repository.
# NOTE(review): the link in the notebook header ends in "-discourse" while this
# ID does not — confirm which repository is the intended one.
model_name = "anonymous/bert-finetuned-chatGPT"

# The two loads are independent of each other; both only need the repository ID.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

In [None]:
# Move the model to the GPU when one is available; otherwise stay on the CPU
# instead of crashing on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset paths used by the
# following cells resolve under /content/drive.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# CSV of processed tweets together with their Vader and TextBlob polarity scores.
sentiment_csv_path = '/content/drive/MyDrive/datasets/chatGPT_Vader_n_Blob_Sentiment.csv'
gpt_data = pd.read_csv(sentiment_csv_path)

In [None]:
# Quick look at the first five rows to confirm the columns loaded as expected.
gpt_data.head(n=5)

Unnamed: 0,Processed_Tweets,Vader_Polarity,Blob_Polarity
0,openais gpt4 just got supercharged ai chatgpt,0.0,0.0
1,classical art is struggling not changed the fa...,-0.25,0.2
2,alibaba invites businesses to trial chatgpt ri...,0.0,0.0
3,trying to stop students from using ai and chat...,-0.2263,0.1
4,i asked chatgpts ai chatbot how can i earn cry...,0.0,0.0


In [None]:
# Tokenize every tweet in one call. padding=True pads all rows to the longest
# sequence in the dataset, and truncation caps sequences at 512 tokens.
tweet_texts = gpt_data['Processed_Tweets'].tolist()
tokenized_texts = tokenizer(
    tweet_texts,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt',
)

In [None]:
from tqdm import tqdm

# Number of tweets sent through the model per forward pass; kept small so the
# activations of one chunk fit in memory.
chunk_size = 1000

# Class index -> sentiment name. Built once, outside the loop, because it never
# changes between chunks.
# NOTE(review): the ordering (0=Neutral, 1=Positive, 2=Negative) is assumed to
# match the label encoding used during fine-tuning — confirm against the
# training notebook / model.config.id2label.
sentiment_mapping = {2: 'Negative', 0: 'Neutral', 1: 'Positive'}

# Send inputs to whichever device the model currently lives on, so this cell
# keeps working if the model was placed on a device other than 'cuda'.
model_device = model.device

# Inference only: make sure dropout is disabled even if another cell switched
# the model back to training mode.
model.eval()

# Collected sentiment names, one per tweet, in the same order as the input rows.
predicted_sentiments = []

# Iterate over the pre-tokenized tensors in chunks of `chunk_size` rows.
total_rows = len(tokenized_texts['input_ids'])
for i in tqdm(range(0, total_rows, chunk_size)):
    # Slice out one chunk and move only that chunk to the model's device.
    input_ids_chunk = tokenized_texts['input_ids'][i:i+chunk_size].to(model_device)
    attention_mask_chunk = tokenized_texts['attention_mask'][i:i+chunk_size].to(model_device)

    # Forward pass without gradient tracking (no backward pass is needed).
    with torch.no_grad():
        outputs = model(input_ids=input_ids_chunk, attention_mask=attention_mask_chunk)

    # The highest-scoring class index per row is the predicted label.
    predicted_labels_chunk = torch.argmax(outputs.logits, dim=1).tolist()
    predicted_sentiments.extend(sentiment_mapping[label] for label in predicted_labels_chunk)


100%|██████████| 363/363 [48:31<00:00,  8.02s/it]


In [None]:
# Attach the model's predictions as a new 'predicted_labels' column (one entry per row).
gpt_data = gpt_data.assign(predicted_labels=predicted_sentiments)

In [None]:
# Show each tweet's text next to its predicted sentiment.
preview_columns = ['Processed_Tweets', 'predicted_labels']
print(gpt_data[preview_columns])

                                         Processed_Tweets predicted_labels
0          openais gpt4 just got supercharged ai chatgpt          Negative
1       classical art is struggling not changed the fa...         Negative
2       alibaba invites businesses to trial chatgpt ri...         Negative
3       trying to stop students from using ai and chat...         Negative
4       i asked chatgpts ai chatbot how can i earn cry...         Negative
...                                                   ...              ...
362561  rt jordanbpeterson wtf seriously a very seriou...         Negative
362562  is googles updated bard chatgpts strongest com...          Neutral
362563  rt itspaulai chatgpt has now a big problem goo...          Neutral
362564  rt sama all chatgpt plus users getting browsin...         Positive
362565  rt jordanbpeterson wtf seriously a very seriou...         Negative

[362566 rows x 2 columns]


In [None]:
# Export the dataset, including the predicted labels, to Drive; the DataFrame
# index is not written to the file.
predictions_csv_path = '/content/drive/MyDrive/datasets/GPT_BERT_Sentiment_Prediction.csv'
gpt_data.to_csv(predictions_csv_path, index=False)

In [None]:
import matplotlib.pyplot as plt

# Number of tweets per predicted sentiment label.
sentiment_counts = gpt_data['predicted_labels'].value_counts()

# Share of each sentiment as a percentage of all labelled tweets.
total_tweets = sentiment_counts.sum()
percentages = (sentiment_counts / total_tweets) * 100

# Explicit figure/axes so every call below targets this one plot.
fig, ax = plt.subplots(figsize=(8, 6))
# `bars` is kept for backward compatibility; it holds the Axes returned by pandas.
bars = sentiment_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Sentiment Distribution of Labeled Tweets Using BERT')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count of Tweets')

# Annotate each bar with its count and percentage. Values are iterated directly
# rather than looked up with percentages[i]: the Series is indexed by sentiment
# name, and integer keys on a non-integer index are a deprecated positional
# fallback in recent pandas.
for i, (count, pct) in enumerate(zip(sentiment_counts, percentages)):
    # The +1000 offset (in tweet-count units) lifts the label just above the bar.
    ax.text(i, count + 1000, f'{count} ({pct:.2f}%)', ha='center')

ax.tick_params(axis='x', rotation=0)
fig.tight_layout()
plt.show()