# Chapter 5: Classified Sentiment Analysis for Social Media

Dear user, we will classify the social media comments, according to 5 key research categories, \
then perform sentiment analysis to identify the design opportunities with each of these categories.

### REQUIREMENTS

For this notebook, you need to have:
- 2 x Pickle files of Scraped data from your Social Media sources (from Chap2.ipynb)

### TO DO SECTION

In [11]:
'''
Dear user, enter your Product here!
'''

product = "Scoot 787"

In [12]:
'''
Dear user, enter your directories to the 2 Pickle files of Scraped data from Social Media!
'''
youtube = f"support/{product}/youtube/comment_list.pkl"
reddit = f"support/{product}/reddit/comment_list.pkl"

In [13]:
'''
Dear user, enter the 15 key components identified from DSM here!
'''
classify_candidates = ['Storage', 'Comfort', 'Speed', 'Design', 'Safety']

### RUN AS INTENDED (DO NOT CHANGE ANYTHING.)

In [14]:
! pip install accelerate
! pip install torch torchvision torchaudio



DEPRECATION: google-images-search 1.4.6 has a non-standard dependency specifier click>=7.0<=8.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of google-images-search or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063




DEPRECATION: google-images-search 1.4.6 has a non-standard dependency specifier click>=7.0<=8.1.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of google-images-search or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [15]:
""" Create Classify and Sentiment folder """
search_terms = product

import os
import shutil

# Create "classify" folder
try:
    os.makedirs(f"support/{search_terms}/classify")
except FileExistsError:
    shutil.rmtree(f"support/{search_terms}/classify")
    os.makedirs(f"support/{search_terms}/classify")

# Create "sentiment" folder
try:
    os.makedirs(f"support/{search_terms}/sentiment")
except FileExistsError:
    shutil.rmtree(f"support/{search_terms}/sentiment")
    os.makedirs(f"support/{search_terms}/sentiment")

In [16]:
""" Initialise and Establish Dataset """
import pandas as pd

youtube = pd.read_pickle(youtube)
reddit = pd.read_pickle(reddit)

In [17]:
import random

combined = youtube + reddit
print("Number of comments:", len(combined))

Number of comments: 2943


In [18]:
""" Classify comments by candidates """
import csv
from transformers import pipeline
import warnings
warnings.filterwarnings("ignore")

comment_list = combined

candidates = classify_candidates + ['Other']

# Initialize counters
candidate_counts = {candidate: 0 for candidate in candidates}

model = "facebook/bart-large-mnli"  # Default model
# model = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"

# Initialize CSV files for each category
csv_files = {candidate: open(f"support/{product}/classify/{candidate}_comments.csv", "w", newline="", encoding="utf-8") for candidate in candidates}
writers = {candidate: csv.writer(csv_files[candidate]) for candidate in candidates}

# Write header row for each CSV file
for writer in writers.values():
    writer.writerow(["Sequence", "Label", "Source"])

# Initialize zero-shot classification pipeline with fine-tuned parameters
classifier = pipeline(
    "zero-shot-classification",
    model=model,
    multi_label=True,
    device_map='auto'
)

labeled_comments_count = 0  # Initialize counter for labeled comments

# Write results to CSV for each comment
comment_list_2 = (c for c in comment_list if c.strip())

results = classifier(comment_list_2, candidate_labels = candidates)

for comment, result in zip(comment_list_2, results):
    sequence = result['sequence'] if result['labels'] else None
    label = result['labels'][0] if result['labels'] else None

    if label:
        candidate_counts[label] += 1
        labeled_comments_count += 1

        # Determine the source of the comment (YouTube or Reddit)
        source = "YouTube" if comment in youtube else "Reddit"

        # Write the comment to the respective CSV file based on its category and source
        writers[label].writerow([sequence, label, source])

# Close all CSV files
for file in csv_files.values():
    file.close()

# Print summary
print(f"Candidate Counts:")
for candidate, count in candidate_counts.items():
    print(f"{candidate}: {count}")
print("Detailed results written to respective CSV files.")

ValueError: You must include at least one label and at least one sequence.

In [None]:
import csv
from transformers import pipeline, AutoTokenizer

candidates = classify_candidates + ['Other']

for candidate in candidates:
    # Initialize sentiment counts for each candidate
    positive_count = 0
    negative_count = 0
    neutral_count = 0
    empty_comments_count = 0

    comment_list = []
    with open(f"support/{product}/classify/{candidate}_comments.csv", "r", encoding="utf-8") as csvfile:
        csv_reader = csv.reader(csvfile)
        next(csv_reader)  # Skip header row
        for row in csv_reader:
            comment_list.append(row[0])

    if not comment_list:
        continue

    # Define the maximum sequence length
    max_seq_length = 512  # Adjust Truncated Length

    # Model for sentiment analysis
    model = "cardiffnlp/twitter-roberta-base-sentiment"  # negative, neutral, positive

    # Initialise the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)

    # Filter out or truncate excessively long sequences
    filtered_comments = [comment[:max_seq_length - 2] for comment in comment_list]  # -2 to account for special tokens [CLS] and [SEP]

    # Initialise the pipeline with padding
    classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, padding=True, device=-1)

    results = classifier(filtered_comments)

    # Accumulate sentiment counts
    for i in range(len(results)):
        result = results[i]
        sentiment = result['label']
        if comment_list[i].strip():  # Check if the comment is not empty
            if sentiment == "LABEL_2" or sentiment == "POSITIVE":
                positive_count += 1
            elif sentiment == "LABEL_0" or sentiment == "NEGATIVE":
                negative_count += 1
            elif sentiment == "LABEL_1":
                neutral_count += 1
        else:  # If the comment is empty, count it
            empty_comments_count += 1

    # Output CSV
    with open(f"support/{product}/sentiment/{candidate}_analysis.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Comment", "Sentiment"])
        for i in range(len(results)):
            writer.writerow([comment_list[i], results[i]['label']])

    # Calculate overall sentiment for each candidate
    overall_sentiment = "Positive" if positive_count > negative_count else "Negative" if negative_count > positive_count else "Neutral"

    # Print summary for each candidate
    print(f"Candidate: {candidate}")
    print("Number of comments with Positive sentiment:", positive_count)
    print("Number of comments with Negative sentiment:", negative_count)
    print("Number of comments with Neutral sentiment:", neutral_count)
    print("Overall Sentiment:", overall_sentiment)
    print()


### Confusion Matrix

### TO DO SECTION

In [None]:
# '''
# Dear user, please manually annotate the classification for a selected number of comments in a post-classified csv file!
# Copy the csv file to others folder and name it Confusion_Table.csv !
# '''
# confusion_matrix = 'others/Confusion_Table.csv'

# """ Confusion Matrix """
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import precision_recall_fscore_support
# import pandas as pd

# '''load data'''
# labelled_data = pd.read_csv("confusion_matrix")

# print(labelled_data)

# y_true = list(labelled_data['Human'])
# y_pred = list(labelled_data["AI"])

# '''Compute'''
# print("\nConfusion Matrix summary:")
# print("Number of comments:", len(labelled_data))
# print("\nConfusion Table --- Labels: 0, 1, 2  |  Rows = Human (i.e. True)  |  Columns = AI (i.e. Predicted)")

# print(confusion_matrix(y_true, y_pred))
# print("\n(Precision, Recall, F1 Score)")
# print(precision_recall_fscore_support(y_true, y_pred, average='macro')[0:3])