<div style="text-align: center;" >
<h1 style="margin-top: 0.2em; margin-bottom: 0.1em;">Social Media Data Analysis - Final Project</h1>
<h4 style="margin-top: 0.7em; margin-bottom: 0.3em; font-style:italic">Data Processing</h4>
<h4 style="margin-top: 0.7em; margin-bottom: 0.3em; font-style:italic">Julia King</h4>
</div>
<br>

In [1]:
# import chunk

from common_functions import confirm_execution, read_video_json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from data_collection import confirm_execution

# manual sample
import random
import csv
from sklearn.model_selection import train_test_split

# tkinter app
import tkinter as tk
from tkinter import messagebox
import pandas as pd
import webbrowser


# 4. Based on the title and description, classify a sample of the videos as conspirative or non-conspirative by hand.

In [3]:
# functions to read/write the manual sample

def write_manual_sample(manual_sample : pd.DataFrame, filepath : str = "data/manual_sample.csv"):
    """Write the manual sample to a csv. Careful not to overwrite!

    An existing file at `filepath` is replaced without warning. The parent
    directory of `filepath` is created first if it does not exist yet, so the
    default location also works when the "data" folder is still missing.

    Args:
        manual_sample (pd.DataFrame): manual sample
        filepath (str, optional): file path to save to. Defaults to "data/manual_sample.csv".
    """
    import os  # local import: only this function needs it

    # to_csv cannot write into a non-existent directory, so create it up front.
    # A bare file name has no directory part and needs no preparation.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok = True)

    # index = False: the row index carries no information and would otherwise
    # come back as an extra unnamed column when the file is read again.
    manual_sample.to_csv(filepath, index = False, quoting = csv.QUOTE_MINIMAL)
    
def read_manual_sample(filepath : str = "data/manual_sample.csv") -> pd.DataFrame:
    """Load the manual sample from csv; counterpart to write_manual_sample.

    The "conspirative" column is first converted to numbers (values that
    cannot be parsed become missing) and then cast to pandas' nullable
    "boolean" dtype, so videos without a label stay missing instead of being
    read as False.

    Args:
        filepath (str, optional): file path to manual sample file. Defaults to "data/manual_sample.csv".

    Returns:
        pd.DataFrame: manual sample
    """
    sample_df = pd.read_csv(filepath)
    numeric_labels = pd.to_numeric(sample_df["conspirative"], errors = "coerce")
    sample_df["conspirative"] = numeric_labels.astype("boolean")
    return sample_df

In [4]:
# function creating sample dataset

def sample_videos(video_data: list, n : int = 250, seed : int = 94) -> pd.DataFrame:
    """Draw a reproducible random sample of videos for manual labelling.

    The draw uses its own random number generator, so calling this function
    does not reseed (or otherwise disturb) the global `random` module. For a
    given seed the selected videos are the same as with
    `random.seed(seed); random.sample(video_data, k = n)`.

    Args:
        video_data (list): video records. Each record must provide "video_id"
            and "metadata" -> "snippet" -> "title" / "description".
        n (int, optional): number of videos to draw. Defaults to 250.
        seed (int, optional): seed of the random draw. Defaults to 94.

    Raises:
        ValueError: raised by the sampling call if n is negative or larger
            than the number of videos.

    Returns:
        pd.DataFrame: one row per sampled video with the columns "video_id",
            "title", "description" and a still empty (pd.NA) "conspirative" column.
    """
    # private generator instead of random.seed(): same sample, no global side effect
    rng = random.Random(seed)
    vd_sample = rng.sample(video_data, k = n)

    # look up each snippet once instead of once per column
    snippets = [video["metadata"]["snippet"] for video in vd_sample]

    # create df; the scalar pd.NA is broadcast to every row
    sample = pd.DataFrame(data = {
        "video_id" : [video["video_id"] for video in vd_sample],
        "title" : [snippet["title"] for snippet in snippets],
        "description" : [snippet["description"] for snippet in snippets],
        "conspirative" : pd.NA
    })
    return sample

In [5]:
# draw training sample
# Confirmation comes first because the sample file is rewritten below. The output recorded
# for this cell ("Execution aborted by user.") shows that declining raises an exception,
# so none of the following statements run in that case.

confirm_execution("Do you want to reset the manually evaluated sample?")

# NOTE(review): `video_data` is not assigned in any of the visible cells (a later cell
# records a NameError for it). Presumably it has to be loaded first, e.g. with the
# imported read_video_json - confirm and add the loading cell above this one.
manual_sample = sample_videos(video_data, n = 400)

# No filepath given, so this goes to the default "data/manual_sample.csv".
write_manual_sample(manual_sample)

Exception: Execution aborted by user.

In [6]:
# create tkinter app for labelling

class ConspiracyLabeler:
    """Small tkinter app for labelling the manual sample by hand.

    Shows the title and description of one unlabelled video at a time and
    stores the decision in the "conspirative" column of the sample dataframe.
    The dataframe is only written back to `filepath` on shutdown, i.e. via
    "Save and Close" or once the last video has been labelled.
    """

    def __init__(self, root, filepath):
        """Loads the sample, builds the window and shows the first unlabelled video.

        Args:
            root: tkinter root window hosting the widgets.
            filepath: path of the manual sample csv to read from and save to.
        """
        self.root = root
        self.filepath = filepath
        self.df = read_manual_sample(self.filepath)
        # Nothing is on display yet. Starting at None (not 0) keeps the first next() call
        # from recording row 0 as the "previously labelled" row; otherwise an Undo right
        # after launch would wipe the stored label of row 0 when resuming a session.
        self.index = None
        self.prev_index = None

        # display how many I still need to label
        self.counter_label = tk.Label(root, text = "", font = ("Arial", 12))
        self.counter_label.pack(pady = 10)

        # create title display
        self.root.title("Conspirative Labeler")
        self.title_label = tk.Label(root, text = "", font = ("Arial", 16))
        self.title_label.pack(pady = 10)

        # create scrollable description
        self.desc_frame = tk.Frame(root)
        self.desc_frame.pack(pady = 10, fill = "both", expand = True)
        self.canvas = tk.Canvas(self.desc_frame, height = 100)  # Set an initial height for the canvas
        self.scrollbar = tk.Scrollbar(self.desc_frame, orient = "vertical", command = self.canvas.yview)
        self.scrollable_frame = tk.Frame(self.canvas)
        
        self.canvas.configure(yscrollcommand = self.scrollbar.set)
        self.scrollbar.pack(side = "right", fill = "y")
        self.canvas.pack(side = "left", fill = "both", expand = True)
        self.canvas.create_window((0, 0), window = self.scrollable_frame, anchor = "nw")
        # keep the scrollable area in sync with the size of the description
        self.scrollable_frame.bind("<Configure>", lambda e: self.canvas.configure(scrollregion = self.canvas.bbox("all")))

        self.desc_label = tk.Label(self.scrollable_frame, text = "", wraplength = 400, justify = "left", font = ("Arial", 12))
        self.desc_label.pack()

        # add clickable link
        self.link_label = tk.Label(root, text = "Open video in browser", fg = "blue", cursor = "hand2", font = ("Arial", 12))
        self.link_label.pack(pady = 10)
        self.link_label.bind("<Button-1>", self.open_link)
        
        # other buttons
        self.undo_button = tk.Button(root, text = "Undo", command = self.undo, width = 10, state = "disabled")
        self.undo_button.pack(pady = 10)
        
        self.close_button = tk.Button(root, text = "Save and Close", command = self.shutdown, width = 12)
        self.close_button.pack(pady = 10)

        self.true_button = tk.Button(root, text = "[c]onspirative", command = lambda : self.label_conspiracy(True), width = 15)
        self.true_button.pack(side = "left", padx = 20)

        self.false_button = tk.Button(root, text = "[n]on-conspirative", command = lambda : self.label_conspiracy(False), width = 15)
        self.false_button.pack(side = "right", padx = 20)

        # keyboard shortcuts mirroring the two label buttons
        self.root.bind("c", lambda event : self.label_conspiracy(True))
        self.root.bind("n", lambda event : self.label_conspiracy(False))
        self.root.bind("<Configure>", self.update_wraplength)

        self.next()

    @staticmethod
    def _display_text(value) -> str:
        """Returns the text to show for a cell; missing values become an empty string.

        Empty csv cells come back as NaN and would otherwise be shown as "nan".
        """
        return "" if pd.isna(value) else str(value)
    
    def shutdown(self, completed : bool = False):
        """Saves current dataframe to file and closes the app.
        
        Args:
            completed (bool): Should be set to true if the shutdown was triggered due to the user finishing the labels.
        """
        write_manual_sample(self.df, filepath = self.filepath)
        if (completed):
            messagebox.showinfo("Finished", "All videos have been labelled! \nThe app will now close.")
        else:
            messagebox.showinfo("Save successful!", "The app will now close. \nSee you next time!")
        self.root.destroy()

    def update_counter(self):
        """Updates the counter label with the number of labeled and remaining items.
        """
        total_items = len(self.df)
        labeled_items = int(self.df["conspirative"].notna().sum())
        remaining_items = total_items - labeled_items
        self.counter_label.config(text = f"Labeled: {labeled_items} / Remaining: {remaining_items}")
    
    def update_wraplength(self, event):
        """Updates the wraplength of the description based on the window width. Purely for aesthetics

        Args:
            event: unused, necessary bc of tkinter
        """
        new_width = self.canvas.winfo_width() - 20  # padding
        self.desc_label.config(wraplength = new_width)

    def update_index(self):
        """Finds the next row without an evaluation and sets the indexes accordingly.

        The row shown so far becomes `prev_index`; `index` becomes the first
        unlabelled row, or None if every row has a label.
        """
        self.prev_index = self.index
        for next_i in range(0, len(self.df)): # could've started the range at self.index as well, but this takes basically no time anyway and is safer.
            if (pd.isna(self.df.loc[next_i, "conspirative"])):
                self.index = next_i
                return
        self.index = None
        return
    
    def update_display(self):
        """Update the title and description to evaluate.
        """
        self.title_label.config(text = self._display_text(self.df.loc[self.index, "title"]))
        self.desc_label.config(text = self._display_text(self.df.loc[self.index, "description"]))
        self.update_counter()
    
    def undo(self):
        """Undoes the last evaluation & returns to the corresponding row.

        Only one step can be undone; the button is disabled again afterwards.

        Raises:
            ValueError: Triggered if no previous index is available.
        """
        if (self.prev_index is None):
            raise ValueError("Undo was triggered without a previous action. This should not be possible.")
        self.df.at[self.prev_index, "conspirative"] = pd.NA
        self.index = self.prev_index
        self.prev_index = None
        self.undo_button["state"] = "disabled"
        self.update_display()
    
    def next(self):
        """Finds & displays the next unevaluated row.

        Closes the app (saving first) once no unevaluated row is left.
        """
        self.update_index()
        # Undo is only offered if there is a previously shown row to go back to,
        # which is not the case for the very first row after launch.
        self.undo_button["state"] = "disabled" if self.prev_index is None else "normal"
        if self.index is not None:
            self.update_display()
        else:
            self.shutdown(completed = True)
        return

    def label_conspiracy(self, value : bool):
        """Sets the label according to the pressed button

        Args:
            value (bool): True if conspirative, False otherwise.
        """
        self.df.at[self.index, "conspirative"] = value
        self.next()

    def open_link(self, event):
        """Opens the video in a new browser tab

        Args:
            event (_type_): unused, necessary bc of tkinter
        """
        video_id = self.df.loc[self.index, "video_id"]
        video_url = f"https://www.youtube.com/watch?v={video_id}"
        webbrowser.open_new(video_url)

def launch_ConspiracyLabeler(filepath : str):
    """Opens the labelling window for a manual sample file and runs its event loop.

    Args:
        filepath (str): relative path to manual sample.
    """
    window = tk.Tk()
    # the app object is kept in a local so it stays referenced while the loop runs
    labeler = ConspiracyLabeler(root = window, filepath = filepath)
    window.mainloop()

In [9]:
# confirm and launch
# The labelling window is only opened after an explicit confirmation; labels are
# read from and saved back to the same csv that the sampling cell writes by default.

confirm_execution("Do you want to launch the labelling tool?")

launch_ConspiracyLabeler("data/manual_sample.csv")

# 5. Employ supervised text analysis to classify the dataset as conspirative or non-conspirative. Evaluate the performance.

## 5.1 Create video metadata dataframe

This will include all relevant video-level variables. 

In [8]:
# Copy every video record without its "comments" entry, keeping all other keys.
# NOTE(review): the recorded output of this cell is a NameError for `video_data`;
# the data has to be loaded in an earlier cell for this to run on a fresh kernel.
vd_nocomments = [{key : value for key, value in video.items() if key != "comments"} for video in video_data]

NameError: name 'video_data' is not defined

In [None]:
# Flatten the nested video records into columns and list the resulting column names.
pd.json_normalize(vd_nocomments).columns

Index(['phase', 'week_start', 'video_id', 'metadata.kind', 'metadata.etag',
       'metadata.id', 'metadata.snippet.publishedAt',
       'metadata.snippet.channelId', 'metadata.snippet.title',
       'metadata.snippet.description',
       ...
       'metadata.localizations.es-ES.title',
       'metadata.localizations.es-ES.description',
       'metadata.localizations.sh.title',
       'metadata.localizations.sh.description',
       'metadata.localizations.ik.title',
       'metadata.localizations.ik.description',
       'metadata.localizations.iu.title',
       'metadata.localizations.iu.description',
       'metadata.localizations.da.title',
       'metadata.localizations.da.description'],
      dtype='object', length=362)

# 6. Use the emotion classification model LEIA to obtain the sentiments expressed in the comments.