<div style="text-align: center;" >
<h1 style="margin-top: 0.2em; margin-bottom: 0.1em;">Social Media Data Analysis - Final Project</h1>
<h4 style="margin-top: 0.7em; margin-bottom: 0.3em; font-style:italic">Data Processing</h2>
<h4 style="margin-top: 0.7em; margin-bottom: 0.3em; font-style:italic">Julia King</h4>
</div>
<br>

In [1]:
# import chunk

from common_functions import confirm_execution, read_video_json, read_manual_sample, write_manual_sample, read_video_df, write_video_df, convert_conspirative_to_bool

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from copy import deepcopy

# manual sample
import random
import csv
from sklearn.model_selection import train_test_split

# tkinter app
import tkinter as tk
from tkinter import messagebox
import pandas as pd
import webbrowser

# text cleaning
import regex as re
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download(["stopwords", "punkt", "wordnet"])
from sklearn.feature_extraction.text import CountVectorizer

# models
import time
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\julia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\julia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\julia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 4. Based on the title and description, classify a sample of the videos as conspirative or non-conspirative by hand.

In [2]:
# function creating sample dataset

def sample_videos(video_json: list, n : int = 250, seed : int = 94) -> pd.DataFrame:
    # generate docstring by typing """
    
    # get n random videos
    random.seed(seed)
    vd_sample = random.sample(video_json, k = n)
    
    # create df
    sample = pd.DataFrame(data = {
        "video_id" : [video["video_id"] for video in vd_sample], 
        "title" : [video["metadata"]["snippet"]["title"] for video in vd_sample], 
        "description" : [video["metadata"]["snippet"]["description"] for video in vd_sample], 
        "conspirative" : pd.NA
    })
    return sample

In [3]:
# draw training sample

confirm_execution("Do you want to reset the manually evaluated sample?")

manual_sample = sample_videos(video_json, n = 400)

write_manual_sample(manual_sample)

Exception: Execution aborted by user.

In [4]:
# create tkinter app for labelling

class ConspiracyLabeler:
    def __init__(self, root, filepath):
        self.root = root
        self.filepath = filepath
        self.df = read_manual_sample(self.filepath)
        self.index = 0
        self.prev_index = None

        # display how many i still need to label
        self.counter_label = tk.Label(root, text = "", font = ("Arial", 12))
        self.counter_label.pack(pady = 10)

        # create title display
        self.root.title("Conspirative Labeler")
        self.title_label = tk.Label(root, text = "", font = ("Arial", 16))
        self.title_label.pack(pady = 10)

        # create scrollable description
        self.desc_frame = tk.Frame(root)
        self.desc_frame.pack(pady = 10, fill = "both", expand = True)
        self.canvas = tk.Canvas(self.desc_frame, height = 100)  # Set an initial height for the canvas
        self.scrollbar = tk.Scrollbar(self.desc_frame, orient = "vertical", command = self.canvas.yview)
        self.scrollable_frame = tk.Frame(self.canvas)
        
        self.canvas.configure(yscrollcommand = self.scrollbar.set)
        self.scrollbar.pack(side = "right", fill = "y")
        self.canvas.pack(side = "left", fill = "both", expand = True)
        self.canvas.create_window((0, 0), window = self.scrollable_frame, anchor = "nw")
        self.scrollable_frame.bind("<Configure>", lambda e: self.canvas.configure(scrollregion = self.canvas.bbox("all")))

        self.desc_label = tk.Label(self.scrollable_frame, text = "", wraplength = 400, justify = "left", font = ("Arial", 12))
        self.desc_label.pack()

        # add clickable link
        self.link_label = tk.Label(root, text = "Open video in browser", fg = "blue", cursor = "hand2", font = ("Arial", 12))
        self.link_label.pack(pady = 10)
        self.link_label.bind("<Button-1>", self.open_link)
        
        # other buttons
        self.undo_button = tk.Button(root, text = "Undo", command = self.undo, width = 10, state = "disabled")
        self.undo_button.pack(pady = 10)
        
        self.close_button = tk.Button(root, text = "Save and Close", command = self.shutdown, width = 12)
        self.close_button.pack(pady = 10)

        self.true_button = tk.Button(root, text = "[c]onspirative", command = lambda : self.label_conspiracy(True), width = 15)
        self.true_button.pack(side = "left", padx = 20)

        self.false_button = tk.Button(root, text = "[n]on-conspirative", command = lambda : self.label_conspiracy(False), width = 15)
        self.false_button.pack(side = "right", padx = 20)

        self.root.bind("c", lambda event : self.label_conspiracy(True))
        self.root.bind("n", lambda event : self.label_conspiracy(False))
        self.root.bind("<Configure>", self.update_wraplength)

        self.next()
    
    def shutdown(self, completed : bool = False):
        """Saves current dataframe to file and closes the app.
        
        Args:
            completed (bool): Should be set to true if the shutdown was triggered due to the user finishing the labels.
        """
        write_manual_sample(self.df, filepath = self.filepath)
        if (completed):
            messagebox.showinfo("Finished", "All videos have been labelled! \nThe app will now close.")
        else:
            messagebox.showinfo("Save successful!", "The app will now close. \nSee you next time!")
        self.root.destroy()

    def update_counter(self):
        """Updates the counter label with the number of labeled and remaining items.
        """
        total_items = len(self.df)
        labeled_items = len(self.df.dropna(subset = ["conspirative"]))
        remaining_items = total_items - labeled_items
        self.counter_label.config(text = f"Labeled: {labeled_items} / Remaining: {remaining_items}")
    
    def update_wraplength(self, event):
        """Updates the wraplength of the description based on the window width. Purely for aesthetics
        """
        new_width = self.canvas.winfo_width() - 20  # padding
        self.desc_label.config(wraplength = new_width)

    def update_index(self):
        """Finds the next row without an evaluation and sets the indexes accordingly.
        """
        self.prev_index = self.index
        for next_i in range(0, len(self.df)): # could've started the range at self.index as well, but this takes basically no time anyway and is safer.
            if (pd.isna(self.df.loc[next_i, "conspirative"])):
                self.index = next_i
                return
        self.index = None
        return
    
    def update_display(self):
        """Update the title and description to evaluate.
        """
        self.title_label.config(text = self.df.loc[self.index, "title"])
        self.desc_label.config(text = self.df.loc[self.index, "description"])
        self.update_counter()
    
    def undo(self):
        """Undoes the last evaluation & returns to the corresponding row.

        Raises:
            ValueError: Triggered if no previous index is available.
        """
        if (self.prev_index is None):
            raise ValueError("Undo was triggered without a previous action. This should not be possible.")
        self.df.at[self.prev_index, "conspirative"] = pd.NA
        self.index = self.prev_index
        self.prev_index = None
        self.undo_button["state"] = "disabled"
        self.update_display()
    
    def next(self):
        """Finds & displays the next unevaluated row.
        """
        self.update_index()
        self.undo_button["state"] = "normal"
        if not(self.index is None):
            self.update_display()
        else:
            self.shutdown(completed = True)
        return

    def label_conspiracy(self, value : bool):
        """Sets the label according to the pressed button

        Args:
            value (bool): True if conspirative, False otherwise.
        """
        self.df.at[self.index, "conspirative"] = value
        self.next()

    def open_link(self, event):
        """Opens the video in a new browser tab

        Args:
            event (_type_): unused, necessary bc of tkinter
        """
        video_id = self.df.loc[self.index, "video_id"]
        video_url = f"https://www.youtube.com/watch?v={video_id}"
        webbrowser.open_new(video_url)

def launch_ConspiracyLabeler(filepath : str):
    """Launches the Conspiracy labeller to manually evaluate the videos.

    Args:
        filepath (str): relative path to manual sample.
    """
    label_root = tk.Tk()
    label_app = ConspiracyLabeler(label_root, filepath = filepath)
    label_root.mainloop()

In [5]:
# confirm and launch

confirm_execution("Do you want to launch the labelling tool?")

launch_ConspiracyLabeler("data/manual_sample.csv")

# 5. Employ supervised text analysis to classify the dataset as conspirative or non-conspirative. Evaluate the performance.

## 5.1 Create video metadata dataframe

This will include all relevant video-level variables. 

In [2]:
video_json = read_video_json()
manual_sample = read_manual_sample()

In [3]:
vd_nocomments = [{key : value for key, value in video.items() if key != "comments"} for video in video_json]

In [4]:
# create function to make/reset video df

def create_video_df(video_json : list, manual_sample : pd.DataFrame) -> pd.DataFrame:
    """Creates an initial video dataframe from video_json. 

    Args:
        video_json (list): Obtained in data_collection
        manual_sample (pd.DataFrame) : the results of the manual sampling

    Returns:
        pd.DataFrame: dataframe containing relevant info from video metadata, manual eval results and empty cols for automatic labelling
    """
    
    relevant_cols = {
        "phase": "phase", 
        "week_start": "week_start", 
        "video_id": "id", 
        "metadata.snippet.title": "title", 
        "metadata.snippet.description": "description", 
        "metadata.statistics.viewCount": "views",
        "metadata.statistics.likeCount": "likes", 
        "metadata.statistics.commentCount": "comments"
    }
    
    # start with known fields
    video_df = pd.json_normalize(vd_nocomments)
    video_df = video_df.rename(columns = relevant_cols)
    video_df = video_df[[*relevant_cols.values()]]
    
    # add cols for automatic evaluation
    label_cols = ["conspirative_" + method for method in ["manual", "nbayes", "svm", "rforest"]]
    video_df[label_cols] = pd.NA
    for col in label_cols:
        video_df[col] = pd.to_numeric(video_df[col], errors = "coerce")
        video_df[col] = video_df[col].astype("boolean")
    
    # add manual sample results
    video_df["conspirative_manual"] = pd.merge(left = video_df, right = manual_sample[["video_id", "conspirative"]], left_on = "id", right_on = "video_id", how = "left")["conspirative"]

    # make id the index
    video_df = video_df.set_index("id")
    
    return video_df

In [5]:
# create/reset video df

confirm_execution("Do you want to reset the video df? All automatic labels will be lost.")

video_df = create_video_df(video_json, manual_sample)

write_video_df(video_df)

## 5.2 text preprocessing

In [2]:
video_df = read_video_df()

In [3]:
# function for preprocessing with subfunctions for individual processing

def clean_string(text : str) -> str:
    """if the text argument is a string, removes links & all non alpha-numeric characters, tokenizes, lemmatizes, and removes stopwords

    Args:
        text (str): string, ideally containing multiple words

    Returns:
        str: cleaned string
    """
    if type(text) != str:
        return text
    
    # remove links
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)
    
    # remove newline
    text = re.sub(r"\n", " ", text)
    
    # remove everything except alphabetical and numerical characters
    text = re.sub("[^a-zA-Z0-9]", " ", text)
    
    # tokenize & lemmatize
    text = nltk.word_tokenize(text.lower())
    lemma = WordNetLemmatizer()
    text = [lemma.lemmatize(word) for word in text]
    
    # remove stopwords
    stopwords = nltk.corpus.stopwords.words("english")
    text = [word for word in text if word not in stopwords]
    
    # join together again
    text = " ".join(text)

    return text

def clean_cols(df : pd.DataFrame) -> pd.Series:
    """takes all columns in data frames, combines their content into a single string and cleans them

    Args:
        df (pd.DataFrame): df to apply clean_string to

    Returns:
        pd.DataFrame: series containing the cleaned version of the combined texts
    """
    
    # create copy so original is not unintentionally overwritten
    df_copy = deepcopy(df)
    df_copy = df_copy.astype(str)

    texts = df_copy.agg(' '.join, axis = 1)
    
    return texts.apply(lambda text : clean_string(text))

In [4]:
# apply cleaning function

video_df["clean"] = clean_cols(video_df[["title", "description"]])

# reorder
cols = list(video_df.columns)
cols.remove("clean")
cols.insert(5, "clean")
video_df = video_df[cols]

video_df.head(3)

Unnamed: 0_level_0,phase,week_start,title,description,views,clean,likes,comments,conspirative_manual,conspirative_nbayes,conspirative_svm,conspirative_rforest
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Gg38BtcNioY,0,2022-03-20,The Office of Community Development! 15-Minute...,🛣️Watch the next episode: https://youtu.be/Vd2...,991,office community development 15 minute city 10...,100.0,64,,,,
Ne6nrlB_3os,0,2022-03-20,URBTalk: 20 Minute Neighbourhood,URBTalk presentation on the 20 minute neighbou...,166,urbtalk 20 minute neighbourhood urbtalk presen...,1.0,0,,,,
2mezuB5BwnA,0,2022-03-20,Expo 2020 Dubai Will Reopen As A 15-Minute Cit...,.,114,expo 2020 dubai reopen 15 minute city 10 pavil...,0.0,0,,,,


In [5]:
# function that returns bag of words for given df

def get_bow(df : pd.DataFrame, text_col : str = "clean") -> pd.DataFrame:
    # generate docstring by typing """
    vectorizer = CountVectorizer(min_df = 5) # setting a minimum number of appearances to keep outliers out

    # get bag of words
    vectorizer.fit(df[text_col])
    bow_matrix = vectorizer.transform(df[text_col])
    bow_df = pd.DataFrame(bow_matrix.toarray(), index = df.index, columns = vectorizer.get_feature_names_out()) # convert to df and replace indexes with video id and word name
    
    # sort the df by the row sums in descending order
    bow_df = bow_df = bow_df[bow_df.sum(axis = 0).sort_values(ascending=False).index]
    return bow_df

In [6]:
# get bow for entire df

video_bow = get_bow(video_df)

video_bow.head(3)

Unnamed: 0_level_0,city,minute,15,video,de,channel,urban,like,use,nan,...,selection,buzz,monitored,buymeacoffee,alley,extend,sensory,monetary,integrate,belong
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Gg38BtcNioY,6,2,2,1,0,1,0,2,2,0,...,0,0,0,0,0,0,0,0,0,0
Ne6nrlB_3os,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2mezuB5BwnA,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 5.3 prepare train and test sets

In [7]:
# extract manually sampled videos & set up train-test-split

manual_df = video_df.dropna(subset = ["conspirative_manual"])
manual_bow = get_bow(manual_df)

manual_bow.head(3)

Unnamed: 0_level_0,city,minute,15,video,de,like,use,channel,urban,news,...,larryskylines,edited,pollution,small,order,dystopia,transportation,break,training,committed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Vd2RawW6HAE,7,2,2,0,0,2,2,0,0,0,...,1,0,0,0,0,0,0,0,0,0
orpZAEIzIWc,0,3,0,1,0,1,2,0,0,0,...,0,0,0,0,0,0,0,0,1,0
bsuC9nkhxm0,4,3,3,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# set up train test split

train_x, test_x, train_y, test_y = train_test_split(manual_bow, manual_df["conspirative_manual"], test_size = 0.25, random_state = 94)

In [9]:
# prepare df containing the test set
test_df = manual_df.loc[test_x.index]

## 5.4 model training & prediction

In [10]:
# naive bayes
nbayes = GaussianNB()
nbayes.fit(train_x, train_y)

test_df["conspirative_nbayes"] = nbayes.predict(test_x)

In [11]:
# survey vector machine
svm = SVC()
svm.fit(train_x, train_y)

test_df["conspirative_svm"] = svm.predict(test_x)

In [12]:
# random forest
rforest = RandomForestClassifier()
rforest.fit(train_x, train_y)

test_df["conspirative_rforest"]  = rforest.predict(test_x)

In [14]:
# convert 1 to true and 0 to false

test_df = convert_conspirative_to_bool(test_df)

test_df.head(3)

Unnamed: 0_level_0,phase,week_start,title,description,views,clean,likes,comments,conspirative_manual,conspirative_nbayes,conspirative_svm,conspirative_rforest
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
zXhZ3_rxv4g,1,2023-02-12,15 Minute Cities | Report from Tiger Mountain,"In this Report, Richard discusses the globalis...",289,15 minute city report tiger mountain report ri...,15.0,6,True,False,True,True
2gUumw1q6d0,1,2023-02-19,Manifestación Oxford contra las ciudades de 1...,,2612,manifestaci n oxford contra la ciudades de 15 ...,94.0,7,False,True,True,False
B9OjtFp3dsc,1,2023-01-22,"""SUSTAINABLE DEVELOPMENT"" 15 MINUTE CITIES wit...",According to the website https://www.15minutec...,749,sustainable development 15 minute city big joh...,56.0,10,True,True,True,True


## 5.5 Evaluation

# 6. Use the emotion classification model LEIA to obtain the sentiments expressed in the comments.