In [2]:
import sys
from os import path
import os
from dotenv import load_dotenv

# Load variables from a .env file (python-dotenv) before the token is read below.
load_dotenv()

# Put the parent of the current working directory on the import path
# (presumably so the `project` package resolves — confirm against repo layout).
sys.path.append(os.path.dirname(os.getcwd()))

# Raises KeyError right here if the token is not configured.
hf_token = os.environ["HUGGING_FACE_API_KEY"]


In [3]:
import json
import pandas as pd
from project.models import YouTubeVideoInfo

# Parse the raw JSON first, then wrap every record in a YouTubeVideoInfo.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
with open("/home/leoli/Uni/Polimi/Thesis/master-thesis/data/youtube-common-10k/videos_infos.json", "r") as videos_file:
    raw_video_records = json.load(videos_file)

videos = []
for raw_record in raw_video_records:
    videos.append(YouTubeVideoInfo.from_json(raw_record))

print(len(videos))

9218


In [44]:
# Save experiment result utility
import json
from project.utils.json_utils import EnhancedJSONEncoder
from project.experiments.models import Experiment

def save_experiment(attribute_settings, attributes, completions_by_model, description, end_time, experiment_id, models, start_time, system_prompt):
    """Build an Experiment from the given run state and write it to disk as JSON.

    The file is named ``experiment-{experiment_id}.json`` and is created in the
    current working directory. It is written atomically: the JSON is first
    dumped to a sibling ``.tmp`` file and then moved over the target with
    ``os.replace``. An interrupted or failing dump therefore never truncates a
    previously saved checkpoint.

    Args:
        attribute_settings: Forwarded to ``Experiment.from_completions`` as
            ``attributes_settings``.
        attributes: Forwarded as ``attributes``.
        completions_by_model: Forwarded as ``completions_by_model``.
        description: Forwarded as ``description``.
        end_time: Forwarded as ``end_time``.
        experiment_id: Forwarded as ``id`` and used in the output filename.
        models: Forwarded as ``models``.
        start_time: Forwarded as ``start_time``.
        system_prompt: Forwarded as ``system_prompt``.

    Raises:
        Whatever ``Experiment.from_completions``, ``open``, ``json.dump`` or
        ``os.replace`` raise; the temporary file is removed before re-raising.
    """
    import os  # local import keeps this cell runnable on its own

    experiment = Experiment.from_completions(
        attributes_settings=attribute_settings,
        attributes=attributes,
        completions_by_model=completions_by_model,
        description=description,
        end_time=end_time,
        id=experiment_id,
        image_filename_format=None,
        models=models,
        start_time=start_time,
        system_prompt=system_prompt,
    )

    target_path = f"experiment-{experiment_id}.json"
    tmp_path = f"{target_path}.tmp"
    try:
        with open(tmp_path, "w") as f:
            json.dump(experiment, f, cls=EnhancedJSONEncoder)
        # Swap the finished file into place in a single step.
        os.replace(tmp_path, target_path)
    except BaseException:
        # Also covers KeyboardInterrupt: do not leave a half-written temp file behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

In [46]:
from huggingface_hub import InferenceClient
from project.llm_models import prompts
from datetime import datetime, timezone
import time


hf = InferenceClient(token=hf_token)

# --- Run configuration ---
description = "Assess performance of Mistral7B in the wild. With the simple prompt."
experiment_id = str(round(time.time()))
start_time = datetime.now(timezone.utc)
save_every_n_videos = 50
# Set to the path of a saved experiment JSON to continue that run instead of starting a new one.
resume_from_file = False # "/home/leoli/Uni/Polimi/Thesis/master-thesis/notebooks/experiment-1738491036.json"

models = ["mistralai/Mistral-7B-Instruct-v0.2"] # ["meta-llama/Llama-3.1-8B-Instruct", "mistralai/Mistral-7B-Instruct-v0.2", "google/gemma-1.1-7b-it"] # LLama3 (8B), Mistral (7B), Gemma (7B)
# Role under which the system prompt is sent, keyed by model name. None means
# the system prompt is prepended to the user message instead (see `messages` below).
# Keyed by name rather than by position so that selecting a subset of models,
# reordering them, or resuming a saved run cannot pair a model with the wrong role.
# NOTE(review): mapping reconstructed from the order of the commented-out model
# list above and the original positional role list — confirm.
system_role_by_model = {
    "meta-llama/Llama-3.1-8B-Instruct": "system",
    "mistralai/Mistral-7B-Instruct-v0.2": "system",
    "google/gemma-1.1-7b-it": None,
}
system_prompt = prompts.tiktok_paper_simple_prompt()
attributes = ["channel_title", "title", "description", "categories", "tags", "subtitles", "auto_subtitles", "comments"]
attributes_settings = {
    "max_subtitles_length": 1000, 
    "include_comments_replies": True,
}

completions_by_model = {}

if resume_from_file:
    # Resuming: every setting above is overridden by the saved experiment.
    with open(resume_from_file, "r") as f:
        experiment = Experiment.from_json(json.load(f))
    description = experiment.description
    experiment_id = experiment.id
    models = experiment.models
    start_time = experiment.start_time
    system_prompt = experiment.system_prompt
    attributes = experiment.attributes
    attributes_settings = experiment.attributes_settings
    completions_by_model = experiment.completions_by_model_and_video_id

# Resolve roles only after a possible resume, because `models` may have been replaced.
# An unknown model is an explicit error rather than being silently skipped.
models_without_role = [name for name in models if name not in system_role_by_model]
if models_without_role:
    raise ValueError(f"No system role configured for models: {models_without_role}")
system_roles = [system_role_by_model[name] for name in models]


def save_checkpoint():
    """Persist the current state of the run (notebook globals) via save_experiment."""
    save_experiment(
        attribute_settings=attributes_settings,
        attributes=attributes,
        completions_by_model=completions_by_model,
        description=description,
        end_time=datetime.now(timezone.utc),
        experiment_id=experiment_id,
        models=models,
        start_time=start_time,
        system_prompt=system_prompt,
    )


for model, system_role in zip(models, system_roles):
    if not completions_by_model.get(model):
        completions_by_model[model] = {}
    for i, video in enumerate(videos):
        print(f"Processing video {i + 1}/{len(videos)} ⚙️")
        if video.id in completions_by_model[model]:
            print(f"Skipping video {video.id} as it has already been processed")
            continue

        user_prompt = video.to_string_for_model_input(attributes_to_include=attributes, max_subtitles_length=attributes_settings["max_subtitles_length"], include_comments_replies=attributes_settings["include_comments_replies"])

        messages = [
            {
                "role": system_role,
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": user_prompt,
            },
        ] if system_role else [
            {
                "role": "user",
                "content": f"{system_prompt}\n\n{user_prompt}"
            }
        ]
        try:
            completion = hf.chat.completions.create(
                model=model, 
                messages=messages, 
            )
        except Exception as exception:
            # Best effort: a failed request is recorded as its error text so the run continues.
            # NOTE(review): because the error string is stored under the video id,
            # a resumed run treats this video as processed and will not retry it.
            completion = str(exception)
            print(completion)
            
        completions_by_model[model][video.id] = completion

        # Periodic checkpoint so a long run can be resumed.
        if (i + 1) % save_every_n_videos == 0:
            save_checkpoint()

# Final save once every model has been processed.
save_checkpoint()

Processing video 1/9218 ⚙️
Processing video 2/9218 ⚙️
Processing video 3/9218 ⚙️
Processing video 4/9218 ⚙️
Processing video 5/9218 ⚙️
Processing video 6/9218 ⚙️
Processing video 7/9218 ⚙️
Processing video 8/9218 ⚙️
Processing video 9/9218 ⚙️
Processing video 10/9218 ⚙️
Processing video 11/9218 ⚙️
Processing video 12/9218 ⚙️
Processing video 13/9218 ⚙️
Processing video 14/9218 ⚙️
Processing video 15/9218 ⚙️
Processing video 16/9218 ⚙️
Processing video 17/9218 ⚙️
Processing video 18/9218 ⚙️
Processing video 19/9218 ⚙️
Processing video 20/9218 ⚙️
Processing video 21/9218 ⚙️
Processing video 22/9218 ⚙️
Processing video 23/9218 ⚙️
Processing video 24/9218 ⚙️
Processing video 25/9218 ⚙️
Processing video 26/9218 ⚙️
Processing video 27/9218 ⚙️
Processing video 28/9218 ⚙️
Processing video 29/9218 ⚙️
Processing video 30/9218 ⚙️
Processing video 31/9218 ⚙️
Processing video 32/9218 ⚙️
Processing video 33/9218 ⚙️
Processing video 34/9218 ⚙️
Processing video 35/9218 ⚙️
Processing video 36/9218 ⚙️
P

In [4]:
import langdetect
from project.utils.subtitles_utils import text_from_subtitles

def detect_language(video: YouTubeVideoInfo) -> str | None:
    """Best-effort language guess for a video.

    Text sources are tried in this order, stopping at the first one that
    ``langdetect.detect`` handles without raising:

    1. ``video.description``
    2. ``video.subtitles`` (only if truthy), converted with ``text_from_subtitles``
    3. ``video.auto_subtitles`` (only if truthy), converted the same way
    4. ``video.title``

    Any exception raised while building or detecting a source just moves on to
    the next one.

    Returns:
        Whatever ``langdetect.detect`` returned for the first usable source,
        or ``None`` if every attempt raised.
    """
    # Sentinel so that a failed attempt can never be confused with a detect() result.
    failed = object()

    def attempt(make_text):
        # `make_text` is a thunk so the text is only built when this source is reached.
        try:
            return langdetect.detect(make_text())
        except Exception:
            return failed

    outcome = attempt(lambda: video.description)
    if outcome is failed and video.subtitles:
        outcome = attempt(lambda: text_from_subtitles(video.subtitles))
    if outcome is failed and video.auto_subtitles:
        outcome = attempt(lambda: text_from_subtitles(video.auto_subtitles))
    if outcome is failed:
        outcome = attempt(lambda: video.title)

    return None if outcome is failed else outcome


# langdetect is non-deterministic on short or ambiguous text unless its seed is
# pinned. Fix it so the English subset (and every count derived from it) is
# reproducible across kernel restarts.
langdetect.DetectorFactory.seed = 0

# Keep only the videos whose detected language is English.
english_videos = [video for video in videos if detect_language(video) == "en"]

In [51]:
experiment_file_path = "/home/leoli/Uni/Polimi/Thesis/master-thesis/notebooks/experiment-1738920290.json"

# Reload a saved experiment from disk.
with open(experiment_file_path, "r") as experiment_file:
    experiment = Experiment.from_json(json.load(experiment_file))


# For each model, list the videos whose parsed label marks them as conspiracy:
# first restricted to the English subset, then over the whole dataset.
# Entries whose label is not a dict are ignored.
for model, labels_by_id in experiment.predicted_labels_by_model_and_video_id.items():
    print(f"Predictions for model: {model}")

    print("Conspiracy videos in english:")
    for video in english_videos:
        label = labels_by_id.get(video.id)
        if not isinstance(label, dict):
            continue
        if label.get("is_conspiracy"):
            print(f"https://www.youtube.com/watch?v={video.id}")

    print("All detected conspiracy videos:")
    for video in videos:
        label = labels_by_id.get(video.id)
        if not isinstance(label, dict):
            continue
        if label.get("is_conspiracy"):
            print(f"https://www.youtube.com/watch?v={video.id}\t{detect_language(video)}")

Predictions for model: mistralai/Mistral-7B-Instruct-v0.2
Conspiracy videos in english:
https://www.youtube.com/watch?v=1wF4mQLX-14
https://www.youtube.com/watch?v=6a_ZqMQI8dE
https://www.youtube.com/watch?v=7y3gsNmcDj0
https://www.youtube.com/watch?v=qrkXQlUBcGI
https://www.youtube.com/watch?v=1LPbFT0LyM0
https://www.youtube.com/watch?v=iPwGBNUANvk
https://www.youtube.com/watch?v=RkdWZOy2TjU
https://www.youtube.com/watch?v=DHn_1zGlocs
https://www.youtube.com/watch?v=4EjGbeWLBpM
https://www.youtube.com/watch?v=2H51-4Aj5UI
https://www.youtube.com/watch?v=2067N8sO3-4
https://www.youtube.com/watch?v=0xxTMO7_Bao
https://www.youtube.com/watch?v=cEmEq9RMTa8
https://www.youtube.com/watch?v=kzcZqcyW5Pk
https://www.youtube.com/watch?v=G0TdDvvuF0Q
https://www.youtube.com/watch?v=v6Kk-EzEgNM
https://www.youtube.com/watch?v=fchsqVdL4UU
https://www.youtube.com/watch?v=1ZA5KJ2DNQk
https://www.youtube.com/watch?v=Bq26qo7mSiw
https://www.youtube.com/watch?v=3sSH-h41J4Y
https://www.youtube.com/watch?v=

In [52]:
# For each model, list the English videos whose parsed label is NOT conspiracy.
# Entries whose label is not a dict are ignored.
for model, labels_by_id in experiment.predicted_labels_by_model_and_video_id.items():
    print(f"Predictions for model: {model}")
    # The header previously read "Conspiracy videos in english:", which
    # contradicted the `not ... is_conspiracy` filter below.
    print("Non-conspiracy videos in english:")
    for video in english_videos:
        label = labels_by_id.get(video.id)
        if isinstance(label, dict) and not label.get("is_conspiracy"):
            print(f"https://www.youtube.com/watch?v={video.id}")

Predictions for model: mistralai/Mistral-7B-Instruct-v0.2
Conspiracy videos in english:
https://www.youtube.com/watch?v=_6Ef5HE3opA
https://www.youtube.com/watch?v=w4mthn8lzVE
https://www.youtube.com/watch?v=h8q1nxRYVgI
https://www.youtube.com/watch?v=9CElAu-TsiQ
https://www.youtube.com/watch?v=ojf2pADESmk
https://www.youtube.com/watch?v=HT6WxG_XcO4
https://www.youtube.com/watch?v=O1Kp07opq94
https://www.youtube.com/watch?v=TWIXumDS8qk
https://www.youtube.com/watch?v=OOMq-zOKQu8
https://www.youtube.com/watch?v=Ke0w8GLRRJk
https://www.youtube.com/watch?v=7EFaTlBgPl4
https://www.youtube.com/watch?v=5z_hNLwv6Fk
https://www.youtube.com/watch?v=c0PanNVqKXI
https://www.youtube.com/watch?v=3otvQpuZCmg
https://www.youtube.com/watch?v=swE57g8Lt3E
https://www.youtube.com/watch?v=akK3ojAjNrI
https://www.youtube.com/watch?v=E5gxhfHEg8Y
https://www.youtube.com/watch?v=JDJ4wV364k0
https://www.youtube.com/watch?v=wi0pn0ouBxs
https://www.youtube.com/watch?v=GPr1HfBWYJ0
https://www.youtube.com/watch?v=

In [53]:
# Per-model summary: how many videos have a parsed (dict) label and how many of
# those are flagged as conspiracy, for the full dataset and for the English subset.
for model, labels_by_id in experiment.predicted_labels_by_model_and_video_id.items():
    print(model)

    # Whole dataset.
    correctly_processed = [
        video.id
        for video in videos
        if isinstance(labels_by_id.get(video.id), dict)
    ]
    processed_count = len(correctly_processed)
    print(f"Correctly Processed: {processed_count}")
    print(f"Not correctly Processed: {len(videos) - processed_count}")

    positive_labeled = [
        video_id
        for video_id in correctly_processed
        if labels_by_id[video_id]["is_conspiracy"]
    ]
    print(f"Positive labeled: {len(positive_labeled)}")

    # English subset.
    print(f"English videos: {len(english_videos)}")
    correctly_processed_english_videos = [
        video.id
        for video in english_videos
        if isinstance(labels_by_id.get(video.id), dict)
    ]
    processed_english_count = len(correctly_processed_english_videos)
    print(f"Correctly Processed English Videos: {processed_english_count}")
    print(f"Not correctly Processed English Videos: {len(english_videos) - processed_english_count}")

    english_positive_labeled = [
        video_id
        for video_id in correctly_processed_english_videos
        if labels_by_id[video_id]["is_conspiracy"]
    ]
    print(f"English Positive labeled: {len(english_positive_labeled)}")


mistralai/Mistral-7B-Instruct-v0.2
Correctly Processed: 9217
Not correctly Processed: 1
Positive labeled: 158
English videos: 6950
Correctly Processed English Videos: 6949
Not correctly Processed English Videos: 1
English Positive labeled: 119


In [16]:
from itertools import takewhile, islice
import json
from project.experiments.models import Experiment

# Load experiment results
llama_experiment_file_path = "/home/leoli/Uni/Polimi/Thesis/master-thesis/notebooks/experiment-1738877044.json"
mistral_experiment_file_path = "/home/leoli/Uni/Polimi/Thesis/master-thesis/notebooks/experiment-1738920290.json"

with open(llama_experiment_file_path, "r") as f:
    llama_experiment = Experiment.from_json(json.load(f))

with open(mistral_experiment_file_path, "r") as f:
    mistral_experiment = Experiment.from_json(json.load(f))

# Extract labels (each experiment is read through its first listed model)
llama_labels = llama_experiment.predicted_labels_by_model_and_video_id[llama_experiment.models[0]]
mistral_labels = mistral_experiment.predicted_labels_by_model_and_video_id[mistral_experiment.models[0]]

# English videos for which BOTH models produced a parsed (dict) label.
# Computed once and reused by the three selections below.
english_videos_labeled_by_both = [
    video for video in english_videos
    if isinstance(llama_labels.get(video.id), dict)
    and isinstance(mistral_labels.get(video.id), dict)
]

# Videos on which the two models disagree
differently_labeled_videos = [
    video for video in english_videos_labeled_by_both
    if llama_labels[video.id]["is_conspiracy"] != mistral_labels[video.id]["is_conspiracy"]
]

# Id set for O(1) membership tests. The previous `video not in <list>` check
# scanned the whole list with object equality for every English video.
# Assumes video ids are unique within the dataset — TODO confirm.
differently_labeled_ids = {video.id for video in differently_labeled_videos}

# Fill the remaining labels with a balanced set of positive and negatively labeled instances.
# Both models agree on these videos, so reading Llama's label alone is enough.
negative_labeled_videos = [
    video for video in english_videos_labeled_by_both
    if video.id not in differently_labeled_ids
    and not llama_labels[video.id]["is_conspiracy"]
]

positive_labeled_labeled_videos = [
    video for video in english_videos_labeled_by_both
    if video.id not in differently_labeled_ids
    and llama_labels[video.id]["is_conspiracy"]
]

# Print the list: URL, Llama label, Mistral label (tab separated)
print("Different Labels:")
for video in differently_labeled_videos:
    llama_label = llama_labels[video.id]["is_conspiracy"]
    mistral_label = mistral_labels[video.id]["is_conspiracy"]
    print(f"https://www.youtube.com/watch?v={video.id}\t{llama_label}\t{mistral_label}")

print("Common Labels:")
# zip() stops at the shorter list, so positives and negatives stay balanced.
for i, (positive, negative) in enumerate(zip(positive_labeled_labeled_videos, negative_labeled_videos)):
    print(f"https://www.youtube.com/watch?v={positive.id}\t{True}\t{True}")
    print(f"https://www.youtube.com/watch?v={negative.id}\t{False}\t{False}")
    # NOTE(review): the check runs after printing, so up to 51 pairs (i = 0..50)
    # are emitted — confirm whether 50 was intended.
    if i >= 50:
        break

Different Labels:
https://www.youtube.com/watch?v=egBx1uG0hrs	True	False
https://www.youtube.com/watch?v=tcoZTuaDnWc	True	False
https://www.youtube.com/watch?v=BJ9GWieBrZA	True	False
https://www.youtube.com/watch?v=DVZVTzNB7AA	True	False
https://www.youtube.com/watch?v=H-6nvIiL1s8	True	False
https://www.youtube.com/watch?v=qrkXQlUBcGI	False	True
https://www.youtube.com/watch?v=1LPbFT0LyM0	False	True
https://www.youtube.com/watch?v=dQdvj1zvCdE	True	False
https://www.youtube.com/watch?v=KHYY12U_Idw	True	False
https://www.youtube.com/watch?v=iPwGBNUANvk	False	True
https://www.youtube.com/watch?v=RkdWZOy2TjU	False	True
https://www.youtube.com/watch?v=DHn_1zGlocs	False	True
https://www.youtube.com/watch?v=-0L5lkJdPJg	True	False
https://www.youtube.com/watch?v=2H51-4Aj5UI	False	True
https://www.youtube.com/watch?v=wnPKPl_ZTOE	True	False
https://www.youtube.com/watch?v=0xxTMO7_Bao	False	True
https://www.youtube.com/watch?v=kzcZqcyW5Pk	False	True
https://www.youtube.com/watch?v=04VUi_0NCp0	Tru