In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
import os
import sys

# Work from the parent directory so that the relative "datasets/..." paths used
# below resolve. The guard makes this cell idempotent: without it, every re-run
# would climb one directory higher.
# NOTE(review): assumes `datasets/` lives in the parent of the notebook
# directory and not next to the notebook itself — confirm.
if not os.path.isdir("datasets"):
    os.chdir("..")

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import torch

import flippers

In [4]:
# Seed every random number generator this notebook relies on.
# NumPy's global state drives the pandas `.sample(...)` calls and
# `np.random.choice`; PyTorch's drives the initialisation of newly created
# model weights.
SEED = 1
np.random.seed(SEED)
torch.manual_seed(SEED)

In [5]:
# Read the three pre-split CSV files (train / dev / test) into DataFrames.
train, dev, test = map(
    pd.read_csv,
    (
        "datasets/informative_youtube/train.csv",
        "datasets/informative_youtube/dev.csv",
        "datasets/informative_youtube/test.csv",
    ),
)

In [6]:
# (rows, columns) of each split, in train / dev / test order.
tuple(frame.shape for frame in (train, dev, test))

((164950, 16), (20619, 16), (20619, 16))

In [7]:
# Peek at a few random training rows, restricted to the text columns.
preview_columns = ["title", "channelTitle", "tags"]
train[preview_columns].sample(5)

Unnamed: 0,title,channelTitle,tags
57427,Inter 4-0 Genoa | Inter kick off title defence...,Serie A,Ronaldo|Serie A|Dybala|highlights|Juventus|AC ...
132056,I Built Minecraft's Most HIDDEN Base,Wenzo,minecraft smp|dream smp|smp|minecraft|demisesm...
91229,I WASN'T READY! Obi-Wan Kenobi Episode 6 BREAK...,Star Wars Comics,darth vader|vader vs|star wars comics|star war...
18120,CASH NASTY AND JULIAN NEWMAN HEATED 2V2! | Cas...,Creator League,[None]
3924,I do not recommend: Sonic Frontiers (Review),Skill Up,skill up|skill|up|gameplay|games|guide|sonic f...


In [8]:
# Create labeling functions to find informative videos
from flippers.lfs.lfs import LF_List

# Class ids passed to `@lfs.add(...)`: the class a labeling function votes for.
OTHER, INFORMATIVE = 0, 1

# Container that the `@lfs.add(<class id>)` decorators below register into.
lfs = LF_List()


@lfs.add(OTHER)
def contains_more_than_2_caps_lock_words(df):
    """Vote OTHER for titles containing a run of two or more capital letters.

    NOTE(review): despite the name, the pattern fires on a single run of 2+
    consecutive ASCII capitals (one shouted word or an acronym is enough); it
    does not count words. The name is kept because it is used as the column
    name in the label matrix.

    Args:
        df: DataFrame with a string ``title`` column.

    Returns:
        Boolean Series aligned with ``df``. Missing titles yield ``False``
        instead of propagating NaN into the votes.
    """
    return df["title"].str.contains(r"[A-Z]{2,}", na=False)


@lfs.add(OTHER)
def contains_more_than_2_marks(df):
    """Vote OTHER for titles containing an exclamation or question mark.

    NOTE(review): despite the name, a single ``!`` or ``?`` is enough to
    trigger this function; marks are not counted. The name is kept because it
    is used as the column name in the label matrix.

    Args:
        df: DataFrame with a string ``title`` column.

    Returns:
        Boolean Series aligned with ``df``. Missing titles yield ``False``
        instead of propagating NaN into the votes.
    """
    return df["title"].str.contains(r"!|\?", na=False)


# Keywords treated as evidence that a video is NOT informative. They are joined
# with "|" into a single regular expression and searched for inside the
# lower-cased `tags` string, so each entry matches as a substring: short
# entries such as "top" or "ball" also hit longer tags that contain them.
non_informative_tags = [
    "funny",
    "gaming",
    "game",
    "minecraft",
    "fortnite",
    "console",
    "ps4",
    "xbox",
    "nintendo",
    "movie",
    "netflix",
    "cartoon",
    "anime",
    "music",
    "song",
    "food",
    "vlog",
    "short",
    "unboxing",
    "tik",
    "review",
    "stream",
    "season",
    "top",
    "news",
    "trailer",
    "politics",
    "ball",
]


@lfs.add(INFORMATIVE)
def doesnt_contain_non_informative_tags(df):
    """Vote INFORMATIVE when none of ``non_informative_tags`` occurs in the tags.

    The keywords are OR-ed into one regular expression and matched as
    substrings of the lower-cased ``tags`` string.

    Args:
        df: DataFrame with a string ``tags`` column.

    Returns:
        Boolean Series aligned with ``df``; ``True`` where no keyword matched.
        Rows whose ``tags`` value is missing yield ``False`` (no vote).
    """
    # `na=True` marks missing tags as "matched" so that the negation below
    # turns them into False. Without an explicit `na`, a missing value makes
    # `str.contains` return an object Series containing NaN, and `~` on that
    # raises a TypeError.
    has_non_informative_tag = (
        df["tags"].str.lower().str.contains("|".join(non_informative_tags), na=True)
    )
    return ~has_non_informative_tag


# Keywords treated as evidence that a video IS informative. They are joined
# with "|" into a single regular expression and searched for inside the
# lower-cased `tags` string, so each entry matches as a substring.
# NOTE(review): under substring matching "mathematics" is already covered by
# "math", and "computer science" by "science"; the extra entries are harmless.
informative_tags = [
    "science",
    "technology",
    "education",
    "history",
    "philosophy",
    "psychology",
    "economics",
    "math",
    "mathematics",
    "physics",
    "chemistry",
    "biology",
    "medicine",
    "health",
    "engineering",
    "computer science",
    "programming",
]


@lfs.add(INFORMATIVE)
def contains_informative_tags(df):
    """Vote INFORMATIVE when any of ``informative_tags`` occurs in the tags.

    The keywords are OR-ed into one regular expression and matched as
    substrings of the lower-cased ``tags`` string.

    Args:
        df: DataFrame with a string ``tags`` column.

    Returns:
        Boolean Series aligned with ``df``. Missing tags yield ``False``
        instead of propagating NaN into the votes.
    """
    return df["tags"].str.lower().str.contains("|".join(informative_tags), na=False)

In [9]:
# Apply the registered labeling functions to the training frame. The result has
# one column per labeling function (see the rendered output).
L_train = lfs.create_matrix(train)
L_train

Unnamed: 0,contains_more_than_2_caps_lock_words,contains_more_than_2_marks,doesnt_contain_non_informative_tags,contains_informative_tags
0,0.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
...,...,...,...,...
164945,1.0,1.0,0.0,0.0
164946,1.0,0.0,0.0,0.0
164947,1.0,0.0,0.0,0.0
164948,1.0,1.0,1.0,0.0


In [10]:
# Per-labeling-function diagnostics: polarity, coverage, confidence, overlaps,
# matches and conflicts (the columns of the rendered table).
flippers.summary(L_train, lfs.polarities)

Unnamed: 0,polarity,coverage,confidence,overlaps,matches,conflicts
contains_more_than_2_caps_lock_words,0,0.448,1.0,0.273,0.119,0.206
contains_more_than_2_marks,0,0.194,1.0,0.159,0.119,0.092
doesnt_contain_non_informative_tags,1,0.457,1.0,0.247,0.018,0.238
contains_informative_tags,1,0.038,1.0,0.026,0.018,0.016


# Create label model

In [11]:
from flippers.models import SnorkelModel

# NOTE(review): the positional arguments are presumably (polarities, number of
# classes, class prior of 90% OTHER / 10% INFORMATIVE) — confirm against the
# flippers `SnorkelModel` signature.
label_model = SnorkelModel(lfs.polarities, 2, [0.9, 0.1])
label_model.fit(L_train, epochs=100)

In [12]:
# Keep only the probability column of the INFORMATIVE class (index 1).
informative_proba = label_model.predict_proba(L_train)[:, INFORMATIVE]
train["y_pred_snorkel"] = informative_proba

In [13]:
# Spot-check titles the label model is confident are informative.
likely_informative = train.query("y_pred_snorkel > 0.8")
likely_informative[["title", "tags", "y_pred_snorkel"]].sample(n=20)

Unnamed: 0,title,tags,y_pred_snorkel
26102,Could the Higgs Boson Lead Us to Dark Matter?,Space|Outer Space|Physics|Astrophysics|Quantum...,0.867888
22756,The Truth about Nikola,engineering|science|technology|education|histo...,0.959692
110336,Natalie Noel's 6 Month Body Transformation,fitness|health|fitness journey|fitness documen...,0.959692
28109,Making a fart juice developed by the U.S. gove...,nileblue|nile|blue|red|science|chemistry|us|go...,0.959692
88025,The Plane That Will Change Travel Forever,engineering|science|technology|education|histo...,0.959692
41658,3 Years Living In An Abandoned Ghost Town!,Ghost Town Living|Brent Underwood|Cerro Gordo|...,0.867888
104929,Making the World's Purest Cookie,nileblue|nile|blue|red|science|chemistry|world...,0.959692
21933,The Strongest Magnets We've Ever Made,SciShow|science|Hank|Green|education|learn|com...,0.959692
105225,Big Red Machine - Renegade (feat. Taylor Swift...,Big Red Machine|Taylor Swift|Justin Vernon|Ren...,0.959692
63338,Feeding My Venus Flytrap Candy Instead Of Flies,Science|venus flytrap|candy|the action lab,0.959692


In [14]:
# Spot-check titles the label model is confident are NOT informative.
likely_other = train.query("y_pred_snorkel < 0.2")
likely_other[["title", "tags", "y_pred_snorkel"]].sample(n=20)

Unnamed: 0,title,tags,y_pred_snorkel
124313,EXCLUSIVE 992 GT3 RS vs Corvette Z06 vs BMW M4...,Hagerty|porsche|porsche 911|911 gt3 rs|gt3rs|9...,1.843079e-08
30946,When Anime Characters yell out their Attacks...,Anime|Attack on titan|One piece|my hero academ...,1.515499e-07
11635,We're having a BABY!!,pregnancy announcement|pregnant|telling my hus...,0.08771968
144400,Inside Miley Cyrus’s Versace Handbag | In The Bag,british vogue|miley cyrus|miley cyrus 2023|mil...,1.515499e-07
128326,Watch How Russia’s Military Attack on Ukraine ...,WSJ|The Wall Street Journal|Russia|Ukraine|Inv...,1.843079e-08
14246,Duke Dennis Goes Sneaker Shopping With Complex,sneakerhead|complex|complex originals|sneakers...,1.515499e-07
162991,JH Diesel Flipped His MegaTruck Into Cleetus' ...,lsxcalade|escalade|turbo|twin turbo|pfi speed|...,0.08771968
103472,Dog the Bounty Hunter Joins The Search for Bri...,People|people magazine|news|celebrities|interv...,1.843079e-08
120858,"After Waking From Two-Year Coma, Woman Names B...",NBC News Channel,1.515499e-07
110985,Fortnite Season 8 Carnage & Venom Mythic Symbi...,Fortnite|Fortnite Chapter 2 Season 8|All Bosse...,4.181558e-08


# Train a transformer

In [15]:
# !pip install transformers==4.28.0
# !pip install --upgrade accelerate evaluate

In [16]:
# Keep only the rows where the label model is confident either way, then turn
# them into (title, hard label) pairs for fine-tuning.
snorkel_proba = train["y_pred_snorkel"]
transformer_train = train[(snorkel_proba > 0.8) | (snorkel_proba < 0.2)]
sentences = transformer_train["title"].tolist()
labels = transformer_train["y_pred_snorkel"].gt(0.8).astype(int).tolist()

In [17]:
from sklearn.model_selection import train_test_split

# Split your data into train and validation sets: 10% of the weakly-labelled
# titles are held out, and the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified, so the share of positive labels in
# the validation part is left to chance.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    sentences, labels, random_state=42, test_size=0.1
)

In [18]:
from datasets import Dataset, DatasetDict

# Create Datasets for each split. The column names "text" and "labels" are the
# ones read later by the tokenisation step and by the Trainer.
train_dataset = Dataset.from_dict({"text": train_sentences, "labels": train_labels})
val_dataset = Dataset.from_dict({"text": val_sentences, "labels": val_labels})

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Undersample the training data to have a 50/50 split.
#
# The balanced dataset is rebuilt from `train_sentences` / `train_labels`
# instead of re-selecting rows of `train_dataset`. This keeps the cell
# idempotent: re-running it can no longer apply indices computed on the full
# split to an already-undersampled dataset.

# Get the number of positive samples in the training split
train_positives = sum(train_labels)

# Get the indices for the positive and negative samples
label_array = np.asarray(train_labels)
positive_indices = np.flatnonzero(label_array == 1)
negative_indices = np.flatnonzero(label_array == 0)

# Size of each class after balancing: the size of the smaller class. Sampling
# more rows than a class holds with replace=False would raise a ValueError.
n_per_class = min(len(positive_indices), len(negative_indices))

# Only downsample the positives when they are the majority class; otherwise
# all of them are kept, in their original order.
if len(positive_indices) > n_per_class:
    positive_indices = np.random.choice(positive_indices, n_per_class, replace=False)

# Randomly sample the same number of negatives
random_negative_indices = np.random.choice(
    negative_indices, n_per_class, replace=False
)

# Concatenate the positive indices with the random negative indices
under_sample_indices = np.concatenate([positive_indices, random_negative_indices])

# Build the balanced training dataset from the original Python lists
train_dataset = Dataset.from_dict(
    {
        "text": [train_sentences[i] for i in under_sample_indices],
        "labels": [train_labels[i] for i in under_sample_indices],
    }
)

In [20]:
# Combine them into a DatasetDict
dataset_dict = DatasetDict({"train": train_dataset, "validation": val_dataset})

# Evaluate on a fixed-seed random subset of the validation split to keep
# evaluation fast. The subset size is capped at the split size so that a
# validation split with fewer rows does not raise an index error.
N_EVAL_SAMPLES = 300
shuffled_validation = dataset_dict["validation"].shuffle(seed=42)
small_test_dataset = shuffled_validation.select(
    range(min(N_EVAL_SAMPLES, len(shuffled_validation)))
)

In [21]:
# Load the tokenizer of the "distilbert-base-uncased" checkpoint, the same
# checkpoint name the classification model is created from.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [22]:
# Prepare the text inputs for the model
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level tokenizer.

    Args:
        examples: Mapping with a ``"text"`` entry, as passed by ``Dataset.map``.

    Returns:
        The tokenizer output for those texts, with truncation enabled.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Tokenize both splits in batches. `tokenized_test` comes from
# `small_test_dataset`, i.e. the subset taken from the validation split.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)

                                                                  

In [23]:
# Use a data collator to convert our samples to PyTorch tensors and batch them
# together with the correct amount of padding
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [24]:
# Define DistilBERT as our base model, with a 2-way classification head
# (label 0 = OTHER, label 1 = INFORMATIVE, matching the training labels).
# The "newly initialized" warning printed below refers to that head.
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifi

In [25]:
# Setup evaluation: load the F1 metric once; `compute_metrics` reuses it.
import evaluate

metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class predictions
        compared against ``labels``.
    """
    raw_logits, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(raw_logits, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [26]:
# Define a new Trainer with all the objects we constructed so far
from transformers import TrainingArguments, Trainer

# Used only as the local output directory below, because push_to_hub is False.
repo_name = "finetuning-informative-model-3000-samples"

training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    push_to_hub=False,
)

# Training uses the tokenized, undersampled training split; evaluation uses
# `tokenized_test`, the subset drawn from the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [27]:
# Train the model for the number of epochs configured in `training_args`
trainer.train()

  0%|          | 0/444 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 444/444 [00:47<00:00,  9.39it/s]

{'train_runtime': 47.2807, 'train_samples_per_second': 150.252, 'train_steps_per_second': 9.391, 'train_loss': 0.24234649726936408, 'epoch': 2.0}





TrainOutput(global_step=444, training_loss=0.24234649726936408, metrics={'train_runtime': 47.2807, 'train_samples_per_second': 150.252, 'train_steps_per_second': 9.391, 'train_loss': 0.24234649726936408, 'epoch': 2.0})

In [28]:
# Compute the evaluation metrics on the Trainer's eval_dataset (`tokenized_test`)
trainer.evaluate()

100%|██████████| 19/19 [00:00<00:00, 46.57it/s]


{'eval_loss': 0.16165262460708618,
 'eval_f1': 0.3571428571428571,
 'eval_runtime': 0.4212,
 'eval_samples_per_second': 712.283,
 'eval_steps_per_second': 45.111,
 'epoch': 2.0}

In [29]:
from transformers import TextClassificationPipeline

# Wrap the fine-tuned model for inference on raw strings.
# `return_all_scores=True` yields one {"label", "score"} dict per class for
# every input, listed in class-index order (LABEL_0, then LABEL_1).
# Use the first GPU when CUDA is available and fall back to the CPU (-1)
# otherwise, so the notebook also runs on machines without a GPU.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    device=0 if torch.cuda.is_available() else -1,
)



In [30]:
# Sanity check on a hand-written, maths-themed title.
pipe("Solving the Riemann Hypothesis")

[[{'label': 'LABEL_0', 'score': 0.007189882919192314},
  {'label': 'LABEL_1', 'score': 0.9928101301193237}]]

In [31]:
# Sanity check on a hand-written video-game title.
pipe("Super Smash Bros. Ultimate - The Fastest Way to Unlock All Characters")

[[{'label': 'LABEL_0', 'score': 0.9876031279563904},
  {'label': 'LABEL_1', 'score': 0.012396924197673798}]]

In [32]:
# Sanity check on a hand-written how-to title.
pipe("How to make a YouTube video")

[[{'label': 'LABEL_0', 'score': 0.023730454966425896},
  {'label': 'LABEL_1', 'score': 0.976269543170929}]]

In [33]:
# Score a random 1000-row subset of the dev split with the fine-tuned model.
# NOTE(review): this rebinds `dev` to the subset, so the full dev frame is no
# longer available after this cell runs.
dev = dev.sample(1000)
y_pred_dev = pipe(dev["title"].tolist())

In [34]:
# Keep, for every title, the score the classifier assigns to LABEL_1 (the
# INFORMATIVE class). The entry is looked up by label rather than by position,
# so the result stays correct even if the pipeline returns the per-class
# scores in a different order (for example sorted by score).
y_pred_dev = [
    next(entry["score"] for entry in scores if entry["label"] == "LABEL_1")
    for scores in y_pred_dev
]

In [35]:
# Attach the per-title scores to the sampled dev rows (same order as the
# titles that were passed to the pipeline).
dev["y_pred"] = y_pred_dev

In [36]:
# Spot-check dev titles the fine-tuned model scores as clearly informative.
confident_informative = dev.query("y_pred > 0.9")
confident_informative[["title", "y_pred"]].sample(n=15)

Unnamed: 0,title,y_pred
13580,How these impossibly thin cuts are made,0.995277
16334,Richard Cabral explains how to stay out of tro...,0.973565
5376,How A Sheep Changed 2b2t History Forever,0.987565
13903,Why a Russian invasion of Ukraine appears immi...,0.988424
2893,I Transformed My Washing Machine into a Fish Tank,0.994063
19770,How I made friends with the scary art teacher,0.97866
18681,Social Constructs | Philosophy Tube,0.991551
12575,that feeling when you bite into a pickle and i...,0.943352
19038,The Making Of Fallin’ | Why Don’t We,0.94478
3615,The Dark Side of Being an Influencer According...,0.963936


In [37]:
# Spot-check dev titles the fine-tuned model scores as clearly not informative.
confident_other = dev.query("y_pred < 0.3")
confident_other[["title", "y_pred"]].sample(n=15)

Unnamed: 0,title,y_pred
4387,Mighty Morphin Power Rangers: Once & Always 30...,0.009529
18057,Special Look | The Mandalorian | Disney+,0.007669
10854,"Game Theory: FNAF, A Family REBUILT (Ultimate ...",0.010065
15431,The Funniest Minecraft Mod Ever,0.010894
10110,"Anuel AA, Dj Luian, Mambo Kingz - Mejor Que Yo...",0.017154
5211,Overnight In Bikini Bottom!,0.013045
18841,Hozwal ❌ Jay Wheeler - Cual De Los Dos 🥀 ( VId...,0.009621
10753,Doja Cat ‚Äì You Right (Behind The Scenes),0.015674
4580,Ja Morant suspended from all Grizzlies' activi...,0.008036
11264,Amazing beautiful Cute and Smart Baby Dog / Lo...,0.022718
