In [None]:
# setup and packages:

!pip install datasets transformers[torch] evaluate sentencepiece einops

In [None]:
from google.colab import drive

drive.mount('/content/drive')

%cd "drive/MyDrive/konfuzio/"

Mounted at /content/drive
/content/drive/MyDrive/konfuzio


In [None]:
# Show the version of the `python` executable found on the shell PATH.
!python --version

Python 3.10.12


# One Million Jokes from Reddit

In [None]:
import os
import pandas as pd

# Source data: the CSV file can be downloaded here:
# https://data.world/lexyr/one-million-reddit-jokes
# The path is relative, so the file is looked up in ./data of the current working directory.
reddit_file = os.path.join("data", "one-million-reddit-jokes.csv")

# Read the whole CSV into a DataFrame.
reddit_jokes = pd.read_csv(reddit_file)


# EDA

In [None]:
# Column overview: dtypes, non-null counts and memory usage of the raw frame.
reddit_jokes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 12 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   type            1000000 non-null  object
 1   id              1000000 non-null  object
 2   subreddit.id    1000000 non-null  object
 3   subreddit.name  1000000 non-null  object
 4   subreddit.nsfw  1000000 non-null  bool  
 5   created_utc     1000000 non-null  int64 
 6   permalink       1000000 non-null  object
 7   domain          1000000 non-null  object
 8   url             4472 non-null     object
 9   selftext        995525 non-null   object
 10  title           1000000 non-null  object
 11  score           1000000 non-null  int64 
dtypes: bool(1), int64(2), object(9)
memory usage: 84.9+ MB


In [None]:
# Idea: give a low rating to jokes that are flagged as NSFW?

reddit_jokes["subreddit.nsfw"].value_counts()

# Turns out that no row is flagged as NSFW, so the column cannot be used for that.

False    1000000
Name: subreddit.nsfw, dtype: int64

In [None]:
# Number of distinct post ids (a missing id, if there were one, would count as one value).
reddit_jokes["id"].nunique(dropna=False)

1000000

In [None]:
# Peek at the post with index label 1: body text first, then its title.
print(reddit_jokes.at[1, "selftext"], reddit_jokes.at[1, "title"])

It's called Google Sheets. Did you know Google now has a platform for recording your bowel movements?


In [None]:
# Score of the same post (index label 1).
reddit_jokes.at[1, "score"]

9

In [None]:
# Summary statistics of the raw scores (count, mean, std, quantiles, extremes).
reddit_jokes["score"].describe()

count    1000000.000000
mean         139.691119
std         1674.046005
min            0.000000
25%            0.000000
50%            1.000000
75%            7.000000
max       142733.000000
Name: score, dtype: float64

In [None]:
# Do not truncate long strings when a frame or series is displayed.
pd.set_option("display.max_colwidth", None)

# Body text of the post(s) that reach the maximum score.
top_scored = reddit_jokes.query("score == score.max()")
top_scored["selftext"]

29360    RIP Larry Tesler, the UI designer that created Cut, Copy and Paste, died age 74
Name: selftext, dtype: object

In [None]:
# Body texts of the five highest-scored posts.
top_five = reddit_jokes.nlargest(n=5, columns=["score"])
top_five["selftext"]

29360                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              RIP Larry Tesler, the UI designer that created Cut, Copy and Paste, died age 74
511072                                               

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Naive attempt: rescale the raw scores linearly into the range 1..10.
scaler = MinMaxScaler(feature_range=(1, 10))

# The scaler is fed a 2-D column, hence the reshape.
score_column = reddit_jokes["score"].to_numpy().reshape(-1, 1)
scaled_score_full = pd.DataFrame(scaler.fit_transform(score_column))
scaled_score_full.describe()

# Problem: the result is extremely right-skewed (nearly all values sit at the lower end),
# the 75 percent quantile is only 1.000441.


Unnamed: 0,0
count,1000000.0
mean,1.008808
std,0.105557
min,1.0
25%,1.0
50%,1.000063
75%,1.000441
max,10.0


# Start processing reddit_jokes.

In [None]:
# The joke is usually split across title and selftext, so both are combined to get the full joke.
# Some posts additionally carry an "EDIT: comment"; it is usually not part of the actual joke
# and is therefore stripped.
# Note: rows without selftext end up with a missing full_text, because concatenating a string
# with a missing value yields a missing value.

reddit_jokes["full_text"] = reddit_jokes["title"] + " \n " + reddit_jokes["selftext"]

# regex=True is passed explicitly: the pattern is a regular expression, and pandas versions
# whose default is regex=False would otherwise match it literally and remove nothing.
# "." does not match a newline, so only the rest of the line containing "EDIT" is cut.
reddit_jokes["full_text"] = reddit_jokes["full_text"].str.replace("(EDIT).*", "", regex=True)

  reddit_jokes["full_text"] = reddit_jokes.full_text.str.replace("(EDIT).*","")


In [None]:
# Number of whitespace-separated tokens per joke; used further down as a length filter.
reddit_jokes['number_words'] = reddit_jokes['full_text'].str.split().str.len()

In [None]:
# How many jokes exist for each word count below 10.
short_joke_lengths = reddit_jokes.loc[reddit_jokes["number_words"] < 10, "number_words"]
short_joke_lengths.value_counts()

9.0    60168
8.0    58989
7.0    48309
6.0    33372
5.0    22697
4.0    16540
3.0    16486
2.0    11460
1.0        4
Name: number_words, dtype: int64

In [None]:
from typing import Tuple

MIN_SCORE: int = 10  # only jokes with a score strictly above this value are kept
WORDS_BETWEEN: Tuple[int, int] = (10, 120)  # inclusive interval of allowed number of words

has_min_score = reddit_jokes["score"] > MIN_SCORE
has_allowed_length = reddit_jokes["number_words"].between(*WORDS_BETWEEN)

# .copy() makes the filtered frame independent of `reddit_jokes`, so that columns can be
# added to it later without pandas emitting a SettingWithCopyWarning.
reddit_jokes_filtered = reddit_jokes[has_min_score & has_allowed_length].copy()

reddit_jokes_filtered["score"].describe()

# roughly 156_000 jokes are left (see `count` in the output)

count    155856.000000
mean        663.598341
std        3653.748666
min          11.000000
25%          17.000000
50%          34.000000
75%         113.000000
max      142733.000000
Name: score, dtype: float64

In [None]:
# The upvotes/scores have to be mapped onto an integer scale from 1 to 10.
# As the quantiles of the score show, the data is extremely skewed:
# a lot of jokes/entries have very low scores,
# and a few outliers have huge scores.
# The hand-picked thresholds below try to spread the data evenly over 1-10.

max_score: float = reddit_jokes_filtered["score"].max()

# Upper score bound per rating; the last rating reaches up to the maximum score.
thresholds = [13, 17, 23, 35, 70, 125, 200, 400, 700] + [max_score]
values = [rating for rating in range(1, 11)]  # [1, 2, 3, ..., 10]



def remap_score(score: float, max: float) -> int:
  """Map a raw score onto the rating scale given by ``values``.

  ``thresholds`` and ``values`` (module level) are paired position by position; the
  score gets the value of the first threshold that it does not exceed.

  Args:
    score: Raw score of a joke; has to be greater than ``MIN_SCORE``.
    max: Not used by the lookup (the upper bound is the last entry of ``thresholds``);
      kept so that existing calls keep working.

  Returns:
    The entry of ``values`` that belongs to the first threshold >= ``score``.

  Raises:
    ValueError: If ``score`` is not greater than ``MIN_SCORE``, or if it exceeds
      every threshold.
  """
  if score <= MIN_SCORE:
    raise ValueError("Score smaller than filtered out MIN_SCORE, this should not happen")
  for threshold, value in zip(thresholds, values):
    if score <= threshold:
      return value
  # This error used to be *returned*, which would silently put an exception object
  # into the result instead of failing.
  raise ValueError("bigger than max, we whould not end up in here")


# Attach the remapped rating as a new column. `assign` builds a new frame instead of
# writing into what pandas may regard as a slice of `reddit_jokes`, which avoids the
# SettingWithCopyWarning and keeps the cell re-runnable.
reddit_jokes_filtered = reddit_jokes_filtered.assign(
    score_remap=reddit_jokes_filtered["score"].map(lambda x: remap_score(x, max_score)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reddit_jokes_filtered["score_remap"] = reddit_jokes_filtered["score"].map(lambda x: remap_score(x, max_score))


In [None]:
# Number of jokes per rating, followed by the overall total.
rating_counts = reddit_jokes_filtered["score_remap"].value_counts()
print(f"{rating_counts.sort_index()} \n"
  f"sum: {rating_counts.sum()}")

1     22394
2     19477
3     18215
4     19848
5     24523
6     14630
7      9126
8      9744
9      5316
10    12583
Name: score_remap, dtype: int64 
sum: 155856


In [None]:
# The distribution seems fine, but the roughly 156_000 remaining jokes are more than
# is needed for a first training try, so SIZE_TRAIN + SIZE_VAL jokes are sampled randomly.
from datasets import Dataset

SIZE_TRAIN: int = 25_000
SIZE_VAL: int = 5_000
SEED: int = 42  # fixed seed so that every run draws the same subset

reddit_processed_df = (
    reddit_jokes_filtered[["full_text", "score_remap"]]
    .sample(SIZE_TRAIN + SIZE_VAL, random_state=SEED)
    .rename(columns={"full_text": "text", "score_remap": "label"})
    # the labels have to be floats: training raised a runtime error with integer labels
    .astype({"label": float})
)

# preserve_index=False keeps the pandas index from being stored as an extra
# `__index_level_0__` column in the dataset.
reddit_ds = Dataset.from_pandas(reddit_processed_df, preserve_index=False)

reddit_ds

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 30000
})

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch import nn

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"

# Head that is put on top of the pretrained BERT encoder:
# 128 input features -> 32 hidden units -> 1 output.
# The base model is a tiny variant of BERT with only 4.4M params and 2 heads,
# introduced in https://arxiv.org/abs/2110.01518

classification_head = nn.Sequential(
    nn.Linear(128, 32),
    nn.ReLU(),
    nn.Dropout(0.25),
    nn.Linear(32, 1),
)
classification_head = classification_head.to(DEVICE)

# e.g. "bert-base-cased"
BASE_MODEL_NAME: str = "prajjwal1/bert-tiny"


def load_bert_model(model_name: str = BASE_MODEL_NAME, device: str = DEVICE,
                    clas_head: nn.Sequential = classification_head) -> Tuple[
    AutoModelForSequenceClassification,AutoTokenizer]:
  """Load a pretrained model with a single output, attach the given head, and load its tokenizer.

  Args:
    model_name: Name of the pretrained checkpoint to load.
    device: Device the complete model (encoder and head) is moved to.
    clas_head: Module that replaces the model's ``classifier``. The default is the
      module-level ``classification_head`` instance, so repeated calls that rely on
      the default share the same head object.

  Returns:
    Tuple ``(model, tokenizer)``.
  """
  model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=1)
  # Use the head that was passed in; the previous code referred to names that are
  # not defined anywhere in the notebook and failed on a fresh kernel.
  model.classifier = clas_head
  # Move to the device only after the head is attached, so encoder and head end up together.
  model = model.to(device)

  tokenizer = AutoTokenizer.from_pretrained(model_name)
  return (model, tokenizer)


model, tokenizer = load_bert_model()


Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

In [None]:
# Display the classifier module that is attached to the model.
model.classifier

Sequential(
  (0): Linear(in_features=128, out_features=32, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.25, inplace=False)
  (3): Linear(in_features=32, out_features=1, bias=True)
)

In [None]:
def tokenize_function(examples):
  """Tokenize the "text" entries of a batch, padded to a fixed length and truncated.

  For the bert-tiny base model the length is fixed to 116 tokens; for any other
  base model no explicit length is passed to the tokenizer.
  """
  tokenizer_kwargs = {"padding": "max_length", "truncation": True}
  if BASE_MODEL_NAME == "prajjwal1/bert-tiny":
    tokenizer_kwargs["max_length"] = 116
  return tokenizer(examples["text"], **tokenizer_kwargs)


# Tokenize all sampled jokes, batch by batch.
tokenized_dataset = reddit_ds.map(tokenize_function, batched=True)

# The first SIZE_TRAIN rows form the training split, the next SIZE_VAL rows the validation split.
train_rows = range(SIZE_TRAIN)
val_rows = range(SIZE_TRAIN, SIZE_TRAIN + SIZE_VAL)
reddit_dataset_train = tokenized_dataset.select(train_rows)
reddit_dataset_val = tokenized_dataset.select(val_rows)

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [None]:
import numpy as np
import evaluate

# Load the "mae" metric via the `evaluate` library; used in compute_metrics_func.
metric = evaluate.load("mae")

def compute_metrics_func(eval_pred):
    """Compute the metric for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)`` of array-likes.

    Returns:
        Whatever ``metric.compute`` returns for the flattened predictions and references.
    """
    logits, labels = eval_pred
    # reshape(-1) instead of squeeze(): squeeze() would turn a single prediction into a
    # 0-d array, whereas reshape(-1) always yields a 1-d sequence.
    predictions = np.asarray(logits).reshape(-1)
    references = np.asarray(labels).reshape(-1)
    return metric.compute(predictions=predictions, references=references)

In [None]:
# training hyperparams:
# values not set can be looked at here:
# https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments

from transformers import TrainingArguments, Trainer
import torch

# In the logged run the validation loss was lowest after epoch 2 and rose afterwards while
# the training loss kept falling. A checkpoint is therefore written after every epoch and
# the one with the lowest validation loss is restored when training ends, so that later
# cells work with the best weights rather than those of the last epoch.
# save_strategy has to match evaluation_strategy for load_best_model_at_end.
training_args = TrainingArguments(output_dir="test_trainer",
                                  evaluation_strategy="epoch",
                                  logging_strategy="epoch",
                                  save_strategy="epoch",
                                  save_total_limit=2,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="eval_loss",
                                  greater_is_better=False,
                                  num_train_epochs=10)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=reddit_dataset_train,
    eval_dataset=reddit_dataset_val,
    compute_metrics=compute_metrics_func
)

In [None]:
# Run the training with the arguments configured above.
trainer.train()



Epoch,Training Loss,Validation Loss,Mae
1,7.7083,7.218498,2.198949
2,7.4185,7.151471,2.183524
3,7.1051,7.198524,2.201655
4,6.7233,7.507399,2.248331
5,6.2458,7.733501,2.272404
6,5.8928,8.106391,2.320309
7,5.5983,8.254775,2.340333
8,5.3611,8.308726,2.338639
9,5.2211,8.510023,2.368796
10,5.0309,8.642743,2.389112


TrainOutput(global_step=31250, training_loss=6.23051353125, metrics={'train_runtime': 560.1677, 'train_samples_per_second': 446.295, 'train_steps_per_second': 55.787, 'total_flos': 72640302000000.0, 'train_loss': 6.23051353125, 'epoch': 10.0})

In [None]:

def predict(input_text: str, max_length: int = 116) -> int:
  """Rate a text with the trained model on the integer scale 1-10.

  Args:
    input_text: Text to rate.
    max_length: Longer inputs are truncated to this many tokens. The default equals
      the length used for bert-tiny in ``tokenize_function``.

  Returns:
    The model output rounded to the nearest integer and clamped to the range 1..10.
  """
  # Switch dropout off so that the same text always gets the same rating.
  model.eval()
  inputs = tokenizer(input_text, return_tensors="pt",
                     truncation=True, max_length=max_length).to(DEVICE)
  with torch.no_grad():
    pred = model(**inputs).logits
  rating = round(pred.item())
  # Clamp to the rating scale.
  return min(10, max(1, rating))

# Quick manual check of the prediction function on an example sentence.
predict("wow what a funny joke this is")

5

In [None]:
# Finally, save the model config and weights (plus the tokenizer files) so that they
# can be used in the actual codebase.
# NOTE(review): `model.classifier` was replaced by a custom nn.Sequential head. A plain
# `from_pretrained` on this folder will presumably build the default classifier and not
# pick up the head weights - confirm that the loading code re-attaches the same head.

save_dir: str = "./data/bert_tiny/"

if not os.path.isdir(save_dir):
  os.makedirs(save_dir, exist_ok=True)
  print("created folder : ", save_dir)


# Use save_dir instead of repeating the path literal, so both always point to the same folder.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)