In [1]:
!pip install -q transformers datasets sentencepiece scikit-learn torch torchvision torchaudio evaluate accelerate


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import json
import os
import random
import numpy as np
import pandas as pd
from collections import Counter
from typing import List
import torch

SEED = 42
random.seed(SEED)
np.random.seed(SEED)


In [3]:
from google.colab import files

print("Upload the training and test JSON files when prompted.")
uploaded = files.upload()


Upload the training and test JSON files when prompted.


Saving z639_assignment1_test.json to z639_assignment1_test.json
Saving z639_assignment1_training.json to z639_assignment1_training.json


In [4]:
TRAIN_FILE = "z639_assignment1_training.json"
TEST_FILE = "z639_assignment1_test.json"

def load_jsonl(path, has_labels=True):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                rows.append(json.loads(line))
    df = pd.DataFrame(rows)
    if has_labels:
        def majority_label(ct):
            labels = [c[0] for c in ct]
            cnt = Counter(labels)
            if cnt[True] == cnt[False]:
                return True
            return cnt[True] > cnt[False]
        df["label"] = df["composite_toxic"].apply(majority_label)
    else:
        df["label"] = None
    return df

train_df = load_jsonl(TRAIN_FILE, True)
test_df = load_jsonl(TEST_FILE, False)

print("Train size:", len(train_df))
print("Test size:", len(test_df))
train_df.head()


Train size: 4000
Test size: 500


Unnamed: 0,text,parent_comment,article_title,article_url,platform,platform_id,composite_toxic,label
0,"WTF, y'all never made MRE fart balloons in the...",,Triangular UFO hovers over California military...,https://www.dailymail.co.uk/news/article-12112...,reddit,jlcm021,"[[False, 74], [True, 323], [False, 1028], [Fal...",False
1,No apologies !! McCall has balls ! Ccp is not...,,China sentences elderly US citizen to life in ...,https://www.cnn.com/2023/05/15/china/china-jai...,youtube,Ugws8gNW7eJyE9VHeM14AaABAg,"[[False, 216], [False, 197], [False, 1039], [F...",False
2,What ever you need to tell yourself to sleep a...,I wonder how many undercover agents will be go...,Jan. 6 defendant who put foot on desk in Pelos...,https://www.cbsnews.com/news/richard-barnett-j...,youtube,UgxHlqwNcVssLHUr4yF4AaABAg.9q7kOunSlu-9q7lHH4he6S,"[[True, 192], [True, 193], [True, 260], [True,...",True
3,@exZACKly @CBSNews Fuck off Nazi,@NCmylo @CBSNews Lol. Stop choosing to be an ...,19-year-old Missouri man arrested in U-Haul cr...,https://www.cbsnews.com/news/u-haul-crash-lafa...,twitter,1661025155047637000,"[[True, 92], [False, 218], [True, 69], [True, ...",True
4,Texas is a republican sponsored killing ground...,,At Least 8 Killed After Driver Plows Car Into ...,https://www.nytimes.com/2023/05/07/us/car-pede...,youtube,UgwpAfn9RIV0cHfhp4R4AaABAg,"[[False, 56], [True, 207], [False, 218], [Fals...",False


In [5]:
def compose_text(row):
    parts = []
    if row.get("article_title"):
        parts.append(row["article_title"])
    if row.get("parent_comment"):
        parts.append(row["parent_comment"])
    if row.get("text"):
        parts.append(row["text"])
    return " [SEP] ".join([p for p in parts if p])

train_df["full_text"] = train_df.apply(compose_text, axis=1)
test_df["full_text"] = test_df.apply(compose_text, axis=1)

print("Sample text example:")
print(train_df.iloc[0]["full_text"])


Sample text example:
Triangular UFO hovers over California military base in new footage [SEP] WTF, y'all never made MRE fart balloons in the stumps?

Fucking kids these days.


In [6]:
from sklearn.model_selection import train_test_split

train_df_split, val_df = train_test_split(
    train_df, test_size=0.15, random_state=SEED, stratify=train_df["label"]
)

print("Train split:", len(train_df_split))
print("Validation split:", len(val_df))
print(train_df["label"].value_counts())


Train split: 3400
Validation split: 600
label
False    2974
True     1026
Name: count, dtype: int64


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

tfidf_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=20000, ngram_range=(1,2), min_df=2)),
    ("clf", LogisticRegression(solver="liblinear", class_weight="balanced", max_iter=1000, random_state=SEED))
])

print("Training TF-IDF + Logistic Regression...")
tfidf_pipe.fit(train_df_split["full_text"], train_df_split["label"])

def eval_model(model, X, y):
    preds = model.predict(X)
    return {
        "accuracy": accuracy_score(y, preds),
        "f1": f1_score(y, preds),
        "precision": precision_score(y, preds),
        "recall": recall_score(y, preds)
    }

tfidf_metrics = eval_model(tfidf_pipe, val_df["full_text"], val_df["label"])
print("TF-IDF Validation Metrics:", tfidf_metrics)


Training TF-IDF + Logistic Regression...
TF-IDF Validation Metrics: {'accuracy': 0.775, 'f1': 0.5573770491803278, 'precision': 0.5629139072847682, 'recall': 0.551948051948052}


In [8]:

!pip install -q datasets pandas

import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split


df = pd.read_json("z639_assignment1_training.json", lines=True)


def get_majority_vote(composite_toxic):
    true_count = sum(1 for label, _ in composite_toxic if label)
    return 1 if true_count > len(composite_toxic) / 2 else 0

df["label"] = df["composite_toxic"].apply(get_majority_vote)


train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])


train_dataset = Dataset.from_pandas(train_df[["text", "label"]])
val_dataset = Dataset.from_pandas(val_df[["text", "label"]])

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset
})

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 3200
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 800
    })
})


In [9]:

import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"


In [10]:

!pip install -q transformers datasets torch scikit-learn

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import torch

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=128)

train_dataset_tokenized = dataset["train"].map(tokenize, batched=True)
val_dataset_tokenized = dataset["validation"].map(tokenize, batched=True)

train_dataset_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=100,
    seed=42,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=val_dataset_tokenized,
    compute_metrics=compute_metrics,
)

print("🚀 Training DistilBERT on toxic comments...")
trainer.train()

print("\n✅ Evaluating model on validation set...")
metrics = trainer.evaluate()
print("Validation Metrics:", metrics)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


🚀 Training DistilBERT on toxic comments...


Step,Training Loss
100,0.5714
200,0.4923
300,0.4824
400,0.4987
500,0.3339
600,0.3205
700,0.3306
800,0.2995



✅ Evaluating model on validation set...


Validation Metrics: {'eval_loss': 0.5706495046615601, 'eval_accuracy': 0.8025, 'eval_f1': 0.5885416666666666, 'eval_runtime': 2.7776, 'eval_samples_per_second': 288.021, 'eval_steps_per_second': 18.001, 'epoch': 2.0}


In [11]:

import torch
import pandas as pd
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

test_df = pd.read_json("z639_assignment1_test.json", lines=True)

encodings = tokenizer(
    test_df["text"].tolist(),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt"
)

encodings = {k: v.to(device) for k, v in encodings.items()}

model.eval()
predictions = []
with torch.no_grad():
    for i in tqdm(range(0, len(test_df), 32)):
        batch = {k: v[i:i+32] for k, v in encodings.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.cpu().numpy())

test_df["prediction"] = [True if p == 1 else False for p in predictions]
submission = test_df[["platform_id", "prediction"]]

filename = "krisha_patel-assignment1-prediction.csv"   # change name if needed
submission.to_csv(filename, index=False)

print(f"✅ Saved submission file: {filename}")
print(submission.head())



100%|██████████| 16/16 [00:01<00:00, 10.65it/s]

✅ Saved submission file: krisha_patel-assignment1-prediction.csv
                                         platform_id  prediction
0  UgxjV6HRpnD6FUmw8aV4AaABAg.9pH-CgX5yEH9pH7BMIfAz5       False
1                                1657052099564150784       False
2                                1662672469205958656       False
3                                1656508255454019587       False
4                                            jk1pm1m       False





In [12]:


import pandas as pd

test_df = pd.read_json("z639_assignment1_test.json", lines=True)

submission = pd.read_csv("krisha_patel-assignment1-prediction.csv")

submission = submission[["platform_id", "prediction"]]

submission["prediction"] = submission["prediction"].apply(lambda x: "true" if x else "false")

final_filename = "krisha_patel-assignment1-prediction.csv"
submission.to_csv(final_filename, index=False)

print(f"✅ Final submission file saved as: {final_filename}")
print(submission.head())



✅ Final submission file saved as: krisha_patel-assignment1-prediction.csv
                                         platform_id prediction
0  UgxjV6HRpnD6FUmw8aV4AaABAg.9pH-CgX5yEH9pH7BMIfAz5      false
1                                1657052099564150784      false
2                                1662672469205958656      false
3                                1656508255454019587      false
4                                            jk1pm1m      false


In [13]:
import joblib
joblib.dump(tfidf_pipe, f"krisha_patel_tfidf_model.joblib")
print("TF-IDF model saved as:", f"krisha_patel_tfidf_model.joblib")
files.download(f"krisha_patel_tfidf_model.joblib")


TF-IDF model saved as: krisha_patel_tfidf_model.joblib


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>