# Import Dependencies

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import datetime
import time
import torch

from tqdm import tqdm
from collections import Counter
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Load Data

In [5]:
# Display settings: never truncate long cell text and never elide rows when
# a DataFrame is rendered in this notebook.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Read the training CSV and keep only the target column and the raw text.
df = pd.read_csv("train_E6oV3lV.csv").loc[:, ["label", "tweet"]]
df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,0,bihday your majesty
3,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,0,factsguide: society now #motivation


# Train / Validation Split

In [6]:
# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class proportions the same in both parts; the fixed seed makes the split
# repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Convert to Hugging Face Dataset

In [7]:
# Wrap each pandas split in a Hugging Face Dataset. The index is reset first
# so the shuffled row labels left over from the split are discarded.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_df, val_df)
)

# Load Tokenizer

In [8]:
# Tokenizer matching the "distilroberta-base" checkpoint used for the model.
tokenizer = AutoTokenizer.from_pretrained(
    "distilroberta-base",
)

# Tokenization Function

In [9]:
def tokenize_function(df):
    """Tokenize the ``tweet`` field of ``df`` with the notebook-level tokenizer.

    Sequences longer than 128 tokens are truncated and shorter ones are
    padded, so every encoded sequence has length 128.

    Args:
        df: Object exposing the raw texts under the ``'tweet'`` key. In this
            notebook it is called through ``Dataset.map(..., batched=True)``.

    Returns:
        The value returned by calling ``tokenizer`` on the texts.
    """
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
    }
    return tokenizer(df['tweet'], **encoding_options)

# Tokenize Datasets

In [10]:
# Tokenize each split in batches, then drop the raw text column, which is no
# longer needed once the token ids exist.
train_dataset = train_dataset.map(tokenize_function, batched=True).remove_columns(['tweet'])
val_dataset = val_dataset.map(tokenize_function, batched=True).remove_columns(['tweet'])

# Have both datasets hand back PyTorch tensors when indexed.
val_dataset.set_format('torch')
train_dataset.set_format('torch')

Map: 100%|██████████████████████████████████████████████████████████████| 25569/25569 [00:02<00:00, 9054.52 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████| 6393/6393 [00:04<00:00, 1320.60 examples/s]


# Load Model

In [11]:
# Pretrained encoder plus a new two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained("distilroberta-base", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Evaluation Metrics

In [12]:
def compute_metrics(eval_pred):
    """Compute classification metrics for the Trainer's evaluation loop.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class for each
            example is the argmax of ``logits`` along axis 1.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Precision, recall and F1 are computed with
        ``average="binary"``, i.e. for the positive class only.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    # zero_division=0 reports 0.0 when a score is undefined (for example,
    # the model predicted no positives at all) instead of emitting an
    # UndefinedMetricWarning on every such evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    acc = accuracy_score(labels, preds)

    return {
        "accuracy" : acc,
        "precision" : precision,
        "recall" : recall,
        "f1" : f1
    }

# Training Arguments

In [13]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",

    # Optimisation hyper-parameters.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the best "f1" when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)

# Trainer

In [14]:
# Bundle the model, its training configuration, both data splits, the
# tokenizer and the metric callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the Model

In [15]:
# Run fine-tuning with the Trainer built above. The call's return value is
# the cell's last expression, so it is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0857,0.082251,0.977945,0.913747,0.756696,0.827839
2,0.0458,0.085128,0.980135,0.886747,0.821429,0.852839
3,0.0337,0.090002,0.981386,0.904177,0.821429,0.860819


TrainOutput(global_step=4797, training_loss=0.07530913592528024, metrics={'train_runtime': 3100.1568, 'train_samples_per_second': 24.743, 'train_steps_per_second': 1.547, 'total_flos': 2540294187176448.0, 'train_loss': 0.07530913592528024, 'epoch': 3.0})

# Evaluate

In [16]:
# Score the trainer's current model on the evaluation split (val_dataset)
# using compute_metrics; the resulting metrics dict is shown as cell output.
trainer.evaluate()

{'eval_loss': 0.090001679956913,
 'eval_accuracy': 0.9813858908180823,
 'eval_precision': 0.9041769041769042,
 'eval_recall': 0.8214285714285714,
 'eval_f1': 0.8608187134502924,
 'eval_runtime': 82.9897,
 'eval_samples_per_second': 77.034,
 'eval_steps_per_second': 4.82,
 'epoch': 3.0}

# Save the Model

In [19]:
# Save the fine-tuned weights and the tokenizer into the same directory so
# the folder can be reloaded for inference on its own with `from_pretrained`
# (weights alone are not enough to preprocess new text).
MODEL_DIR = 'model_roberta'
model.save_pretrained(MODEL_DIR)
# Trailing semicolon hides the tuple of saved file paths from the cell output.
tokenizer.save_pretrained(MODEL_DIR);