In [16]:
# Common
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Hugging Face
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader

# Scikit-learn
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


In [14]:
# Read the test split, discard every row that contains a missing value, then
# expose the `clean_text` inputs and the `label` targets as plain Python lists.
test_df = pd.read_csv("../data/splits/test.csv").dropna()
test_texts = test_df["clean_text"].to_list()
test_labels = test_df["label"].to_list()

In [15]:
# Read the raw test split used for DistilBERT, discard every row that contains
# a missing value, then expose the `text` inputs and the `label` targets as
# plain Python lists.
test_df_bert = pd.read_csv("../data/splits/raw/test.csv").dropna()
test_texts_bert = test_df_bert["text"].to_list()
test_labels_bert = test_df_bert["label"].to_list()

## Logistic Regression

In [17]:
# Location of the serialized logistic-regression model
# (presumably a fitted scikit-learn Pipeline -- confirm against the training notebook).
model_path = "../models/logistic_model.pkl"

# NOTE(security): joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only load model files that come from a trusted source.
logistic_pipeline = joblib.load(model_path)

# Inference is intentionally not run here: the "Predictions" section below
# calls `logistic_pipeline.predict(test_texts)`, so the test set is scored
# once instead of twice.

## Fine-tuned DistilBERT

In [18]:
# Checkpoint name handed to `from_pretrained` for both tokenizer and model.
model_checkpoint = "cornualghost/tm4scam-distilbert"

# Run on the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the tokenizer and the sequence-classification model from the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
bert_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

# Move the model to the selected device; as the last expression of the cell
# this also displays the module's repr.
bert_model.to(device)


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


## Predictions

In [19]:
# Predict a label for every entry of `test_texts` (the `clean_text` column of
# the test split) with the loaded logistic-regression pipeline.
logistic_preds = logistic_pipeline.predict(test_texts)

In [None]:
# Score the DistilBERT test texts in mini-batches.
#
# Tokenizing the whole split at once pads every example to the longest text in
# the dataset and pushes a single huge tensor through one forward pass, so
# memory use grows with the dataset size. Working batch by batch pads only to
# the longest text in each batch and keeps peak memory bounded.
batch_size = 32  # number of texts per forward pass; lower it if memory is tight

# Make sure dropout is disabled during inference.
bert_model.eval()

logit_chunks = []
with torch.no_grad():  # no gradients are needed for prediction
    for start in range(0, len(test_texts_bert), batch_size):
        batch_texts = test_texts_bert[start:start + batch_size]

        # Tokenize
        test_encodings = tokenizer(batch_texts, truncation=True, padding=True, return_tensors="pt")
        test_encodings = {key: val.to(device) for key, val in test_encodings.items()}

        # Inference -- move the logits to the CPU right away so device memory
        # is released before the next batch.
        outputs = bert_model(**test_encodings)
        logit_chunks.append(outputs.logits.cpu())

# `logits` holds one row per test text (on the CPU); an empty test set yields
# an empty (0, num_labels) tensor instead of an error from `torch.cat`.
if logit_chunks:
    logits = torch.cat(logit_chunks, dim=0)
else:
    logits = torch.empty((0, bert_model.config.num_labels))

# Predicted class index per text = position of the largest logit.
bert_preds = torch.argmax(logits, dim=-1).numpy()
