## Sentence Transformer

In [14]:
import pandas as pd

# Location of the classification dataset, relative to the notebook's working directory.
# Kept in one named constant so the path can be changed without hunting through cells.
DATASET_PATH = 'Project1-ClassificationDataset.csv'

data = pd.read_csv(DATASET_PATH)

In [15]:
# Plain Python list of the 'full_text' column, in row order.
full_text = data['full_text'].tolist()

In [16]:
import torch
from transformers import AutoTokenizer, AutoModel

# Checkpoint and batching configuration for the sentence encoder.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
MAX_LEN = 128     # passed as max_length to the tokenizer; longer inputs are truncated
BATCH_SIZE = 64   # number of texts per forward pass in encode_texts

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tok = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the encoder, move it to the chosen device, then switch it to eval mode.
enc = AutoModel.from_pretrained(MODEL_NAME)
enc = enc.to(device)
enc.eval()

@torch.no_grad()
def encode_texts(texts, batch_size=None, max_length=None):
    """
    Embed texts with the module-level tokenizer `tok` and encoder `enc`,
    mean-pooling the last hidden states over non-padding tokens.

    texts: a single str, or any iterable of str (list, tuple, pandas Series, ...)
    batch_size: texts per forward pass; defaults to the global BATCH_SIZE
    max_length: tokenizer truncation limit; defaults to the global MAX_LEN
    returns: torch.Tensor [N, hidden_dim] on CPU
             ([0, hidden_dim] when `texts` is empty)
    raises: ValueError if batch_size is smaller than 1
    """
    # A bare string would otherwise be cut into substrings by the slicing below.
    if isinstance(texts, str):
        texts = [texts]
    else:
        # Materialise so that slicing is positional for any iterable.
        texts = list(texts)

    # Resolve defaults at call time so later changes to the globals are honoured.
    if batch_size is None:
        batch_size = BATCH_SIZE
    if max_length is None:
        max_length = MAX_LEN
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if not texts:
        # torch.cat([]) raises, so return an explicitly empty matrix instead.
        return torch.empty((0, enc.config.hidden_size))

    all_vecs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tok(
            batch,
            truncation=True,
            max_length=max_length,
            padding=True,
            return_tensors="pt"
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        out = enc(**inputs)  # out.last_hidden_state: [B, L, H]

        # Mean pooling: zero out padding positions, then divide by the number
        # of real tokens (clamped so an all-padding row cannot divide by zero).
        mask = inputs["attention_mask"].unsqueeze(-1).type_as(out.last_hidden_state)
        pooled = (out.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-6)

        # Move each batch to the CPU right away so results do not pile up on the GPU.
        all_vecs.append(pooled.cpu())

    return torch.cat(all_vecs, dim=0)

# Smoke test: embed two short strings and report the shape of the result.
demo_texts = ["hello world", "UCLA ECE 219"]
vecs = encode_texts(demo_texts)
print(vecs.shape)  # (2, hidden_dim)

# To embed the whole corpus instead: encode_texts(full_text)

torch.Size([2, 384])


In [17]:
from datasets import load_dataset

# PAWS sentence pairs, "labeled_final" configuration.
# NOTE(review): the original note suggested `huggingface-cli login` for access;
# confirm whether this dataset actually requires authentication.
ds = load_dataset("paws", name="labeled_final")

In [18]:
# Index the row first: ds['train'][0] fetches a single example, whereas
# ds['train']['sentence1'][0] may materialise the whole column to read one value.
first_pair = ds['train'][0]
sent1 = first_pair['sentence1']
sent2 = first_pair['sentence2']

# encode_texts returns one row per input text; unpacking along the first
# axis yields one embedding vector per sentence.
u, v = encode_texts([sent1, sent2])

In [19]:
# Pair features: [u, v, |u - v|, cos(u, v)] concatenated along dim 0, then given a leading batch axis.
# The cosine similarity over dim 0 is a scalar, so it is reshaped to shape [1] before torch.cat.
cos_sim = torch.cosine_similarity(u, v, dim=0, eps=1e-8).reshape(1)
final_vec = torch.cat((u, v, (u - v).abs(), cos_sim), dim=0)[None, :]
final_vec.shape, cos_sim.shape, cos_sim

(torch.Size([1, 1153]), torch.Size([1]), tensor([0.9926]))

A 1153-dimensional vector (384 + 384 + 384 + 1: $u$, $v$, $|u - v|$, and their cosine similarity) represents the sentence pair, which has a single label (0 or 1) assigned to it.

## DistilBERT







In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

MODEL = "distilbert-base-uncased"
MAX_LEN = 128

ds = load_dataset("paws", "labeled_final")

# Use dedicated names (clf_tok / clf_model) so this cell does not overwrite the
# sentence-encoder tokenizer `tok` or the Qwen `model`, which other cells'
# functions (encode_texts, generate) look up at call time.
clf_tok = AutoTokenizer.from_pretrained(MODEL)

def tok_fn(batch):
    """Tokenize a batch of rows as (sentence1, sentence2) pairs.

    Inputs are truncated and padded to MAX_LEN tokens, so every example has
    the same length.
    """
    return clf_tok(batch["sentence1"], batch["sentence2"],
                   truncation=True, max_length=MAX_LEN, padding="max_length")

# Fixed-seed subsamples keep the run short: 10k training pairs, 2k validation pairs.
train = ds["train"].shuffle(seed=42).select(range(10000)).map(tok_fn, batched=True)
val   = ds["validation"].shuffle(seed=42).select(range(2000)).map(tok_fn, batched=True)

clf_model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)

args = TrainingArguments(
    output_dir="tmp_ckpt",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    eval_strategy="epoch",
    save_strategy="no",
    report_to="none",
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
trainer = Trainer(
    model=clf_model,
    args=args,
    train_dataset=train,
    eval_dataset=val,
    processing_class=clf_tok,
)
trainer.train()

# TODO: tokenize a held-out split with tok_fn (as done for train/val) and
# pass it to trainer.predict(...) to obtain test predictions.

## Qwen3-4B

In [2]:
import os

# Restrict this process to GPU index 0.
# NOTE(review): this normally has to be set before CUDA is first initialised in
# the process; confirm this cell runs before any cell that touches the GPU.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [3]:
# Qwen starter: prompt -> generated text (deterministic) with generation tokens added
# !pip -q install -U transformers accelerate

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_NAME = "Qwen/Qwen3-4B-Instruct-2507"  # Colab-free friendly
MAX_NEW_TOKENS = 32  # upper bound on generated tokens per call

# trust_remote_code is intentionally not passed: it allows Python code shipped in
# the model repository to run locally, and is not needed for architectures that
# transformers implements itself.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # `dtype` replaces the deprecated `torch_dtype` argument.
    # Half precision on GPU, full precision on CPU.
    dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)
model.eval()

# Greedy decoding so repeated calls with the same prompt give the same text.
GEN_KWARGS = dict(
    do_sample=False,   # deterministic
    num_beams=1,
    max_new_tokens=MAX_NEW_TOKENS
)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
# Qwen uses special tokens to wrap prompts/outputs: <|im_start|> and <|im_end|>
def add_generation_tokens(prompt: str) -> str:
    """Wrap a raw prompt in the <|im_start|>/<|im_end|> chat markers.

    The prompt is stripped of surrounding whitespace, placed inside a `user`
    turn, and followed by an opened `assistant` turn for the model to complete.
    """
    user_turn = "<|im_start|>user\n" + prompt.strip() + "<|im_end|>\n"
    assistant_header = "<|im_start|>assistant\n"
    return user_turn + assistant_header

def generate(prompt: str) -> str:
    """Run the module-level `model` on one prompt and return only its reply.

    The prompt is wrapped with add_generation_tokens, tokenized with the
    module-level `tokenizer`, and decoded with the settings in GEN_KWARGS.
    Returns the generated text with special tokens removed and surrounding
    whitespace stripped.
    """
    chat_text = add_generation_tokens(prompt)
    encoded = tokenizer(chat_text, return_tensors="pt").to(model.device)
    prompt_len = encoded.input_ids.shape[1]

    with torch.no_grad():
        sequences = model.generate(**encoded, **GEN_KWARGS)

    # The returned sequence begins with the prompt tokens; keep only what follows.
    new_token_ids = sequences[0][prompt_len:]
    reply = tokenizer.decode(new_token_ids, skip_special_tokens=True)
    return reply.strip()

In [12]:
# Sanity check: a trivial yes/no question should produce a one-word reply.
prompt = "Answer with one word (yes/no): Is 'cat' an animal?\nAnswer:"
reply = generate(prompt)
print(reply)

yes


In [28]:
def make_prompt(sent1, sent2):
    """Build the open-ended paraphrase question for a pair of sentences.

    Returns a four-line prompt: the instruction, the two sentences, and a
    trailing "Answer:" cue.
    """
    # The four leading spaces on the last three lines are part of the prompt text.
    lines = [
        "I have two sentences, and I want to know if they are paraphrases of each other.",
        f"    Sentence 1: {sent1}",
        f"    Sentence 2: {sent2}",
        "    Answer:",
    ]
    return "\n".join(lines)

def get_answer(sent1, sent2):
    """Ask the model whether the two sentences are paraphrases.

    Builds the prompt with make_prompt and returns whatever text generate
    produces for it.
    """
    return generate(make_prompt(sent1, sent2))

In [29]:
ds['train'][0]

{'id': 1,
 'sentence1': 'In Paris , in October 1560 , he secretly met the English ambassador , Nicolas Throckmorton , asking him for a passport to return to England through Scotland .',
 'sentence2': 'In October 1560 , he secretly met with the English ambassador , Nicolas Throckmorton , in Paris , and asked him for a passport to return to Scotland through England .',
 'label': 0}

In [30]:
# Read both sentences and the label from the same row rather than relying on
# sent1/sent2 left over from an earlier cell.
row = ds['train'][0]
get_answer(row['sentence1'], row['sentence2']), row['label']

('No.\n\nExplanation:  \nWhile the two sentences are similar in structure and content, they are **not** paraphrases of each other because they convey **different information',
 0)

In [33]:
index = 11
# Fetch the row once; column-first access (ds['train']['sentence1'][index])
# may materialise a whole column just to read one value.
row = ds['train'][index]
get_answer(row['sentence1'], row['sentence2']), row['label']

('Yes.\n\nThe two sentences are paraphrases of each other. They convey the same meaning with only minor differences in capitalization and word choice ("Metaphysical Literature',
 1)

In [34]:
def make_prompt(sent1, sent2):
    """Build the one-word (yes/no) paraphrase question for a pair of sentences.

    Same layout as the open-ended prompt, with an added instruction to answer
    in a single word.
    """
    instruction = (
        "I have two sentences, and I want to know if they are paraphrases of each other."
        " Please answer with one word (yes/no)."
    )
    # The four leading spaces before each field are part of the prompt text.
    fields = f"\n    Sentence 1: {sent1}\n    Sentence 2: {sent2}\n    Answer:"
    return instruction + fields

index = 11
# Fetch the row once; column-first access (ds['train']['sentence1'][index])
# may materialise a whole column just to read one value.
row = ds['train'][index]
get_answer(row['sentence1'], row['sentence2']), row['label']

('yes', 1)

Notice the effect of the prompt on the answer. Asking the model to answer with one word (yes/no) makes it more likely to do so, but then it doesn't get to explain its answer. Try different prompts and see what works best for our case.