In [None]:
!nvidia-smi

!pip install -q --upgrade \
  "transformers>=4.44.0" \
  "accelerate>=0.34.0" \
  "peft>=0.11.0" \
  "trl>=0.9.6" \
  datasets einops scipy evaluate huggingface_hub

import torch
print("Torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())

Mon Oct 27 21:01:44 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# ⬇️ Paste the EXACT system prompt you used in VS Code
# One sentence per entry; entries are joined with single spaces, which yields
# exactly the same text as one long concatenated literal.
SYSTEM_PROMPT = " ".join((
    "Task: Assign exactly one numeric label for the entire speech: -1 (negative), 0 (neutral), or 1 (positive).",
    "Evaluate sentiment toward any salient target (policies, parties, actors, outcomes).",
    "Do not restrict to a predefined topic.",
    "Output only the numeric label (no extra text).",
    "Decision rules: Evidence scan — check the opening, repeated points in the body, and conclusion for explicit evaluative statements (approval/support vs criticism/opposition).",
    "Dominance — decide by Coverage → Emphasis → Strength.",
    "If evaluatives exist and it’s tied, choose the side with stronger evaluatives.",
    "Neutral is valid: assign 0 only when no explicit evaluative statements are found after the scan.",
    "Do not infer sentiment from purely ceremonial, honorific, or procedural content.",
    "Return exactly one numeric label: -1, 0, or 1.",
))
print(SYSTEM_PROMPT)

Task: Assign exactly one numeric label for the entire speech: -1 (negative), 0 (neutral), or 1 (positive). Evaluate sentiment toward any salient target (policies, parties, actors, outcomes). Do not restrict to a predefined topic. Output only the numeric label (no extra text). Decision rules: Evidence scan — check the opening, repeated points in the body, and conclusion for explicit evaluative statements (approval/support vs criticism/opposition). Dominance — decide by Coverage → Emphasis → Strength. If evaluatives exist and it’s tied, choose the side with stronger evaluatives. Neutral is valid: assign 0 only when no explicit evaluative statements are found after the scan. Do not infer sentiment from purely ceremonial, honorific, or procedural content. Return exactly one numeric label: -1, 0, or 1.


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Everything below (model placement, generation, training) expects a CUDA device;
# stop immediately with a Colab-specific hint when none is attached.
assert torch.cuda.is_available(), "No GPU detected. Runtime → Change runtime type → GPU."

# Base model to load for both training & evaluation
base_model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# FP16 on T4 (sm75). Use BF16 if on A100/L4
dtype = torch.float16

# --- Load tokenizer and record the intended length / padding settings ---
# NOTE(review): these attributes did not cap the prompts that the evaluation cells
# build with apply_chat_template(): the baseline run logged a 2412-token sequence
# against this 512 limit. Treat them as tokenizer defaults, not enforced truncation.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=True)
tokenizer.model_max_length = 512           #  maximum length the tokenizer advertises
tokenizer.padding_side = "right"           #  pad at the end of the sequence
tokenizer.truncation_side = "right"        #  when truncation is applied, drop tokens from the end

# --- Load base Qwen model ---
# Weights are loaded in the half-precision dtype chosen above; device placement is
# delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=dtype,
    low_cpu_mem_usage=True,
)

# Memory + speed configurations (training-oriented): turn off the KV cache flag in
# the config and enable gradient checkpointing when the model exposes it.
model.config.use_cache = False
if hasattr(model, "gradient_checkpointing_enable"):
    model.gradient_checkpointing_enable()

# Allow TF32 kernels for matmul and cuDNN.
# NOTE(review): presumably only has an effect on GPUs with TF32 support; confirm for the T4.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

print("Loaded:", base_model_name, "| GPU:", torch.cuda.get_device_name(0))
print("Tokenizer configured → model_max_length=512 (right-truncation, right-padding)")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Loaded: Qwen/Qwen2.5-1.5B-Instruct | GPU: Tesla T4
Tokenizer configured → model_max_length=512 (right-truncation, right-padding)


In [None]:
#  Mount Drive first
from google.colab import drive
drive.mount('/content/drive')  # add force_remount=True if needed

import os, pandas as pd

#  Drive locations of the merged/split CSVs
train_csv_path = "/content/drive/MyDrive/Congress Data/train_subset_800.csv"
valid_csv_path = "/content/drive/MyDrive/Congress Data/validation_subset_301.csv"

#  Stop early when either file is missing (train is checked first)
for csv_path in (train_csv_path, valid_csv_path):
    assert os.path.exists(csv_path), f"Not found: {csv_path}"

#  Load CSVs
train_df, valid_df = pd.read_csv(train_csv_path), pd.read_csv(valid_csv_path)

#  Both splits must provide the text column and the label column
required = {"speech", "LABEL"}
for split_name, frame in (("Train", train_df), ("Valid", valid_df)):
    assert required.issubset(frame.columns), f"{split_name} missing columns: {required - set(frame.columns)}"

#  Labels are stored as plain integers from here on
for frame in (train_df, valid_df):
    frame["LABEL"] = frame["LABEL"].astype(int)

#  Only the three sentiment classes are accepted
allowed = {-1, 0, 1}
bad_train = set(train_df["LABEL"].unique()) - allowed
bad_valid = set(valid_df["LABEL"].unique()) - allowed
assert not (bad_train or bad_valid), f"Labels must be in {-1,0,1}. Got train {bad_train}, valid {bad_valid}"

#  Summary: shapes and per-class counts of both splits
print(" Successfully loaded data from Google Drive!")
print("Train shape:", train_df.shape, "| Valid shape:", valid_df.shape)
print("Train label counts:\n", train_df["LABEL"].value_counts().sort_index())
print("Valid label counts:\n", valid_df["LABEL"].value_counts().sort_index())

train_df.head(3)

Mounted at /content/drive
✅ Successfully loaded data from Google Drive!
Train shape: (800, 23) | Valid shape: (301, 23)
Train label counts:
 LABEL
-1    284
 0     63
 1    453
Name: count, dtype: int64
Valid label counts:
 LABEL
-1     62
 0     60
 1    179
Name: count, dtype: int64


Unnamed: 0,speech_id,speech,chamber,date,number_within_file,speaker,first_name,last_name,state,gender,...,char_count,word_count,speakerid,party,district,nonvoting,congress,topic,speaker_age,LABEL
0,1120043300,I ask unanimous consent that when the Senate c...,S,2011-07-12,458.0,BILL NELSON,Unknown,NELSON,FL,M,...,745.0,130.0,112118211.0,D,,voting,112.0,budget,,0
1,1100134818,I want to thank Chairman MILLER and Chairman H...,H,2008-02-07,454.0,TIMOTHY BISHOP,Unknown,BISHOP,NY,M,...,2082.0,349.0,110117510.0,D,1.0,voting,110.0,business,,1
2,1090063525,Mr. President. I strongly support the Foundati...,S,2005-07-27,925.0,EDWARD KENNEDY,Unknown,KENNEDY,MA,M,...,2142.0,345.0,109114941.0,D,,voting,109.0,health,,1


In [None]:
import pandas as pd
import numpy as np

# Make sure text is string, no NaNs.
# Missing values are filled *before* the cast: casting first turns NaN into the
# literal text "nan", and blanking that sentinel afterwards would also erase a
# speech whose real text is "nan" while leaving other missing markers
# (e.g. None -> "None") in place.
train_df["speech"] = train_df["speech"].fillna("").astype(str)
valid_df["speech"] = valid_df["speech"].fillna("").astype(str)

In [None]:
from datasets import Dataset, DatasetDict

def to_messages(ex):
    """Turn one dataset row into a three-turn chat sample.

    The system turn carries ``SYSTEM_PROMPT``, the user turn the row's
    ``speech`` text, and the assistant turn the row's ``LABEL`` rendered as an
    integer string. The turns are returned under the ``"messages"`` key.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": str(ex["speech"])}
    assistant_turn = {"role": "assistant", "content": str(int(ex["LABEL"]))}
    return {"messages": [system_turn, user_turn, assistant_turn]}

# Add the chat-formatted "messages" column to each split, then bundle both splits.
train_ds = Dataset.from_pandas(train_df).map(to_messages)
valid_ds = Dataset.from_pandas(valid_df).map(to_messages)
ds = DatasetDict({"train": train_ds, "validation": valid_ds})

print("Train/Val sizes:", len(train_ds), len(valid_ds))

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/301 [00:00<?, ? examples/s]

Train/Val sizes: 800 301


In [None]:
# Base-model-only prediction helper (no LoRA attach)
import re, math, torch

def _to_str(x):
    if x is None: return ""
    if isinstance(x, float) and math.isnan(x): return ""
    return str(x)

def predict_label(speech, system_prompt: str) -> str:
    """
    Run the CURRENT module-level ``model`` on one speech and return its answer as text.

    No adapter is loaded here; the function uses whatever ``model`` and
    ``tokenizer`` currently refer to.

    Args:
        speech: Speech text; ``None``/NaN are treated as an empty string.
        system_prompt: Instruction text sent as the system turn.

    Returns:
        '-1', '0', or '1' when such a label appears in the generated text;
        otherwise the first whitespace-delimited token of the output, or '0'
        when nothing was generated.
    """
    messages = [
        {"role": "system", "content": _to_str(system_prompt)},
        {"role": "user", "content": _to_str(speech)},
    ]

    # return_dict=True yields input_ids *and* attention_mask. The mask has to be
    # passed explicitly: generate() is given pad_token_id == eos_token_id below,
    # so it cannot infer the mask itself and warns about unreliable results.
    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    prompt_len = encoded["input_ids"].shape[-1]

    with torch.inference_mode():
        out = model.generate(
            **encoded,
            max_new_tokens=4,                   # a label needs only a few tokens
            do_sample=False,                    # deterministic
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    text = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
    # extract only -1/0/1
    m = re.search(r"(?<!\d)-?\b[01]\b", text)
    return m.group(0) if m else (text.split()[0] if text else "0")

# Optional: quick confirmation we’re NOT using a PEFT/LoRA-wrapped model
print("Evaluating BASE model:", not hasattr(model, "peft_config"))

Evaluating BASE model: True


In [None]:
import re, math, numpy as np, pandas as pd, torch
from tqdm.auto import tqdm

LABELS = [-1, 0, 1]
L2I = {l:i for i,l in enumerate(LABELS)}
I2L = {i:l for l,i in L2I.items()}

def _norm_pred(p: str) -> int:
    """Coerce model text -> {-1,0,1}; default to 0 if unparseable."""
    p = str(p).strip()
    m = re.search(r"(?<!\d)-?\b[01]\b", p)
    if m:
        try:
            v = int(m.group(0))
            return v if v in L2I else 0
        except:
            pass
    try:
        v = int(p)
        return v if v in L2I else 0
    except:
        return 0

def _to_str(x):
    if x is None: return ""
    if isinstance(x, float) and math.isnan(x): return ""
    return str(x)

# Score every validation row once, keeping the raw model text next to the
# normalized label so individual mistakes can be inspected afterwards.
results = []

for row in tqdm(valid_df.to_dict("records")):
    gold = int(row["LABEL"])
    speech_txt = _to_str(row.get("speech", ""))
    pred_text = predict_label(speech_txt, SYSTEM_PROMPT)
    pred = _norm_pred(pred_text)

    results.append(dict(
        speech=speech_txt,
        model_output_text=pred_text,  # raw model output
        pred_label=pred,              # normalized to {-1,0,1}
        true_label=gold,
        correct=(pred == gold),
    ))

# Parallel label lists for the metric computation, in row order.
y_true = [entry["true_label"] for entry in results]
y_pred = [entry["pred_label"] for entry in results]

# ---- Metrics & Confusion Matrix ----
cm = np.zeros((len(LABELS), len(LABELS)), dtype=int)  # rows=actual, cols=predicted
for t, p in zip(y_true, y_pred):
    cm[L2I[t], L2I[p]] += 1

support = cm.sum(axis=1)      # number of true examples per class
tp = np.diag(cm)
fp = cm.sum(axis=0) - tp      # predicted as the class but actually another
fn = support - tp             # belong to the class but predicted as another

# np.divide(..., where=...) leaves 0.0 where the denominator is zero instead of
# producing NaN / warnings.
precision = np.divide(tp, tp + fp, out=np.zeros_like(tp, dtype=float), where=(tp+fp)!=0)
recall    = np.divide(tp, tp + fn, out=np.zeros_like(tp, dtype=float), where=(tp+fn)!=0)
f1        = np.divide(2*precision*recall, precision+recall, out=np.zeros_like(tp, dtype=float), where=(precision+recall)!=0)

accuracy   = tp.sum() / max(1, support.sum())
macro_p    = precision.mean()
macro_r    = recall.mean()
macro_f1   = f1.mean()
# Weighted F1 weights each class by its true support. A class with no examples
# must get weight 0; clamping its weight to 1 would pull the average down.
weighted_f1= np.average(f1, weights=support) if support.sum() > 0 else 0.0

cm_df = pd.DataFrame(cm, index=[f"actual {l}" for l in LABELS], columns=[f"pred {l}" for l in LABELS])
report_df = pd.DataFrame({
    "label": LABELS,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "support": support
})

print(f"Accuracy: {accuracy:.4f}  ({tp.sum()}/{int(support.sum())})")
print(f"Macro F1: {macro_f1:.4f} | Weighted F1: {weighted_f1:.4f}")
print("\nPer-class metrics:")
display(report_df)

print("\nConfusion matrix (rows=actual, cols=predicted):")
display(cm_df)

# ---- Inline results table (no saving) ----
results_df = pd.DataFrame(results)
print("\nFirst 30 validation rows with predictions:")
display(results_df.head(30))

# If you want only misclassifications, run:
# display(results_df[~results_df['correct']].head(30))

  0%|          | 0/301 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Token indices sequence length is longer than the specified maximum sequence length for this model (2412 > 512). Running this sequence through the model will result in indexing errors


Accuracy: 0.5980  (180/301)
Macro F1: 0.2988 | Weighted F1: 0.4749

Per-class metrics:


Unnamed: 0,label,precision,recall,f1,support
0,-1,0.428571,0.048387,0.086957,62
1,0,0.4,0.033333,0.061538,60
2,1,0.605536,0.977654,0.747863,179



Confusion matrix (rows=actual, cols=predicted):


Unnamed: 0,pred -1,pred 0,pred 1
actual -1,3,2,57
actual 0,1,2,57
actual 1,3,1,175



First 30 validation rows with predictions:


Unnamed: 0,speech,model_output_text,pred_label,true_label,correct
0,,0,0,-1,False
1,Mr. President. I have sought recognition to ex...,1,1,1,True
2,Mr. President. first. I commend and thank my c...,1,1,1,True
3,"Mr. WOODALL. Mr. Speaker, when we have a natur...",1,1,1,True
4,Mr. President. I have sought recognition today...,1,1,0,False
5,Mr. Speaker. I rise today in support of H.R. 7...,1,1,1,True
6,"Mr. RODNEY DAVIS of Illinois. Madam Chairman, ...",0,0,-1,False
7,Mr. President. today I am reintroducing. with ...,1,1,1,True
8,Mr. President. it has been more than a decade ...,1,1,1,True
9,Mr. Speaker. we have coming before us pretty s...,1,1,-1,False


In [None]:
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig

#  Save directly to Drive
# NOTE(review): the CSVs are read from ".../MyDrive/Congress Data/" while this path
# is ".../MyDrive/Congress/Data/..."; confirm the extra "/" is intended. The later
# cells load the adapter from this same path, so change them together.
ADAPTER_DIR = "/content/drive/MyDrive/Congress/Data/qwen-sentiment-lora"

os.makedirs(ADAPTER_DIR, exist_ok=True)

# --- LoRA config ---
# Rank-16 adapters on the attention and MLP projection layers.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
)

# --- tokenizer limits for both base + finetuned ---
# NOTE(review): this tokenizer object is not handed to SFTTrainer below, so these
# settings presumably only affect code that uses `tokenizer` directly. Confirm.
tokenizer.model_max_length = 512
tokenizer.padding_side = "right"
tokenizer.truncation_side = "right"

# --- Training config to Drive ---
sft_config = SFTConfig(
    output_dir=ADAPTER_DIR,          #  save here
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,  # effective batch size of 16
    num_train_epochs=2,
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    fp16=True,
    do_eval=True,
    eval_strategy="epoch",           # do_eval alone never schedules an evaluation run
    packing=False,
    report_to=[],
)

def format_example(ex):
    """Return the chat turns of one row.

    Kept so existing references keep working; it is no longer passed to the
    trainer because it returns a list of message dicts, not formatted text.
    """
    return ex["messages"]

# The dataset already has a chat-format "messages" column, which SFTTrainer
# handles itself by applying the model's chat template; no formatting_func is needed.
# NOTE(review): the log shows a "Truncating train dataset" step. With over-long
# speeches this presumably cuts the trailing assistant label off the sample.
# Confirm, and consider shortening the speech text before building the messages.
trainer = SFTTrainer(
    model=model,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    peft_config=peft_config,
    args=sft_config,
)

trainer.train()

#  Save into Drive
trainer.model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)

print(f" Adapter + tokenizer saved to {ADAPTER_DIR}")

Applying formatting function to train dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/301 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/301 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/301 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss
10,2.4778
20,1.8702
30,1.8093
40,1.767
50,1.8063
60,1.7579
70,1.7434
80,1.7718
90,1.7304
100,1.7759


✅ Adapter + tokenizer saved to /content/drive/MyDrive/Congress/Data/qwen-sentiment-lora


In [None]:
# ==== Evaluate FINETUNED model (same behavior as Code 1, but with LoRA adapter) ====

import os, re, math, numpy as np, pandas as pd, torch
from tqdm.auto import tqdm
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Where your adapter was saved:
ADAPTER_DIR = "/content/drive/MyDrive/Congress/Data/qwen-sentiment-lora"

# 1) Resolve the base checkpoint name: prefer the notebook-level
#    `base_model_name`; otherwise fall back to the name recorded in the config
#    of the model that is already loaded.
if 'base_model_name' not in globals():
    base_name = getattr(model.config, "_name_or_path", None)
    assert base_name is not None, (
        "Couldn't infer base_model_name. Set base_model_name = 'Qwen/Qwen2.5-1.5B-Instruct' (or your base) and re-run."
    )
else:
    base_name = base_model_name

# bf16 on GPUs with compute capability >= 8, fp16 otherwise (including no GPU).
if torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8:
    dtype = torch.bfloat16
else:
    dtype = torch.float16

# A fresh copy of the base weights, so the result is truly base + adapter.
base_for_ft = AutoModelForCausalLM.from_pretrained(
    base_name,
    device_map="auto",
    torch_dtype=dtype,
    low_cpu_mem_usage=True,
)
if hasattr(base_for_ft, "gradient_checkpointing_enable"):
    base_for_ft.gradient_checkpointing_enable()

# 2) Attach the LoRA adapter and switch to eval mode
assert os.path.isdir(ADAPTER_DIR), f"Adapter dir not found: {ADAPTER_DIR}"
model_ft = PeftModel.from_pretrained(base_for_ft, ADAPTER_DIR)
model_ft.eval()

# 3) Make tokenizer behavior match Code 1
tokenizer.model_max_length = 512
tokenizer.padding_side = "right"
tokenizer.truncation_side = "right"

# --- Helpers identical to Code 1 ---
LABELS = [-1, 0, 1]
L2I = {l:i for i,l in enumerate(LABELS)}

def _to_str(x):
    if x is None: return ""
    if isinstance(x, float) and math.isnan(x): return ""
    return str(x)

def _norm_pred(p: str) -> int:
    p = str(p).strip()
    m = re.search(r"(?<!\d)-?\b[01]\b", p)
    if m:
        try:
            v = int(m.group(0));
            return v if v in L2I else 0
        except:
            pass
    try:
        v = int(p);
        return v if v in L2I else 0
    except:
        return 0

def predict_label_finetuned(speech, system_prompt: str) -> str:
    """Same as Code 1, but uses the finetuned (base+adapter) model ``model_ft``.

    Args:
        speech: Speech text; ``None``/NaN are treated as an empty string.
        system_prompt: Instruction text sent as the system turn.

    Returns:
        '-1', '0', or '1' when such a label appears in the generated text;
        otherwise the first whitespace-delimited token of the output, or '0'
        when nothing was generated.
    """
    messages = [
        {"role": "system", "content": _to_str(system_prompt)},
        {"role": "user", "content": _to_str(speech)},
    ]
    # return_dict=True yields input_ids *and* attention_mask. The mask is passed
    # explicitly because pad_token_id == eos_token_id below, so generate()
    # cannot infer it on its own.
    encoded = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_dict=True, return_tensors="pt"
    ).to(model_ft.device)
    prompt_len = encoded["input_ids"].shape[-1]

    with torch.inference_mode():
        out = model_ft.generate(
            **encoded,
            max_new_tokens=4,
            do_sample=False,                      # deterministic, same as Code 1
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    text = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
    # Single backslashes: inside a raw string, "\\d" / "\\b" would demand a literal
    # backslash in the text, so the label pattern could never match.
    m = re.search(r"(?<!\d)-?\b[01]\b", text)
    return m.group(0) if m else (text.split()[0] if text else "0")

# 4) Run evaluation (same structure/output as Code 1, but on finetuned model)
results = []
for row in tqdm(valid_df.to_dict("records"), desc="Predicting (finetuned)"):
    gold = int(row["LABEL"])
    speech_txt = _to_str(row.get("speech", ""))
    pred_text = predict_label_finetuned(speech_txt, SYSTEM_PROMPT)
    pred = _norm_pred(pred_text)
    results.append(dict(
        speech=speech_txt,
        model_output_text=pred_text,
        pred_label=pred,
        true_label=gold,
        correct=(pred == gold),
    ))

# Parallel label lists for the metric computation, in row order.
y_true = [entry["true_label"] for entry in results]
y_pred = [entry["pred_label"] for entry in results]

# Metrics & confusion matrix (rows=actual, cols=predicted)
cm = np.zeros((len(LABELS), len(LABELS)), dtype=int)
for t, p in zip(y_true, y_pred):
    cm[L2I[t], L2I[p]] += 1

support = cm.sum(axis=1)      # number of true examples per class
tp = np.diag(cm)
fp = cm.sum(axis=0) - tp      # predicted as the class but actually another
fn = support - tp             # belong to the class but predicted as another

# np.divide(..., where=...) leaves 0.0 where the denominator is zero instead of
# producing NaN / warnings.
precision = np.divide(tp, tp + fp, out=np.zeros_like(tp, dtype=float), where=(tp+fp)!=0)
recall    = np.divide(tp, tp + fn, out=np.zeros_like(tp, dtype=float), where=(tp+fn)!=0)
f1        = np.divide(2*precision*recall, precision+recall, out=np.zeros_like(tp, dtype=float), where=(precision+recall)!=0)

accuracy   = tp.sum() / max(1, support.sum())
macro_f1   = f1.mean()
# Weighted F1 weights each class by its true support. A class with no examples
# must get weight 0; clamping its weight to 1 would pull the average down.
weighted_f1= np.average(f1, weights=support) if support.sum() > 0 else 0.0

cm_df = pd.DataFrame(cm, index=[f"actual {l}" for l in LABELS], columns=[f"pred {l}" for l in LABELS])
report_df = pd.DataFrame({"label": LABELS, "precision": precision, "recall": recall, "f1": f1, "support": support})

print(f"FINETUNED — Accuracy: {accuracy:.4f}  ({tp.sum()}/{int(support.sum())})")
print(f"FINETUNED — Macro F1: {macro_f1:.4f} | Weighted F1: {weighted_f1:.4f}")
print("\nPer-class metrics:")
display(report_df)

print("\nConfusion matrix (rows=actual, cols=predicted):")
display(cm_df)

results_df = pd.DataFrame(results)
print("\nFirst 30 validation rows (finetuned):")
display(results_df.head(30))

Predicting (finetuned):   0%|          | 0/301 [00:00<?, ?it/s]

FINETUNED — Accuracy: 0.3422  (103/301)
FINETUNED — Macro F1: 0.2614 | Weighted F1: 0.2904

Per-class metrics:


Unnamed: 0,label,precision,recall,f1,support
0,-1,0.269406,0.951613,0.419929,62
1,0,0.5,0.016667,0.032258,60
2,1,0.5375,0.240223,0.332046,179



Confusion matrix (rows=actual, cols=predicted):


Unnamed: 0,pred -1,pred 0,pred 1
actual -1,59,0,3
actual 0,25,1,34
actual 1,135,1,43



First 30 validation rows (finetuned):


Unnamed: 0,speech,model_output_text,pred_label,true_label,correct
0,,-1,-1,-1,True
1,Mr. President. I have sought recognition to ex...,-1,-1,1,False
2,Mr. President. first. I commend and thank my c...,-1,-1,1,False
3,"Mr. WOODALL. Mr. Speaker, when we have a natur...",-1,-1,1,False
4,Mr. President. I have sought recognition today...,1,1,0,False
5,Mr. Speaker. I rise today in support of H.R. 7...,-1,-1,1,False
6,"Mr. RODNEY DAVIS of Illinois. Madam Chairman, ...",-1,-1,-1,True
7,Mr. President. today I am reintroducing. with ...,-1,-1,1,False
8,Mr. President. it has been more than a decade ...,-1,-1,1,False
9,Mr. Speaker. we have coming before us pretty s...,-1,-1,-1,True


In [None]:
# GPU based model

import torch, re

#  Config — update if needed
ADAPTER_DIR = "/content/drive/MyDrive/Congress/Data/qwen-sentiment-lora"
BASE_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"

#  Same system prompt used for training
SYSTEM_PROMPT = (
    "Task: Assign exactly one numeric label for the entire speech: "
    "-1 (negative), 0 (neutral), or 1 (positive). "
    "Evaluate sentiment toward any salient target (policies, parties, actors, outcomes). "
    "Do not restrict to a predefined topic. Output only the numeric label (no extra text). "
    "Decision rules: Evidence scan — check the opening, repeated points in the body, "
    "and conclusion for explicit evaluative statements (approval/support vs criticism/opposition). "
    "Dominance — decide by Coverage → Emphasis → Strength. "
    "If evaluatives exist and it’s tied, choose the side with stronger evaluatives. "
    "Neutral is valid: assign 0 only when no explicit evaluative statements are found after the scan. "
    "Do not infer sentiment from purely ceremonial, honorific, or procedural content. "
    "Return exactly one numeric label: -1, 0, or 1."
)

#  Hardware dtype choice: bf16 on compute capability >= 8, fp16 otherwise
dtype = torch.bfloat16 if torch.cuda.get_device_capability(0)[0] >= 8 else torch.float16

#  Load tokenizer
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
tokenizer.model_max_length = 512
tokenizer.padding_side = "right"
tokenizer.truncation_side = "right"

#  Free model copies left over from the evaluation cell or an earlier run of this
#  cell before loading new weights; stacking further copies on the GPU is what
#  ended in "CUDA out of memory".
for _stale_name in ("model_ft", "base_for_ft", "base_model"):
    globals().pop(_stale_name, None)
gc.collect()
torch.cuda.empty_cache()

#  Load ONE copy of the base weights and attach the LoRA adapter to it. The base
#  model's predictions are obtained from the same weights by temporarily
#  disabling the adapter, so no second full copy is needed.
model_ft = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map="auto",
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
    ),
    ADAPTER_DIR,
)
model_ft.eval()

#  Label normalization to -1 / 0 / 1
def _norm_pred(p: str) -> int:
    """Coerce model text to -1, 0 or 1; return 0 when nothing parseable is found."""
    p = str(p).strip()
    m = re.search(r"(?<!\d)-?\b[01]\b", p)
    if m:
        # The pattern can only capture "-1", "-0", "0" or "1", so int() cannot fail.
        return int(m.group(0))
    try:
        v = int(p)
    except ValueError:
        return 0
    return v if v in (-1,0,1) else 0

#  Universal prediction function (base or finetuned)
def predict_clean(model_obj, sentence: str):
    """Generate a label for ``sentence`` with ``model_obj``.

    Returns:
        ``(label, raw_text)``: the normalized label in {-1, 0, 1} and the
        stripped text the model actually generated.
    """
    msgs = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": sentence},
    ]
    # return_dict=True also returns the attention mask, which generate() cannot
    # infer by itself because pad_token_id == eos_token_id below.
    encoded = tokenizer.apply_chat_template(
        msgs, add_generation_prompt=True, return_dict=True, return_tensors="pt"
    ).to(model_obj.device)
    prompt_len = encoded["input_ids"].shape[-1]

    with torch.inference_mode():
        out = model_obj.generate(
            **encoded,
            max_new_tokens=4,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    txt = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
    return _norm_pred(txt), txt

#  User input + compare results
sentence = input("Enter a speech or sentence for sentiment prediction:\n")

# Inside disable_adapter() the LoRA layers are bypassed, i.e. the plain base model runs.
with model_ft.disable_adapter():
    base_pred, base_raw = predict_clean(model_ft, sentence)
ft_pred, ft_raw = predict_clean(model_ft, sentence)

print("\n📌 Input:", sentence)
print(f"\n🔹 BASE Model     → {base_pred}   (raw: '{base_raw}')")
print(f"🔸 FINETUNED Model → {ft_pred}   (raw: '{ft_raw}')")

OutOfMemoryError: CUDA out of memory. Tried to allocate 28.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 16.12 MiB is free. Process 2595 has 14.72 GiB memory in use. Of the allocated memory 14.24 GiB is allocated by PyTorch, and 351.79 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# CPU based model
import torch, re, os
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

#  Path to your saved adapter (Drive)
ADAPTER_DIR = "/content/drive/MyDrive/Congress/Data/qwen-sentiment-lora"
BASE_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"

#  Same system prompt used for training. It has to be the full training prompt:
#  a shortened variant means the adapter is queried with instructions it was not
#  trained on, and it would overwrite the notebook-wide SYSTEM_PROMPT.
SYSTEM_PROMPT = (
    "Task: Assign exactly one numeric label for the entire speech: "
    "-1 (negative), 0 (neutral), or 1 (positive). "
    "Evaluate sentiment toward any salient target (policies, parties, actors, outcomes). "
    "Do not restrict to a predefined topic. Output only the numeric label (no extra text). "
    "Decision rules: Evidence scan — check the opening, repeated points in the body, "
    "and conclusion for explicit evaluative statements (approval/support vs criticism/opposition). "
    "Dominance — decide by Coverage → Emphasis → Strength. "
    "If evaluatives exist and it’s tied, choose the side with stronger evaluatives. "
    "Neutral is valid: assign 0 only when no explicit evaluative statements are found after the scan. "
    "Do not infer sentiment from purely ceremonial, honorific, or procedural content. "
    "Return exactly one numeric label: -1, 0, or 1."
)

#  Detect CPU vs GPU
on_gpu = torch.cuda.is_available()
device = torch.device("cuda" if on_gpu else "cpu")
print("Running on:", device)

#  Choose dtype safely
if on_gpu:
    supports_bf16 = torch.cuda.get_device_capability(0)[0] >= 8
    dtype = torch.bfloat16 if supports_bf16 else torch.float16
else:
    dtype = torch.float32  #  CPU-safe

#  Load tokenizer (works everywhere)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
tokenizer.model_max_length = 512
tokenizer.padding_side = "right"
tokenizer.truncation_side = "right"

#  Release copies from an earlier run of this cell before loading new weights.
import gc
for _stale_name in ("model_ft", "base_model"):
    globals().pop(_stale_name, None)
gc.collect()
if on_gpu:
    torch.cuda.empty_cache()

#  Load ONE copy of the base weights and attach the LoRA adapter to it. The base
#  model's predictions come from the same weights with the adapter temporarily
#  disabled, which halves the memory needed compared with two full copies.
model_ft = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
    ),
    ADAPTER_DIR,
)

model_ft.to(device)
model_ft.eval()

#  Label normalization (-1/0/1)
def _norm_pred(p: str) -> int:
    """Coerce model text to -1, 0 or 1; return 0 when nothing parseable is found."""
    p = str(p).strip()
    m = re.search(r"(?<!\d)-?\b[01]\b", p)
    if m:
        # The pattern can only capture "-1", "-0", "0" or "1", so int() cannot fail.
        return int(m.group(0))
    try:
        v = int(p)
    except ValueError:
        return 0
    return v if v in (-1,0,1) else 0

#  Universal prediction function
def predict_clean(model_obj, sentence: str):
    """Generate a label for ``sentence`` with ``model_obj`` on ``device``.

    Returns:
        ``(label, raw_text)``: the normalized label in {-1, 0, 1} and the
        stripped text the model actually generated.
    """
    msgs = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": sentence},
    ]
    # return_dict=True also returns the attention mask, which generate() cannot
    # infer by itself because pad_token_id == eos_token_id below.
    inputs = tokenizer.apply_chat_template(
        msgs, add_generation_prompt=True, return_dict=True, return_tensors="pt"
    ).to(device)
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        out = model_obj.generate(
            **inputs,
            max_new_tokens=4,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    txt = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
    return _norm_pred(txt), txt

#  Input + display predictions
sentence = input("Enter a speech or sentence for sentiment prediction:\n")

# Inside disable_adapter() the LoRA layers are bypassed, i.e. the plain base model runs.
with model_ft.disable_adapter():
    base_pred, base_raw = predict_clean(model_ft, sentence)
ft_pred, ft_raw = predict_clean(model_ft, sentence)

print("\n📌 Input:", sentence)
print(f"\n🔹 BASE Model     → {base_pred}   (raw: '{base_raw}')")
print(f"🔸 FINETUNED Model → {ft_pred}   (raw: '{ft_raw}')")

Running on: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]