# Youtube Sentiment Analysis
MORS 2025 Competition

CPT Jonathan Dencker

CPT John McCormick

In [4]:
import os, sys, json, uuid, hashlib, time, platform, subprocess, random
from datetime import datetime
from pathlib import Path
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, LogitsProcessor
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")
tqdm.pandas()

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"   # silence TF/XLA info/warnings
os.environ["JAX_PLATFORM_NAME"] = "cpu"    # keep JAX off the GPU, if present
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"

def set_seed(seed: int = 42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)
RUN_ID = uuid.uuid4().hex[:8]
OUT_DIR = Path(f"/kaggle/working/run_{RUN_ID}")
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [2]:
# from huggingface_hub import snapshot_download

# snapshot_download(
#     repo_id="Qwen/Qwen2.5-7B-Instruct",
#     local_dir="/kaggle/working/models/Qwen2.5-7B-Instruct"
# )



Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

LICENSE: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

'/kaggle/working/models/Qwen2.5-7B-Instruct'

In [5]:
# Load Model
MODEL_PATH="/kaggle/working/models/Qwen2.5-7B-Instruct"

device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.bfloat16 if device == "cuda" and torch.cuda.is_bf16_supported() else torch.float16

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True, padding_side="left")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token 

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch_dtype,
    device_map="auto" if device == "cuda" else None
)

model.config.pad_token_id = tokenizer.pad_token_id

model.eval()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((3584,), eps=1e-06)
    (rotary_emb):

In [6]:
# Prompt Template
LABELS = ["for", "against", "neutral"]
LABEL_SET = set(LABELS)

SYSTEM_MSG = "You are a precise classifier. Only output valid JSON with two keys and no extra text."
USER_TMPL = """Classify the following YouTube comment for two stances.
Return ONLY JSON: {{"stance_toward_army": "...", "stance_toward_video": "..."}}.
Allowed values: "for", "against", "neutral".

Comment:
<<<{text}>>>
"""

def build_prompt(comment: str) -> str:
    # Llama/Qwen/Mistral instruction-style prompt
    return (
        f"<|begin_of_text|><|start_header_id|>system<|end_header_id|> \
            \n{SYSTEM_MSG} \
            \n<|eot_id|><|start_header_id|>user<|end_header_id|> \
            \n{USER_TMPL.format(text=comment)} \
            \n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
    )

gen_cfg = GenerationConfig(
    max_new_tokens=64,     # small: JSON with two words
    top_p=1.0,
    do_sample=False,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id
)


def classify_batch(comments):
    prompts = [build_prompt(c) for c in comments]
    encoding = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)
    # unpacks tokenizer encoding.keys() -> dict_keys(['input_ids', 'attention_mask'])
    enc = {k: v.to(model.device) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = model.generate(
            **enc,
            generation_config=gen_cfg
        )
        
    # left padding -> slice using actual (non-padded) lengths
    input_lengths = enc["attention_mask"].sum(dim=1)

    decoded = [
        tokenizer.decode(
            outputs[i, int(input_lengths[i].item()):],
            skip_special_tokens=True
        )
        for i in range(outputs.size(0))
    ]
    return decoded


def parse_json_safe(s: str):
    # Extract first JSON block
    start = s.find("{")
    end = s.rfind("}")
    if start != -1 and end != -1 and end > start:
        frag = s[start:end+1]
        try:
            obj = json.loads(frag)
            ms = str(obj.get("stance_toward_army","")).strip().lower()
            vs = str(obj.get("stance_toward_video","")).strip().lower()
            # fallback mapping in case model outputs synonyms
            syn = {"pro":"for","support":"for","anti":"against","oppose":"against","opposed":"against","neutrality":"neutral"}
            ms = syn.get(ms, ms)
            vs = syn.get(vs, vs)
            if ms not in LABEL_SET: ms = "neutral"
            if vs not in LABEL_SET: vs = "neutral"
            return {"stance_toward_army": ms, "stance_toward_video": vs, "raw": frag}
        except Exception:
            pass
    return {"stance_toward_army":"neutral","stance_toward_video":"neutral","raw": s[:2000]}

In [7]:
BATCH = 16
results = []

df = pd.read_csv("/kaggle/input/2025-mors-data-challenge/train.csv")
df = df.head(32)

for i in tqdm(range(0, len(df), BATCH), desc="Batches Processed:", position=0):
    chunk = df.iloc[i:i+BATCH]
    outs = []
    for comment in tqdm(chunk["comment"].tolist(), desc=f"Batch {i/BATCH+1}", leave=False, position=1):
        outs.extend(classify_batch([comment]))
        
    parsed = list(map(parse_json_safe, outs))
    
    for uid, p in zip(chunk["id"], parsed):
        results.append({
            "id": uid,
            "stance_toward_army": p["stance_toward_army"],
            "stance_toward_video": p["stance_toward_video"],
            "raw_model_out": p["raw"]
        })

Batches Processed::   0%|          | 0/2 [00:00<?, ?it/s]

Batch 1.0:   0%|          | 0/16 [00:00<?, ?it/s]

Batch 2.0:   0%|          | 0/16 [00:00<?, ?it/s]

In [8]:
preds_with_raw_output = pd.DataFrame(results)
preds_for_submission = preds_with_raw_output[["id", "stance_toward_army", "stance_toward_video"]]
preds_with_raw_output.to_parquet(OUT_DIR / "predictions.parquet", index=False)
preds_for_submission.to_csv(OUT_DIR / "submission.csv", index=False)

print("Saved:")
print(" - Full predictions with raw model output: ", OUT_DIR / "predictions.parquet")
print(" - Clean submission file: ", OUT_DIR / "submission.csv")

Saved:
 - Full predictions with raw model output:  /kaggle/working/run_7aa009b2/predictions.parquet
 - Clean submission file:  /kaggle/working/run_7aa009b2/submission.csv


In [9]:
print(preds_for_submission)

                                                   id stance_toward_army  \
0                          UgzgltD6aB_xznJOGPp4AaABAg            neutral   
1                          Ugzoj2u6j6yRXAH9-PV4AaABAg            neutral   
2                          UgycFd1HUYGSHfIhzDB4AaABAg            against   
3                          Ugzi6_ly_QKkYwBMjc54AaABAg                for   
4                          UgwHIvTiNh5KqIUJHwt4AaABAg                for   
5                          UgwM8uKpA9cgNrAlebF4AaABAg            against   
6                          UgzmahQHNmkXZsL7_-h4AaABAg            neutral   
7                          Ugyrsd7JXLL53mfGyKB4AaABAg            neutral   
8                          UgySHsZ9Y4UMoXPbgqB4AaABAg            neutral   
9                          UgwawVkimE9UITpWtDd4AaABAg                for   
10                         Ugyk4AqmVuNhEGOmAmV4AaABAg                for   
11                         UgwItktcdZrNX_1EFJ14AaABAg                for   
12          