<a href="https://colab.research.google.com/github/hacyuuglitch/FinalProject-for-Elective-4_UNLABELED/blob/main/Final_Project_ITE4(UNLABELED).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import joblib
import os

# --- Data preparation --------------------------------------------------------
print("Loading dataset...")

# Only the tweet text and its incident type are needed; rows missing either
# value are discarded.
df = pd.read_csv("data_mmda_traffic_spatial.csv")[["Tweet", "Type"]].dropna()

# Map each distinct incident type string to an integer class id.
label_encoder = LabelEncoder()
df = df.assign(label=label_encoder.fit_transform(df["Type"]))

# Wrap the frame as a Hugging Face Dataset and carve out a reproducible
# 80/20 train/test split (fixed seed).
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = dataset["train"], dataset["test"]

print("Tokenizing...")
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    """Tokenize the ``Tweet`` field of a batch of examples.

    Uses the module-level ``tokenizer`` with both padding and truncation
    enabled, and returns the tokenizer's output unchanged so it can be
    merged into the dataset by ``Dataset.map``.
    """
    tweets = examples["Tweet"]
    encoded = tokenizer(tweets, padding=True, truncation=True)
    return encoded

# Tokenize both splits in batches.
tokenized = dataset.map(tokenize_function, batched=True)

# The label column is renamed from "label" to "labels" on each split.
train_dataset = tokenized["train"].rename_column("label", "labels")
eval_dataset = tokenized["test"].rename_column("label", "labels")

# Expose only the columns listed here, as torch tensors.
_model_columns = ["input_ids", "attention_mask", "labels"]
for _split in (train_dataset, eval_dataset):
    _split.set_format("torch", columns=_model_columns)

_num_classes = df["label"].nunique()
print(f"Training labels: {_num_classes} classes")

# Pretrained DistilBERT with a classification head sized to the label set.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=_num_classes,
)

def compute_metrics(p):
    """Compute accuracy and weighted F1 for an evaluation prediction object.

    ``p.predictions`` is reduced with an argmax over axis 1 to obtain the
    predicted class ids, which are compared against ``p.label_ids``.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

# --- Training ----------------------------------------------------------------
# `output_dir` only receives logs and intermediate checkpoints; the final
# model is written to FINAL_MODEL_SAVE_PATH further down.
training_args = TrainingArguments(
    output_dir="./results_mmda",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    # Evaluate and checkpoint once per epoch so the best checkpoint can be
    # reloaded when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # No reporting integrations are listed.
    report_to=[],
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

print("Training model...")
trainer.train()

print("Evaluating...")
results = trainer.evaluate()
print(results)

# --- Persist the final artefacts ---------------------------------------------
FINAL_MODEL_SAVE_PATH = "./mmda_bert_best"

# Create the target directory up front so both the model and the label
# encoder can be written into it.
os.makedirs(FINAL_MODEL_SAVE_PATH, exist_ok=True)

trainer.save_model(FINAL_MODEL_SAVE_PATH)
# The fitted label encoder is stored next to the model so class ids can be
# mapped back to incident type names at inference time.
joblib.dump(label_encoder, f"{FINAL_MODEL_SAVE_PATH}/label_encoder.pkl")
print("Model and label encoder saved!")

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, pipeline
import joblib

# --- Inference setup ---------------------------------------------------------
# Directory holding the fine-tuned model, its tokenizer and the label encoder.
MODEL_PATH = "./mmda_bert_best"

# Restore the fine-tuned classifier and its tokenizer from disk.
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_PATH)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)

# The label encoder lives inside the model directory.
# NOTE: joblib.load unpickles the file; only load encoders you created.
label_encoder = joblib.load(f"{MODEL_PATH}/label_encoder.pkl")

classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
)

def classify_incident(text):
    """Return the incident type predicted for a single piece of text.

    Args:
        text: The tweet text to classify.

    Returns:
        The original incident type label, obtained by mapping the predicted
        class index back through ``label_encoder``.
    """
    # Truncate overlong input, matching the truncation used when the
    # training data was tokenized, so long text cannot exceed the model's
    # maximum input length.
    prediction = classifier(text, truncation=True)[0]['label']
    # Resolve the pipeline's label name to its class index through the
    # model config rather than by parsing the "LABEL_<n>" string.
    idx = int(classifier.model.config.label2id[prediction])
    return label_encoder.inverse_transform([idx])[0]


# Classify every tweet in the dataset. This statement must sit at column 0:
# a stray indent here makes the whole file fail with an IndentationError.
df = pd.read_csv("data_mmda_traffic_spatial.csv")

# Only rows that actually contain tweet text are classified; the index-aligned
# assignment leaves Predicted_Type as NaN for rows whose Tweet is missing.
df['Predicted_Type'] = df['Tweet'].dropna().apply(classify_incident)

print(df[['Tweet', 'Predicted_Type']].head())

import pandas as pd
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, pipeline

# 1. Load the dataset; every non-null tweet is used as a QA context
df = pd.read_csv("data_mmda_traffic_spatial.csv")
df = df[['Tweet']].dropna()
contexts = df['Tweet'].tolist()

# 2. Load DistilBERT QA model
qa_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased-distilled-squad")
qa_model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer)

# 3. Ask question
question = input("Enter your WH-question about the accident: ")

best_answer = None
best_score = 0
best_context = None
skipped = 0  # number of tweets the QA pipeline failed on

# 4. Search entire dataset for the best answer
for ctx in contexts:
    try:
        result = qa_pipeline(question=question, context=ctx)
        score, answer = result['score'], result['answer']
    except Exception:
        # Best-effort search: a tweet the pipeline cannot handle is skipped.
        # Catching Exception (not a bare except) keeps Ctrl-C / SystemExit
        # working during this potentially long loop.
        skipped += 1
        continue
    if score > best_score:
        best_score = score
        best_answer = answer
        best_context = ctx

# 5. Show answer
print("Question:", question)
print("Answer:", best_answer)
print("Confidence:", best_score)
print("From Tweet:", best_context)
if skipped:
    # Surface failures instead of hiding them, so an all-None result can be
    # told apart from "no tweet contained an answer".
    print(f"Skipped {skipped} tweet(s) that the QA pipeline could not process.")