In [None]:
import torch

# Select the compute device once and report which one was chosen.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU.")

In [None]:
!sudo apt update
!sudo apt install -y pciutils
!curl -fsSL https://ollama.com/install.sh | sh
!pip install ollama

In [None]:
# Make Ollama run like a service
import time
import threading
import subprocess

def ollama_service_start():
  """Spawn `ollama serve` as a child process.

  The process handle is not kept and nothing is returned; the call only
  launches the server command.
  """
  serve_command = ['ollama', 'serve']
  subprocess.Popen(serve_command)

import socket

# Address polled to detect that the server is up. This is Ollama's documented
# default; adjust it if the server is configured to listen elsewhere.
OLLAMA_ADDRESS = ("127.0.0.1", 11434)
# Upper bound (seconds) on how long to wait for the server to come up.
OLLAMA_STARTUP_TIMEOUT = 60

thread = threading.Thread(target=ollama_service_start)
thread.start()
# Popen does not wait for the child process, so the launcher thread finishes
# as soon as the server process has been spawned.
thread.join()

# Wait until the server accepts TCP connections instead of sleeping for a
# fixed time: a fixed sleep is either too short (the next cell fails) or
# needlessly long.
startup_deadline = time.monotonic() + OLLAMA_STARTUP_TIMEOUT
while time.monotonic() < startup_deadline:
  try:
    with socket.create_connection(OLLAMA_ADDRESS, timeout=1):
      break
  except OSError:
    # Not listening yet (or the attempt timed out); retry shortly.
    time.sleep(0.5)
else:
  # Best effort, like the original fixed sleep: warn rather than abort.
  print(f"Warning: no Ollama server reachable at {OLLAMA_ADDRESS[0]}:{OLLAMA_ADDRESS[1]} "
        f"after {OLLAMA_STARTUP_TIMEOUT}s")

In [None]:
# Pull the deepseek-r1 model through the Ollama CLI. No tag is given here;
# presumably this resolves to the `latest` tag that DEEPSEEK_MODEL refers to
# further down -- TODO confirm.
!ollama pull deepseek-r1

In [None]:
# List the models known to Ollama, as a quick check that the pull succeeded.
!ollama list

In [None]:
# Model name (with tag) passed to ollama.chat by classify_vehicle_related.
DEEPSEEK_MODEL = 'deepseek-r1:latest'

In [None]:
import re

def remove_think_tag(content):
  """Strip every `<think>...</think>` section from a model reply.

  Args:
    content: Reply text that may contain one or more `<think>` sections,
      possibly spanning several lines.

  Returns:
    `content` with each complete `<think>...</think>` section removed and
    surrounding whitespace stripped. Text with an unclosed `<think>` tag is
    left as is (apart from the whitespace stripping).
  """
  # Raw string: avoids the invalid "\s" / "\/" escapes of a plain literal.
  # re.DOTALL lets "." cross newlines, which is what "(.|\s)" emulated;
  # the non-greedy "*?" keeps separate <think> sections separate.
  return re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL).strip()

In [None]:
import ollama
def classify_vehicle_related(text):
    """Ask the DeepSeek model whether a question-answer pair concerns vehicles.

    Args:
        text: The question-answer pair, inserted verbatim into the prompt.

    Returns:
        The content of the model's reply with surrounding whitespace
        stripped. The prompt requests a bare "Yes" or "No", but the reply is
        returned as received, so any extra text the model emits is included.
    """
    prompt = f"""
    You are a highly accurate classifier that determines whether a given question-answer pair is related to vehicles or not.

    A vehicle is any means of transport, including but not limited to: cars, bicycles, motorcycles, trucks, buses, trains, airplanes, and boats.

    Please analyze the following input and respond with only "Yes" if it is related to vehicles, and "No" otherwise.
    
    Input: "{text}"
    Output:
    """

    # Single-turn conversation: the whole prompt goes in as one user message.
    user_message = {"role": "user", "content": prompt}
    reply = ollama.chat(model=DEEPSEEK_MODEL, messages=[user_message])
    answer = reply["message"]["content"]
    return answer.strip()

# Smoke test: classify one sample question-answer pair and print the reply
# exactly as classify_vehicle_related returns it (remove_think_tag is not
# applied here).
text = "what is the color of the sign ? - red"
result = classify_vehicle_related(text)
print(result)

In [None]:
import json
# Load the raw question-answer records to be filtered.
# NOTE: despite the variable name, the file read here is the raw *test* split.
# JSON text is read as UTF-8 explicitly instead of relying on the locale default.
with open("/kaggle/input/test-raw-data/cocoqa_raw_test.json", 'r', encoding="utf-8") as f:
    train_data = json.load(f)

In [None]:
import json
import os

# Number of processed samples between two intermediate saves.
BATCH_SIZE = 50
# Results of an earlier run, used as the starting point when present.
filtered_data_path = "/kaggle/input/checkpoint/vehicle_raw_test.json"
# Where the filtered samples of this run are written.
SAVE_PATH = "/kaggle/working/filtered_data_test.json"


def _save_filtered_data(samples, path=SAVE_PATH):
    """Write the samples collected so far to `path` as indented JSON."""
    with open(path, "w") as f:
        json.dump(samples, f, indent=4)


filtered_data = []

if os.path.exists(filtered_data_path):
    with open(filtered_data_path, "r", encoding="utf-8") as f:
        try:
            filtered_data = json.load(f)
        except json.JSONDecodeError:
            # Keep the best-effort behaviour, but make the fallback visible.
            print(f"Could not parse {filtered_data_path}; starting with no filtered samples")
            filtered_data = []

# Manual resume marker: presumably the question of the last sample that the
# checkpoint above already covers -- verify when updating the checkpoint.
q = "what are there coming down the street to a green light ?"
# 0-based position of the sample right after the marker; -1 if it is not found.
start_idx = next(
    (i + 1 for i, item in enumerate(train_data) if item['question'] == q),
    -1,
)

# Only skip ahead when there is checkpoint data to build on and the marker
# was found; otherwise start from the first sample so that nothing is lost.
resume_pos = start_idx if (filtered_data and start_idx > 0) else 0

print(f'continue to classify questions from index {resume_pos}')
try:
    # `idx` stays the 1-based position within the full dataset.
    for idx, item in enumerate(train_data[resume_pos:], start=resume_pos + 1):
        qa_pair = f"{idx}. {item['question']} - {item['ans']}"
        result = remove_think_tag(classify_vehicle_related(qa_pair))
        print(f"{qa_pair} : {result}")

        if "Yes" in result:
            filtered_data.append(item)

        # Save after every BATCH_SIZE processed samples, whatever the verdict
        # on the current one was.
        if idx % BATCH_SIZE == 0:
            _save_filtered_data(filtered_data)
            print(f"Saved {len(filtered_data)} samples at {SAVE_PATH}")
finally:
    # Final save; also runs when the loop is interrupted or fails, so the
    # samples collected up to that point are not lost.
    _save_filtered_data(filtered_data)
