In [1]:
from openai import OpenAI
import json
from config import API_KEY
import logging
import os
import pandas as pd
from tqdm import tqdm
from omegaconf import OmegaConf
from demo import Runner
import torch

## Model

In [2]:
# Load the annotation table and keep only the rows whose is_train flag equals
# False; the resulting test split is re-indexed from 0.
df = pd.read_csv("./data/train.csv")
held_out_mask = df['is_train'].eq(False)
df_test = df.loc[held_out_mask].reset_index(drop=True)

In [4]:
# Settings handed to the Runner for every video.
# NOTE(review): model_path is an absolute, machine-specific path — adjust it
# (or make it configurable) before running on another machine.
# mean/std are presumably per-channel normalisation constants — confirm
# against the Runner implementation.
config_dict = dict(
    model_path="/home/user/Desktop/nlp_project/mvit32-2.onnx",
    video_path="",  # placeholder; overwritten per video by the inference loop
    frame_interval=1,
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
)
conf = OmegaConf.create(config_dict)

In [None]:
# Run the model on every test video and append its predictions to
# model_preds.txt, one record per video in the form
# "<video_name>\n<space-joined predictions>\n\n".
for idx, row in tqdm(df_test.iterrows(), total=len(df_test)):
    video_name = row["video_name"]

    video_path = os.path.join("./data/video_segments", video_name + ".mp4")
    if not os.path.exists(video_path):
        # No segment file for this row: skip it without reporting.
        continue
    conf.video_path = video_path

    # Bound before the try so the cleanup in `finally` is valid even when the
    # Runner constructor itself raises.
    runner = None
    try:
        runner = Runner(conf.model_path, conf, mp=False, verbose=True)

        print(f"\nProcessing video: {video_name}")
        runner.run()

        predictions = list(runner.prediction_list)
        # Append mode: records accumulate across videos (and across re-runs of
        # this cell).
        with open("model_preds.txt", "a") as f:
            f.write(video_name)
            f.write("\n")
            f.write(" ".join(predictions))
            f.write("\n\n")

    except Exception as e:
        # Best effort: report the failure and carry on with the next video.
        print(f"Error processing {video_name}: {e}")
    finally:
        # Drop the runner and release cached GPU memory whether or not the
        # video succeeded, so a failing video does not keep memory held for
        # the rest of the loop.
        del runner
        torch.cuda.empty_cache()

In [3]:
# Parse model_preds.txt into {video_name: {"preds": "<prediction string>"}}.
# Each record in the file is three lines: the video name, the space-joined
# predictions, and a blank separator. The prediction line can itself be empty
# (no predictions for that video), so records are read line by line instead of
# splitting on blank lines, which would misalign every record after an empty one.
with open("model_preds.txt", "r") as f:
    pred_lines = f.read().split("\n")

data = {}
cursor = 0
while cursor < len(pred_lines):
    record_name = pred_lines[cursor]
    if record_name == "":
        # Stray blank line between records, or the trailing one at end of file.
        cursor += 1
        continue

    # The line directly after the name holds the predictions; it is missing
    # only if the file ends right after a name, in which case use "".
    has_pred_line = cursor + 1 < len(pred_lines)
    pred_line = pred_lines[cursor + 1] if has_pred_line else ""
    # A later record for the same video overwrites an earlier one.
    data[record_name] = {"preds": pred_line.replace("--- ", '')}
    cursor += 2

    # Consume this record's blank separator so it is not confused with an
    # empty prediction line of the next record.
    if cursor < len(pred_lines) and pred_lines[cursor] == "":
        cursor += 1

In [4]:
# Attach the reference glosses from df_test to each video that has a
# prediction record; test videos without a record in `data` are ignored.
for name, glosses in zip(df_test['video_name'], df_test['glosses']):
    if name not in data:
        continue
    data[name]['real'] = glosses

In [7]:
def levenshtein_distance(ref_words, hyp_words):
    """Return the edit distance between two token sequences.

    The distance is the minimum number of single-token insertions, deletions
    and substitutions (each costing 1) that turn ``ref_words`` into
    ``hyp_words``.

    Args:
        ref_words: Sequence of reference tokens.
        hyp_words: Sequence of hypothesis tokens.

    Returns:
        The edit distance as a non-negative int.
    """
    # prev_row[j] is the distance between the reference prefix handled so far
    # and the first j hypothesis tokens; only two rows are kept at a time.
    prev_row = list(range(len(hyp_words) + 1))
    for row_idx, ref_tok in enumerate(ref_words, start=1):
        curr_row = [row_idx]
        for col_idx, hyp_tok in enumerate(hyp_words, start=1):
            substitution_cost = 0 if ref_tok == hyp_tok else 1
            deletion = prev_row[col_idx] + 1
            insertion = curr_row[col_idx - 1] + 1
            substitution = prev_row[col_idx - 1] + substitution_cost
            curr_row.append(min(deletion, insertion, substitution))
        prev_row = curr_row
    return prev_row[-1]

def wer_custom(reference, hypothesis):
    """Compute the word error rate of ``hypothesis`` against ``reference``.

    Both inputs are split on whitespace; any non-string input (e.g. a missing
    value) is treated as an empty string.

    Args:
        reference: Ground-truth text.
        hypothesis: Predicted text.

    Returns:
        Word-level edit distance divided by the number of reference words, or
        ``float('inf')`` when the reference contains no words.
    """
    ref_words = reference.split() if isinstance(reference, str) else []
    hyp_words = hypothesis.split() if isinstance(hypothesis, str) else []
    if not ref_words:
        # The rate is undefined without reference words; signalled with inf.
        return float('inf')
    return levenshtein_distance(ref_words, hyp_words) / len(ref_words)

In [None]:
# Per-video WER, computed only for records that have reference glosses attached.
wer_results = {
    video: wer_custom(record['real'], record['preds'])
    for video, record in data.items()
    if 'real' in record
}

In [None]:
# Average only the finite scores: wer_custom returns inf for an empty
# reference, and a single inf would make the whole mean inf.
finite_wers = [score for score in wer_results.values() if score != float('inf')]
skipped_count = len(wer_results) - len(finite_wers)

if finite_wers:
    average_wer = sum(finite_wers) / len(finite_wers)
    print(f"Average WER over {len(finite_wers)} samples: {average_wer:.3f}")
else:
    # Nothing to average (no scored samples); avoid dividing by zero.
    average_wer = float('nan')
    print("Average WER over 0 samples: n/a")

if skipped_count:
    print(f"Skipped {skipped_count} samples with an empty reference")

## LLM

In [12]:
# OpenAI client used by send_data_to_chatgpt; authenticated with the API_KEY
# imported from the local config module.
client = OpenAI(api_key=API_KEY)

In [13]:
def send_data_to_chatgpt(data, prompt):
    """Send ``prompt`` followed directly by ``data`` to gpt-4o as one user message.

    Args:
        data: Text appended to the prompt with no separator in between.
        prompt: Instruction text placed before ``data``.

    Returns:
        The message content of the first choice in the chat completion.
    """
    user_message = {"role": "user", "content": prompt + data}
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [6]:
# Attach the reference transcript from df_test to each video that has a
# prediction record; test videos without a record in `data` are ignored.
for name, transcript in zip(df_test['video_name'], df_test['transcript']):
    if name not in data:
        continue
    data[name]['real_transcript'] = transcript

In [17]:
# Ask the LLM for a transcript of each predicted gloss sequence, restricted to
# videos that have predictions, reference glosses and a reference transcript.
# The reference transcript is stored under 'real_transcript'; checking for
# 'transcript' would never match and the loop would do nothing.
# NOTE(review): `prompt` is not defined in the visible cells — it must already
# exist in the kernel before this cell runs.
required_keys = ('real', 'preds', 'real_transcript')
for key, value in data.items():
    if all(field in value for field in required_keys):
        glosses = value['preds']
        prediction = send_data_to_chatgpt(glosses, prompt)
        # Key spelling ('pred_trancript') is left unchanged in case other
        # cells read it.
        data[key]['pred_trancript'] = prediction