In [2]:
import json
import pandas as pd

def convert_tbbt_json_to_csv(json_path, output_path):
    """Flatten a nested episode/scene/turn JSON transcript into a CSV file.

    The JSON file is read as UTF-8 and must hold an object whose keys look
    like ``"<season>_<episode>"`` and whose values are lists of scene
    objects. Each scene may carry a ``"Turns"`` list; each turn may carry a
    ``"Speaker"`` and a ``"Words"`` list. Only the first element of every
    word entry is kept (the remaining elements, e.g. POS tags, are dropped)
    and the kept tokens are joined with single spaces to form the dialogue.
    Turns whose dialogue is empty or whitespace-only are skipped.

    The CSV always has the columns ``Character, Dialogue, EpisodeNo, SEID,
    Season`` in that order, even when no turn produced a row.

    Args:
        json_path: Path of the JSON file to read.
        output_path: Path of the CSV file to write (no index column).

    Raises:
        ValueError: If an episode key does not split on ``"_"`` into exactly
            two integer parts.
    """
    columns = ["Character", "Dialogue", "EpisodeNo", "SEID", "Season"]

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    rows = []

    for ep_key, scenes in data.items():
        season_text, episode_text = ep_key.split("_")
        season = int(season_text)
        episode = int(episode_text)
        seid = f"S{season:02d}E{episode:02d}"

        for scene in scenes:
            for turn in scene.get("Turns", []):
                # Keep only the token (index 0) of each non-empty word entry.
                dialogue = " ".join(w[0] for w in turn.get("Words", []) if w)

                # Skip turns that carry no spoken text.
                if not dialogue.strip():
                    continue

                rows.append({
                    "Character": turn.get("Speaker", ""),
                    "Dialogue": dialogue,
                    "EpisodeNo": episode,
                    "SEID": seid,
                    "Season": season,
                })

    # Passing `columns` fixes the column order and, unlike selecting columns
    # afterwards, also works when `rows` is empty (header-only CSV instead of
    # a KeyError on the missing columns).
    df = pd.DataFrame(rows, columns=columns)

    df.to_csv(output_path, index=False)
    print(f"Done! Saved as {output_path}")


# Convert the transcript JSON under data/ into a flat CSV in the working directory.
convert_tbbt_json_to_csv(
    json_path="data/final.json",
    output_path="tbbt_corpus_clean.csv",
)


Done! Saved as tbbt_corpus_clean.csv
