In [2]:
import json
import pandas as pd

def convert_tbbt_json_to_csv(json_path, output_path):
    """Flatten a TBBT transcript JSON file into a one-row-per-turn CSV.

    The JSON is expected to map "<season>_<episode>" keys to a list of scenes.
    Each scene holds a "Turns" list whose items carry a "Speaker" and a
    "Words" list; only the first element of every "Words" entry (the word
    token) is kept, anything after it (e.g. the POS tag) is ignored.

    Turns whose joined dialogue is empty or whitespace-only are skipped.
    The CSV is written without the index and always has the columns
    Character, Dialogue, EpisodeNo, SEID, Season -- even when no rows were
    produced.

    Args:
        json_path: Path of the UTF-8 encoded JSON file to read.
        output_path: Path of the CSV file to write.

    Raises:
        ValueError: If an episode key is not two integers joined by "_".
    """
    # Fixed column order of the output CSV.
    columns = ["Character", "Dialogue", "EpisodeNo", "SEID", "Season"]

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    rows = []

    for ep_key, scenes in data.items():
        season, episode = (int(part) for part in ep_key.split("_"))
        seid = f"S{season:02d}E{episode:02d}"

        for scene in scenes:
            # `or []` also covers an explicit null in the JSON.
            for turn in scene.get("Turns") or []:
                words = turn.get("Words") or []

                # Keep only the word token of each entry; skip empty entries.
                dialogue = " ".join(w[0] for w in words if w)

                # Skip turns that carry no spoken text.
                if not dialogue.strip():
                    continue

                rows.append({
                    "Character": turn.get("Speaker", ""),
                    "Dialogue": dialogue,
                    "EpisodeNo": episode,
                    "SEID": seid,
                    "Season": season,
                })

    # Passing `columns` fixes the column order and keeps the header when
    # `rows` is empty (selecting columns from a column-less frame would
    # raise KeyError).
    df = pd.DataFrame(rows, columns=columns)

    df.to_csv(output_path, index=False)
    print(f"Done! Saved as {output_path}")


# Convert the raw transcript JSON into a flat CSV in the working directory.
convert_tbbt_json_to_csv(
    json_path="data/final.json",
    output_path="tbbt_corpus_clean.csv",
)


Done! Saved as tbbt_corpus_clean.csv


In [5]:
import pandas as pd

# Load the converted TBBT corpus and preview its first rows.
# NOTE(review): the conversion cell writes "tbbt_corpus_clean.csv" to the
# working directory, while this cell reads it from raw_data/ -- confirm the
# file was moved there (or align the two paths) so a fresh run still works.
tbbt_raw = pd.read_csv(filepath_or_buffer='raw_data/tbbt_corpus_clean.csv')
tbbt_raw.head(n=5)

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
0,Sheldon,photon directed plane two slits either slit ob...,1,S01E01,1
1,Leonard,agreed point,1,S01E01,1
2,Sheldon,point think good idea tee shirt,1,S01E01,1
3,Leonard,excuse,1,S01E01,1
4,Others,hang,1,S01E01,1


In [6]:
import pandas as pd

# Load the Friends dialogue corpus and preview its first rows; its columns
# are the target schema for the TBBT frame.
friends = pd.read_csv(filepath_or_buffer='raw_data/friends_dialogues.csv')
friends.head(n=5)

Unnamed: 0,episode_title,season,episode,character,dialogue
0,Monica Gets A Roommate,1.0,1.0,Monica,There's nothing to tell! He's just some guy I ...
1,Monica Gets A Roommate,1.0,1.0,Joey,"C'mon, you're going out with the guy! There's ..."
2,Monica Gets A Roommate,1.0,1.0,Chandler,"All right Joey, be nice. So does he have a hum..."
3,Monica Gets A Roommate,1.0,1.0,Phoebe,"Wait, does he eat chalk?"
4,Monica Gets A Roommate,1.0,1.0,Phoebe,"Just, 'cause, I don't want her to go through w..."


In [7]:
# Bring the TBBT frame into the Friends column layout
# (episode_title, season, episode, character, dialogue).
# `rename` already returns a new DataFrame, so tbbt_raw is left untouched
# without an explicit copy. The SEID column is dropped by the final selection.
tbbt = (
    tbbt_raw
    .rename(columns={
        "Season": "season",
        "EpisodeNo": "episode",
        "Character": "character",
        "Dialogue": "dialogue",
    })
    # The TBBT source has no episode titles; keep the column but leave it empty.
    .assign(episode_title="")
    .loc[:, ["episode_title", "season", "episode", "character", "dialogue"]]
)
tbbt.head()

Unnamed: 0,episode_title,season,episode,character,dialogue
0,,1,1,Sheldon,photon directed plane two slits either slit ob...
1,,1,1,Leonard,agreed point
2,,1,1,Sheldon,point think good idea tee shirt
3,,1,1,Leonard,excuse
4,,1,1,Others,hang


In [8]:
# Persist the schema-aligned TBBT frame; index=False keeps the row index out of the CSV.
tbbt.to_csv(path_or_buf='raw_data/tbbt_corpus.csv', index=False)