In [1]:
import pandas as pd
from pathlib import Path
import kagglehub

## Configs

In [40]:
# Destination files for the parallel corpus: one source line in SRC_OUT for
# every target line in TGT_OUT.
SRC_OUT, TGT_OUT = "src.txt", "tgt.txt"

# How many of the most recent non-Michael turns are joined into each source
# line. A value of 0 ends up using the whole accumulated scene context.
CONTEXT_WINDOW = 4

# When True, the rolling context is emptied after every Michael line, so each
# pair only sees what was said since his previous reply.
RESET_AFTER_MICHAEL = False

## Importing the dataset (from Kaggle)

In [2]:
# Download latest version; the returned value is the local directory that
# contains the dataset files.
path = kagglehub.dataset_download("nasirkhalid24/the-office-us-complete-dialoguetranscript")

# Build the CSV location with pathlib's "/" operator rather than string
# concatenation: it uses the right separator and also works if `path` is
# already a Path object.
data_path = Path(path) / "The-Office-Lines-V4.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,season,episode,title,scene,speaker,line,Unnamed: 6
0,1,1,Pilot,1,Michael,All right Jim. Your quarterlies look very good...,
1,1,1,Pilot,1,Jim,"Oh, I told you. I couldn't close it. So...",
2,1,1,Pilot,1,Michael,So you've come to the master for guidance? Is ...,
3,1,1,Pilot,1,Jim,"Actually, you called me in here, but yeah.",
4,1,1,Pilot,1,Michael,"All right. Well, let me show you how it's done.",


In [None]:
# Work on a copy, discard rows that lack a speaker or a line, then normalise
# both text columns to whitespace-trimmed strings.
df = (
    df.copy()
    .dropna(subset=["speaker", "line"])
    .assign(
        speaker=lambda frame: frame["speaker"].astype(str).str.strip(),
        line=lambda frame: frame["line"].astype(str).str.strip(),
    )
)

## Creating the datasets

In [41]:
def fmt_utt(speaker, line):
    """Render one utterance as ``[SPEAKER] text``.

    Args:
        speaker: Speaker name; it is upper-cased inside the brackets.
        line: The spoken text, inserted unchanged after the tag.

    Returns:
        The tagged utterance string.
    """
    tag = speaker.upper()
    return "[{}] {}".format(tag, line)

# Baseline pair builder. Rows are walked scene by scene so that context never
# crosses a scene boundary. Every non-Michael utterance is appended to the
# scene's running context; every Michael line that has at least one preceding
# non-Michael utterance in the scene yields one (src, tgt) pair. Michael's own
# lines are never added to the context.
group_keys = ["season", "episode", "scene"]
pairs = []  # list of (src, tgt) tuples

for _, scene_rows in df.groupby(group_keys, sort=False):
    heard_so_far = []  # formatted non-Michael utterances seen in this scene

    for _, row in scene_rows.iterrows():
        speaker_name = row["speaker"]
        utterance = row["line"]
        is_michael = speaker_name.strip().lower() == "michael"

        if not is_michael:
            heard_so_far.append(fmt_utt(speaker_name, utterance))
            continue

        if not heard_so_far:
            # Michael opens the scene: nothing to condition on, so no pair.
            continue

        # Source = the most recent CONTEXT_WINDOW context entries, space-joined.
        recent = heard_so_far[-CONTEXT_WINDOW:]
        pairs.append((" ".join(recent), fmt_utt("MICHAEL", utterance)))

print(f"Built {len(pairs)} src–tgt pairs.")

Built 8949 src–tgt pairs.


In [42]:
# Refined pair builder. Compared with a plain per-scene walk it:
#   * merges back-to-back lines of the same non-Michael speaker into one turn,
#   * skips a Michael line whose source string equals the one emitted just
#     before it in the same scene (e.g. back-to-back Michael lines),
#   * optionally clears the context after every Michael line.
pairs = []
group_keys = [c for c in ["season", "episode", "scene"] if c in df.columns]
if group_keys:
    scene_groups = (g for _, g in df.groupby(group_keys, sort=False))
else:
    # No scene columns available: treat the whole frame as a single group.
    # This avoids adding a helper column to the shared `df`, which would leak
    # state into every later cell.
    scene_groups = [df]

for g in scene_groups:
    context = []            # rolling context of non-Michael turns (merged per speaker)
    last_speaker = None     # most recent non-Michael speaker, used for merging
    last_src = None         # last emitted source string in this scene

    # Iterate the two needed columns directly; this yields the same values as
    # iterrows() without constructing a Series object for every row.
    for spk, line in zip(g["speaker"], g["line"]):
        spk = spk.strip()
        line = line.strip()

        if spk.lower() == "michael":
            if context:
                # A falsy CONTEXT_WINDOW means "use the whole context".
                ctx = context[-CONTEXT_WINDOW:] if CONTEXT_WINDOW else context
                src = " ".join(ctx).strip()
                if src and src != last_src:
                    tgt = fmt_utt("MICHAEL", line)
                    pairs.append((src, tgt))
                    last_src = src
            # reset context after Michael so each pair only uses turns since his last reply
            # NOTE(review): last_src is not cleared here, so with the reset
            # enabled a prompt that repeats the previous one verbatim is also
            # skipped -- confirm that this is intended.
            if RESET_AFTER_MICHAEL:
                context, last_speaker = [], None

        else:
            # Merge consecutive lines from the same non-Michael speaker.
            # last_speaker is untouched by a Michael line unless the context
            # was reset, so lines by one speaker on either side of a Michael
            # line are merged into a single turn as well.
            if last_speaker == spk and context:
                context[-1] = context[-1] + " " + line
            else:
                context.append(fmt_utt(spk, line))
            last_speaker = spk

print(f"Built {len(pairs)} src–tgt pairs.")

Built 8863 src–tgt pairs.


In [43]:
# ---- write files (aligned 1–to–1) ----
def _single_line(text):
    """Return `text` stripped, with internal line breaks replaced by spaces.

    The two output files are aligned by line number, so an utterance with an
    embedded line break would otherwise occupy several physical lines and
    shift every following pair out of alignment. Text without line breaks is
    returned exactly as `text.strip()`.
    """
    return " ".join(text.strip().splitlines())


with open(SRC_OUT, "w", encoding="utf-8") as fs, open(TGT_OUT, "w", encoding="utf-8") as ft:
    for src, tgt in pairs:
        # Exactly one physical line per pair in each file.
        fs.write(_single_line(src) + "\n")
        ft.write(_single_line(tgt) + "\n")

print(f"Wrote {SRC_OUT} and {TGT_OUT} (each with {len(pairs)} lines).")

Wrote src.txt and tgt.txt (each with 8863 lines).


In [50]:
# Character count of the longest utterance in the `line` column.
max(len(text) for text in df["line"])

1154