In [55]:
# %%
import pandas as pd
import re

# Read the raw script table from disk and report how many rows it holds.
raw_path = "pomocne_tabulky/friends_script_with_eval.csv"
df = pd.read_csv(raw_path)
print(f"Loaded {len(df):,} rows.")

# A missing evaluation is treated as "ok"; the incoming line_id is copied
# into a second column so the original identifier stays available.
df = df.assign(
    evaluation=df["evaluation"].fillna("ok"),
    original_line_id=df["line_id"],
)

# Literal speaker labels (including the trailing colon) that mark the start
# of a new speaker inside a single text cell. They are matched verbatim, so
# spelling, casing and spacing variants are listed individually.
speaker_patterns = [
    "Joey:",
    "Susan:",
    "chandler:",
    "Chandler:",
    "Chandler :",
    "Monica:",
    "Woman at door in a sing song voice:",
    "Ross and Rachel while looking at each other surprised and shocked:",
    "Ross with a look of wondering how long this is going to go on on his face:",
    "Ross starts talking over her 'do you remember' line:",
    "Ross first has a look of 'huh' then changes it to sarcastic happy:",
    "Monica about to have a heart attack:",
    "Phoebe while cutting a sweet potatoe in the air:",
    "Joey groans and gets off the phone:",
    "Joey all nervous and looking down and fiddling with his ear:",
    "Amy walks over to the couch and sits down next to Rachel:",
    "Rachel starts messing up Amy's hair:",
    "Everyone almost simultaneously except Ross:",
    "TV:",
    "Olivia:",
    "Rachel:",
    "2nd Customer:",
    "3rd Customer:",
    "Ross:",
    "Phoebe:",
    "Charlie:",
    "Bitter lady:",
    "Frank Jr. :",
    "Phoebe :"
]


Loaded 67,373 rows.


In [56]:
# %%
# Single capturing group that alternates over every known speaker label;
# each label is escaped so its punctuation is matched literally.
speaker_alternation = "|".join(map(re.escape, speaker_patterns))
pattern_regex = re.compile(f"({speaker_alternation})")

def text_ok(evaluation: str) -> bool:
    """Return True when ``evaluation`` marks a line as needing no fix.

    The value is stringified, trimmed and lower-cased before comparison, so
    "ok" as well as the null-like spellings "none", "nan" and the empty
    string are all accepted.
    """
    normalized = str(evaluation).strip().lower()
    for accepted in ("ok", "none", "nan", ""):
        if normalized == accepted:
            return True
    return False


In [58]:
# %%
def _build_row(row, text, speaker, utterance, evaluation):
    """Return one output record, copying the identifying columns from ``row``.

    Key order is fixed so the resulting DataFrame always has the same
    column order regardless of which branch produced the record.
    """
    return {
        "original_line_id": row["original_line_id"],
        "text": text,
        "speaker": speaker,
        "season": row["season"],
        "episode": row["episode"],
        "scene": row["scene"],
        "utterance": utterance,
        "EPISODE_ID": row["EPISODE_ID"],
        "evaluation": evaluation,
    }


def _split_by_speaker_labels(text, default_speaker, label_regex):
    """Split ``text`` into ``(speaker, segment)`` pairs at embedded labels.

    Parameters
    ----------
    text : str
        Raw text of one row.
    default_speaker :
        Speaker recorded on the row; owns any text before the first label.
    label_regex : re.Pattern
        Compiled pattern whose group 1 is the matched label (with colon).

    Returns
    -------
    list[tuple] | None
        ``None`` when no label occurs in ``text``. Otherwise the non-empty
        segments in order; this list can be empty when the text holds
        nothing but labels.
    """
    matches = list(label_regex.finditer(text))
    if not matches:
        return None

    parts = []

    # Text in front of the first label still belongs to the row's own speaker.
    leading = text[:matches[0].start()].strip()
    if leading:
        parts.append((default_speaker, leading))

    for i, m in enumerate(matches):
        # A label owns everything up to the next label (or the end of text).
        seg_end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        segment = text[m.end():seg_end].strip()
        if segment:
            # Drop the colon and surrounding spaces: "Frank Jr. :" -> "Frank Jr."
            parts.append((m.group(1).strip(":").strip(), segment))

    return parts


new_rows = []
prev_scene_key = None
utterance_shift = 0  # extra utterances inserted so far in the current scene

for _, row in df.iterrows():
    text = str(row["text"])
    speaker = row["speaker"]
    utterance = int(row["utterance"])

    # Reset the shift whenever the scene changes. The key includes season and
    # episode so that two consecutive episodes whose adjacent scenes share a
    # number do not leak the shift from one episode into the next.
    scene_key = (row["season"], row["episode"], row["scene"])
    if scene_key != prev_scene_key:
        utterance_shift = 0
    prev_scene_key = scene_key

    # OK lines are kept as they are (only renumbered by the current shift).
    if text_ok(row["evaluation"]):
        new_rows.append(
            _build_row(row, text.strip(), speaker, utterance + utterance_shift, "ok")
        )
        continue

    # Not-OK lines may contain several speaker segments. The regex compiled
    # once in the earlier cell is reused instead of recompiling it per row.
    parts = _split_by_speaker_labels(text, speaker, pattern_regex)

    if parts is None:
        # No label found: nothing to split, but the row is still tagged
        # 'ok_split' like every other processed not-OK line.
        new_rows.append(
            _build_row(row, text.strip(), speaker, utterance + utterance_shift, "ok_split")
        )
        continue

    # Emit one row per segment with consecutive utterance numbers.
    for offset, (segment_speaker, segment_text) in enumerate(parts):
        new_rows.append(
            _build_row(
                row,
                segment_text,
                segment_speaker,
                utterance + utterance_shift + offset,
                "ok_split",
            )
        )

    # Later rows of the scene move down by the number of rows added. When the
    # text consisted of labels only, `parts` is empty: no row is emitted and
    # the shift decreases by one, closing the gap left by the dropped line.
    utterance_shift += len(parts) - 1


In [59]:
# Build the cleaned frame and put the rows into script order before numbering.
sort_keys = ["season", "episode", "scene", "utterance"]
new_df = pd.DataFrame(new_rows).sort_values(sort_keys).reset_index(drop=True)

# Global ids run 1..N over the whole frame; local ids restart at 1 within
# each EPISODE_ID and follow the row order established by the sort above.
new_df = new_df.assign(
    line_id_global=range(1, len(new_df) + 1),
    line_id_local=new_df.groupby("EPISODE_ID", group_keys=False).cumcount() + 1,
)
# line_id mirrors the per-episode numbering.
new_df["line_id"] = new_df["line_id_local"]

print("‚úÖ Line IDs renumbered per episode.")


‚úÖ Line IDs renumbered per episode.


In [60]:
# Write the complete cleaned table.
path_clean = "pomocne_tabulky/friends_lines_cleaned.csv"
new_df.to_csv(path_clean, index=False)

# Write only the rows tagged 'ok_split' to a second file for manual review.
path_split = "pomocne_tabulky/friends_lines_split_only.csv"
split_df = new_df[new_df["evaluation"] == "ok_split"]
split_df.to_csv(path_split, index=False)

# Summary of row counts and output locations.
summary_lines = [
    "‚úÖ Cleaning complete.",
    f"   Original rows: {len(df)}",
    f"   New rows:      {len(new_df)}",
    f"   Split lines:   {len(split_df)}",
    f"üíæ Output saved ‚Üí {path_clean}",
    f"üíæ Split-only saved ‚Üí {path_split}",
]
print("\n".join(summary_lines))


‚úÖ Cleaning complete.
   Original rows: 67373
   New rows:      67425
   Split lines:   102
üíæ Output saved ‚Üí pomocne_tabulky/friends_lines_cleaned.csv
üíæ Split-only saved ‚Üí pomocne_tabulky/friends_lines_split_only.csv


In [61]:
# Reload the review subset from disk and count output rows per original line.
# NOTE: every row tagged 'ok_split' is in this file, so ids with a count of 1
# (a single segment) are included in the total printed below.
split_only_path = "pomocne_tabulky/friends_lines_split_only.csv"
split_df = pd.read_csv(split_only_path)
split_counts = split_df["original_line_id"].value_counts()
print(len(split_counts), "original lines were split")
print(split_counts.iloc[:5])



50 original lines were split
original_line_id
57168    3
60319    3
60653    3
10023    2
2201     2
Name: count, dtype: int64


In [62]:
# Print every reviewed original line followed by the segments it produced,
# one "speaker: text" entry per output row.
for orig_id, segments in split_df.groupby("original_line_id"):
    print(f"\nüü¶ Original line ID: {orig_id}")
    for who, said in zip(segments["speaker"], segments["text"]):
        print(f"  üë§ {who}: {said}")



üü¶ Original line ID: 89
  üë§ Monica Geller: Maybe.
  üë§ Joey: Wait. Your 'not a real date' tonight is with Paul the Wine Guy?

üü¶ Original line ID: 2201
  üë§ Ross Geller: Oh, I uh, just came by to pick up my skull. Well, not mine, but...
  üë§ Susan: Come in.

üü¶ Original line ID: 10023
  üë§ Joey Tribbiani: I know.
  üë§ chandler: Now we can finally watch Green Acres the way it was meant to be seen.

üü¶ Original line ID: 56194
  üë§ Chandler Bing: DO NOT DISTURB DO NOT DISTURB!

üü¶ Original line ID: 57168
  üë§ Woman At Door: It's your favorite sister.
  üë§ Ross and Rachel while looking at each other surprised and shocked: Jill?
  üë§ Woman at door in a sing song voice: Amy.

üü¶ Original line ID: 57191
  üë§ Amy Green: No, he was this creepy guy from high school who had this huge crush on her since like the ninth grade.
  üë§ Ross with a look of wondering how long this is going to go on on his face: Still me.

üü¶ Original line ID: 57192
  üë§ Amy Green: