In [3]:
import pandas as pd
import re

# Read the cleaned dialogue table and report how many rows it holds.
df = pd.read_csv("pomocne_tabulky/friends_lines_cleaned.csv")
print(f"Loaded {len(df.index):,} rows.")

# "Speaker" values that are really stage directions rather than speaker names.
# They are kept verbatim here and normalized before being compared to the data.
direction_speakers_raw = [
    "Ross and Rachel while looking at each other surprised and shocked",
    "Ross with a look of wondering how long this is going to go on on his face",
    "Ross starts talking over her 'do you remember' line",
    "Ross first has a look of 'huh' then changes it to sarcastic happy",
    "Monica about to have a heart attack",
    "Phoebe while cutting a sweet potatoe in the air",
    "Joey groans and gets off the phone",
    "Joey all nervous and looking down and fiddling with his ear",
    "Amy walks over to the couch and sits down next to Rachel",
    "Rachel starts messing up Amy's hair",
]

# Canonical form of a speaker label used for matching.
def norm_s(x: str) -> str:
    """Return ``x`` as text, trimmed, without a trailing colon, case-folded.

    The argument is passed through ``str()`` first, so non-string values
    (for example a missing value) are normalized via their string form.
    """
    trimmed = str(x).strip()
    # Drop one trailing ":" together with any whitespace around it.
    without_colon = re.sub(r"\s*:\s*$", "", trimmed)
    return without_colon.casefold()

# Normalized lookup set of the direction-like labels.
direction_set = set(map(norm_s, direction_speakers_raw))

# Flag every row whose normalized speaker is one of those labels.
speaker_norm = df["speaker"].apply(norm_s)
mask = speaker_norm.isin(direction_set)
print(f"üé¨ Found {mask.sum()} rows with direction-like speakers (normalized match).")

# Show the first few flagged rows (last expression is rendered by the notebook).
preview_columns = ["original_line_id", "speaker", "text"]
df.loc[mask, preview_columns].head(10)


Loaded 67,425 rows.
üé¨ Found 10 rows with direction-like speakers (normalized match).


Unnamed: 0,original_line_id,speaker,text
57171,57168,Ross and Rachel while looking at each other su...,Jill?
57196,57191,Ross with a look of wondering how long this is...,Still me.
57198,57192,Ross starts talking over her 'do you remember'...,"Amy. I'm going to save you some time, ok. All ..."
57265,57258,Ross first has a look of 'huh' then changes it...,Thank you Amy.
57295,57287,Monica about to have a heart attack,"Okay, listen I know you're having a little bit..."
57312,57303,Phoebe while cutting a sweet potatoe in the air,No you're all about the fun.
57331,57321,Joey groans and gets off the phone,The producer from Days left a message on my ma...
57335,57324,Joey all nervous and looking down and fiddling...,Oh.. My sister's raccoon.
57344,57332,Amy walks over to the couch and sits down next...,"Ucch. Uchh In case you hadn't noticed, I'm not..."
57386,57373,Rachel starts messing up Amy's hair,Frizzy frizzy frizzy frizzy!!


In [5]:
# Split every direction-like row into two consecutive rows:
#   1) a "Scene Directions" row whose text is the old speaker string, and
#   2) the spoken line itself, re-attributed to "Unknown".
# This is done with whole-frame operations instead of a Python-level
# iterrows() loop, which re-normalized every speaker one row at a time.

# Work on a 0..n-1 index so the index label doubles as the ordering key below.
base = df.reset_index(drop=True)
is_direction = base["speaker"].apply(norm_s).isin(direction_set)

# Copies of the matched rows; the stage direction moves from `speaker` to `text`.
scene_dir_rows = base.loc[is_direction].copy()
scene_dir_rows["text"] = scene_dir_rows["speaker"]
scene_dir_rows["speaker"] = "Scene Directions"

# The original rows, with the direction-like speaker replaced.
relabelled = base.copy()
relabelled.loc[is_direction, "speaker"] = "Unknown"

# Each inserted row still carries the index label of the row it was copied
# from. Because the inserted rows come first in the concat, a stable sort on
# the index puts each one immediately before its source row.
new_df = (
    pd.concat([scene_dir_rows, relabelled])
    .sort_index(kind="mergesort")
    .reset_index(drop=True)
)

# Exactly one row must have been added per direction-like speaker.
assert len(new_df) - len(df) == int(is_direction.sum()), "unexpected number of inserted rows"
print(f"‚û°Ô∏è Rows before: {len(df):,} | after: {len(new_df):,} | inserted: {len(new_df)-len(df):,}")

# Quick preview for a known case
preview_ids = [57168, 57192, 57332]
preview = new_df.loc[
    new_df["original_line_id"].isin(preview_ids),
    ["original_line_id", "speaker", "text"],
].head(20)
preview



‚û°Ô∏è Rows before: 67,425 | after: 67,435 | inserted: 10


Unnamed: 0,original_line_id,speaker,text
57170,57168,Woman At Door,It's your favorite sister.
57171,57168,Scene Directions,Ross and Rachel while looking at each other su...
57172,57168,Unknown,Jill?
57173,57168,Woman at door in a sing song voice,Amy.
57199,57192,Amy Green,"No, I'm not talking about you. It was your fat..."
57200,57192,Scene Directions,Ross starts talking over her 'do you remember'...
57201,57192,Unknown,"Amy. I'm going to save you some time, ok. All ..."
57351,57332,Chandler Bing,"Oh no thats okay, you're totally right. I don'..."
57352,57332,Scene Directions,Amy walks over to the couch and sits down next...
57353,57332,Unknown,"Ucch. Uchh In case you hadn't noticed, I'm not..."


In [21]:
import pandas as pd
import re

# --- 1) Load the latest cleaned file ---
df = pd.read_csv("pomocne_tabulky/friends_lines_cleaned.csv")
print(f"Loaded {len(df.index):,} rows.")

# --- 2) Direction-like speaker definitions ---
# "Speaker" values that are stage directions, not names. Kept verbatim; they
# are normalized before being matched against the data.
direction_speakers_raw = [
    "Ross and Rachel while looking at each other surprised and shocked",
    "Ross with a look of wondering how long this is going to go on on his face",
    "Ross starts talking over her 'do you remember' line",
    "Ross first has a look of 'huh' then changes it to sarcastic happy",
    "Monica about to have a heart attack",
    "Phoebe while cutting a sweet potatoe in the air",
    "Joey groans and gets off the phone",
    "Joey all nervous and looking down and fiddling with his ear",
    "Amy walks over to the couch and sits down next to Rachel",
    "Rachel starts messing up Amy's hair",
]

def norm_s(x: str) -> str:
    """Build the matching key for a speaker label.

    The value is converted with ``str()``, stripped of surrounding whitespace,
    relieved of a single trailing colon (plus whitespace around it), and
    case-folded.
    """
    label = str(x).strip()
    label = re.sub(r"\s*:\s*$", "", label)
    return label.casefold()

# Normalized lookup set of the direction-like labels.
direction_set = {norm_s(s) for s in direction_speakers_raw}

# --- 3) Identify direction-like speaker rows ---
speaker_norm = df["speaker"].apply(norm_s)
mask = speaker_norm.isin(direction_set)
print(f"üé¨ Found {mask.sum()} direction-like rows.")

# --- 4) Insert scene direction rows ---
# Every flagged row becomes two consecutive rows:
#   1) a "Scene Directions" row whose text is the old speaker string
#      (evaluation = "added_scene_dir"), and
#   2) the spoken line re-attributed to "Unknown"
#      (evaluation = "updated_scene_dir").
# The mask computed above drives the insertion directly, instead of an
# iterrows() loop that normalized every speaker a second time.

# Work on a 0..n-1 index so the index label doubles as the ordering key below.
base = df.reset_index(drop=True)
is_direction = mask.to_numpy()

# Copies of the matched rows; the stage direction moves from `speaker` to `text`.
scene_dir_rows = base.loc[is_direction].copy()
scene_dir_rows["text"] = scene_dir_rows["speaker"]
scene_dir_rows["speaker"] = "Scene Directions"
scene_dir_rows["evaluation"] = "added_scene_dir"

# The original rows, with the direction-like speaker replaced and tagged.
relabelled = base.copy()
relabelled.loc[is_direction, "speaker"] = "Unknown"
relabelled.loc[is_direction, "evaluation"] = "updated_scene_dir"

# Each inserted row still carries the index label of the row it was copied
# from. Because the inserted rows come first in the concat, a stable sort on
# the index puts each one immediately before its source row.
new_df = (
    pd.concat([scene_dir_rows, relabelled])
    .sort_index(kind="mergesort")
    .reset_index(drop=True)
)

# Exactly one row must have been added per direction-like speaker.
assert len(new_df) - len(df) == int(mask.sum()), "unexpected number of inserted rows"
print(f"‚û°Ô∏è Rows before: {len(df):,} | after: {len(new_df):,} | inserted: {len(new_df)-len(df):,}")

# --- 5) Reset utterance numbers per scene ---
# Numbering follows the current row order inside each (season, episode, scene).
# NOTE(review): groupby leaves rows with a missing key out of every group by
# default - confirm season/episode/scene are fully populated.
new_df["utterance"] = new_df.groupby(["season", "episode", "scene"], group_keys=False).cumcount() + 1

# --- 6) Renumber line IDs globally ---
new_df = new_df.sort_values(["season", "episode", "scene", "utterance"]).reset_index(drop=True)
new_df["line_id"] = range(1, len(new_df) + 1)

print("‚úÖ Renumbered utterance and global line_id fields.")

# --- 7) Quick review preview ---
preview_ids = [57168, 57192, 57332]
preview = new_df.loc[
    new_df["original_line_id"].isin(preview_ids),
    ["original_line_id", "speaker", "text", "evaluation", "utterance", "line_id"],
]
print("\nüîç Preview of modified scenes:")
display(preview)

# --- 8) Save the result (the index is not written) ---
new_df.to_csv("pomocne_tabulky/friends_lines_FINAL.csv", index=False)
print("üíæ Saved ‚Üí friends_lines_FINAL.csv")


Loaded 67,425 rows.
üé¨ Found 10 direction-like rows.
‚û°Ô∏è Rows before: 67,425 | after: 67,435 | inserted: 10
‚úÖ Renumbered utterance and global line_id fields.

üîç Preview of modified scenes:


Unnamed: 0,original_line_id,speaker,text,evaluation,utterance,line_id
57170,57168,Woman At Door,It's your favorite sister.,ok_split,26,57171
57171,57168,Scene Directions,Ross and Rachel while looking at each other su...,added_scene_dir,27,57172
57172,57168,Unknown,Jill?,updated_scene_dir,28,57173
57173,57168,Woman at door in a sing song voice,Amy.,ok_split,29,57174
57199,57192,Amy Green,"No, I'm not talking about you. It was your fat...",ok_split,55,57200
57200,57192,Scene Directions,Ross starts talking over her 'do you remember'...,added_scene_dir,56,57201
57201,57192,Unknown,"Amy. I'm going to save you some time, ok. All ...",updated_scene_dir,57,57202
57351,57332,Chandler Bing,"Oh no thats okay, you're totally right. I don'...",ok_split,207,57352
57352,57332,Scene Directions,Amy walks over to the couch and sits down next...,added_scene_dir,208,57353
57353,57332,Unknown,"Ucch. Uchh In case you hadn't noticed, I'm not...",updated_scene_dir,209,57354


üíæ Saved ‚Üí friends_lines_FINAL.csv
