In [1]:
import pandas as pd

# Build a CSV template of episode S9E8 that can be edited by hand.
df = pd.read_csv("pomocne_tabulky/friends_lines_FINAL.csv")

# Keep only the S9E8 rows, working on an independent copy of the selection.
s9e8 = df.loc[df["EPISODE_ID"].eq("S9E8")].copy()

# Drop rows whose speaker is "Scene Directions" so that only spoken
# dialogue lines remain in the template.
s9e8 = s9e8.loc[s9e8["speaker"].ne("Scene Directions")]

# Append three blank helper columns to be filled in during manual editing.
s9e8 = s9e8.assign(manual_flag="", new_scene="", split_marker="")

# Write the template without the pandas index and report where it went.
out_path = "pomocne_tabulky/S9E8_manual_split_template.csv"
s9e8.to_csv(out_path, index=False)
print(f"üíæ Exported template for manual splitting ‚Üí {out_path}")
print("üëâ You can open this in Excel or VS Code and make manual splits.")


üíæ Exported template for manual splitting ‚Üí pomocne_tabulky/S9E8_manual_split_template.csv
üëâ You can open this in Excel or VS Code and make manual splits.


In [3]:
import pandas as pd

# Recount scenes, utterances and line IDs for the hand-edited S9E8 template,
# print a verification summary, and save the result as a new CSV.

# --- 1. Load the (manually edited) template ---
df = pd.read_csv("pomocne_tabulky/S9E8_manual_split_template.csv")

# --- 2. Clean up columns ---
# Header names and EPISODE_ID values are stripped of surrounding whitespace.
df.columns = df.columns.str.strip()

# Parse `scene` as a number, but fail fast if any value does not parse.
# Silently coercing to NaN would make groupby("scene") drop those rows
# (NaN keys are excluded by default), so they would not be numbered and
# would be missing from the summary below, yet still be written to disk.
scene_numeric = pd.to_numeric(df["scene"], errors="coerce")
unparsed_scene = scene_numeric.isna()
if unparsed_scene.any():
    raise ValueError(
        f"{int(unparsed_scene.sum())} row(s) have a missing or non-numeric "
        f"'scene' value; fix the template before recounting. "
        f"Offending values: {df.loc[unparsed_scene, 'scene'].unique()[:10].tolist()}"
    )
df["scene"] = scene_numeric
df["EPISODE_ID"] = df["EPISODE_ID"].astype(str).str.strip()

# --- 3. Sort by season, episode, scene, then original line order ---
df = df.sort_values(["season", "episode", "scene", "original_line_id"]).reset_index(drop=True)

# --- 4. Recalculate utterances per scene ---
# cumcount() is 0-based within each scene, hence the +1.
df["utterance"] = df.groupby("scene", group_keys=False).cumcount() + 1

# --- 5. Recalculate IDs (simple + consistent) ---
# All three ID columns are set to the 1-based row position in the sorted frame.
df["line_id_local"] = df.index + 1
df["line_id"] = df["line_id_local"]
df["line_id_global"] = df.index + 1

# --- 6. Quick verification ---
# The highest utterance number in a scene equals the number of lines in it.
scene_summary = (
    df.groupby("scene")["utterance"]
    .max()
    .reset_index(name="lines_per_scene")
)
print("üìä Lines per scene:")
print(scene_summary)

# Show the first and last utterance of each scene to verify that numbering
# restarts at every scene boundary.
print("\nüîç Scene boundary preview:")
for scene_id, group in df.groupby("scene"):
    first = group.head(1)[["scene", "utterance", "speaker", "text"]]
    last = group.tail(1)[["scene", "utterance", "speaker", "text"]]
    print(f"\nScene {scene_id}:")
    display(pd.concat([first, last]))

# --- 7. Save file ---
df.to_csv("pomocne_tabulky/friends_s9e8_recounted.csv", index=False)
print("\nüíæ Saved ‚Üí friends_s9e8_recounted.csv")


üìä Lines per scene:
   scene  lines_per_scene
0      1              266
1      2               14

üîç Scene boundary preview:

Scene 1:


Unnamed: 0,scene,utterance,speaker,text
0,1,1,Monica Geller,"Hey Hon, could you help me get the plates down?"
265,1,266,Chandler Bing,Oh no no no.. I'll get her. I'm super-compente...



Scene 2:


Unnamed: 0,scene,utterance,speaker,text
266,2,1,Phoebe Buffay,"Hey, does Monica know about her broken plates ..."
279,2,14,Joey Tribbiani,Uhm... Yeah... this uhm... raccoon came in...



üíæ Saved ‚Üí friends_s9e8_recounted.csv


In [4]:
# Sanity checks on the regenerated `line_id` column: it should be strictly
# ordered, free of duplicates, and span 1..len(df).
line_ids = df["line_id"]
print(line_ids.is_monotonic_increasing)  # expected: True
print(line_ids.is_unique)  # expected: True
print(line_ids.min(), line_ids.max(), len(df))


True
True
1 280 280


In [5]:
# Preview the first ten rows of the numbering columns
# (use .tail(10) instead of .head(10) to inspect the end of the episode).
df.loc[:, ["scene", "utterance", "line_id"]].head(10)


Unnamed: 0,scene,utterance,line_id
0,1,1,1
1,1,2,2
2,1,3,3
3,1,4,4
4,1,5,5
5,1,6,6
6,1,7,7
7,1,8,8
8,1,9,9
9,1,10,10


In [6]:
# Number of distinct "last utterance" values across scenes, i.e. how many
# different scene lengths occur.
(
    df.groupby("scene")["utterance"]
    .agg("max")
    .nunique()
)


2

In [7]:
# Reload the complete dataset so that the S9E8 rows can be swapped out below.
df_all = pd.read_csv(filepath_or_buffer="pomocne_tabulky/friends_lines_FINAL.csv")


In [8]:
# Step 1: discard every existing S9E8 row from the full dataset.
# NOTE(review): the recounted file is derived from a template that excluded
# "Scene Directions" rows, so any such rows for S9E8 appear to be removed here
# and not re-added - confirm this is intended.
df_all = df_all.loc[df_all["EPISODE_ID"].ne("S9E8")]

# Step 2: append the recounted S9E8 rows and restore episode order.
df_fixed = pd.read_csv("pomocne_tabulky/friends_s9e8_recounted.csv")
df_all = (
    pd.concat([df_all, df_fixed], ignore_index=True)
    .sort_values(by=["season", "episode", "scene", "utterance"])
    .reset_index(drop=True)
)

# Step 3: renumber the global line IDs as the 1-based row position, so they
# run sequentially across all episodes.
df_all["line_id_global"] = df_all.index + 1


In [9]:
# Post-merge checks: global IDs must not repeat, and the S9E8 row count
# should match the recounted file.
global_ids = df_all["line_id_global"]
print(global_ids.is_unique)

rows_per_episode = df_all["EPISODE_ID"].value_counts()
print(rows_per_episode.loc["S9E8"])


True
280


In [10]:
# Per-episode range and row count of the global line IDs.
(
    df_all
    .groupby("EPISODE_ID")["line_id_global"]
    .aggregate(["min", "max", "count"])
)


Unnamed: 0_level_0,min,max,count
EPISODE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S10E1,61942,62297,356
S10E10,64663,64957,295
S10E11,64958,65301,344
S10E12,65302,65628,327
S10E13,65629,65955,327
...,...,...,...
S9E5,56314,56578,265
S9E6,56579,56913,335
S9E7,56914,57145,232
S9E8,57146,57425,280


In [11]:
# Persist the merged dataset (without the pandas index) and confirm.
df_all.to_csv(path_or_buf="friends_script_CLEAN.csv", index=False)
print("üíæ Merged and saved ‚Üí friends_script_CLEAN.csv")

üíæ Merged and saved ‚Üí friends_script_CLEAN.csv


In [17]:
# Produce a slimmed-down copy of the merged dataset: drop helper columns,
# normalise dtypes, put the global ID first, sort, and save.

# Step 1: load the merged dataset.
df = pd.read_csv("friends_script_CLEAN.csv")

# Step 2: drop bookkeeping / manual-editing columns. Columns that are not
# present are skipped rather than raising.
cols_to_drop = [
    "original_line_id",
    "evaluation",
    "line_id_local",
    "line_id",
    "manual_flag",
    "new_scene",
    "split_marker",
]
df = df.drop(columns=cols_to_drop, errors="ignore")

# Step 3: store season and episode as nullable integers, going through
# float first.
for id_col in ("season", "episode"):
    df[id_col] = df[id_col].astype(float).astype("Int64")

# Step 4: make `line_id_global` the first column, if it exists.
cols = df.columns.tolist()
if "line_id_global" in cols:
    cols = ["line_id_global"] + [name for name in cols if name != "line_id_global"]
    df = df[cols]

# Step 5: order rows by season, episode, scene and utterance.
df = df.sort_values(by=["season", "episode", "scene", "utterance"]).reset_index(drop=True)

# Step 6: write the result as UTF-8 with a BOM and report the final columns.
out_path = "friends_script_CLEAN_MIN.csv"
df.to_csv(out_path, index=False, encoding="utf-8-sig")

print(f"‚úÖ Saved cleaned version ‚Üí {out_path}")
print("üßº Columns now:", df.columns.tolist())

‚úÖ Saved cleaned version ‚Üí friends_script_CLEAN_MIN.csv
üßº Columns now: ['line_id_global', 'text', 'speaker', 'season', 'episode', 'scene', 'utterance', 'EPISODE_ID']
