In [1]:
import pandas as pd

In [3]:
df = pd.read_csv("Data/FRIENDS_SCRIPT.csv")

# Check the structure
print(df.head())

# Count number of unique scenes per episode
scenes_per_episode = (
    df.groupby(["season", "episode"])["scene"]
    .nunique()
    .reset_index(name="num_scenes")
)

print(scenes_per_episode.head())


                                                text           speaker  \
0  There's nothing to tell! He's just some guy I ...     Monica Geller   
1  C'mon, you're going out with the guy! There's ...    Joey Tribbiani   
2  All right Joey, be nice. So does he have a hum...     Chandler Bing   
3                           Wait, does he eat chalk?     Phoebe Buffay   
4                         (They all stare, bemused.)  Scene Directions   

   season  episode  scene  utterance EPISODE_ID  
0       1        1      1          1       S1E1  
1       1        1      1          2       S1E1  
2       1        1      1          3       S1E1  
3       1        1      1          4       S1E1  
4       1        1      1          5       S1E1  
   season  episode  num_scenes
0       1        1          15
1       1        2          11
2       1        3          14
3       1        4          16
4       1        5          16


In [5]:
df.groupby(["season", "episode", "scene"])["utterance"].max().head()

season  episode  scene
1       1        1        58
                 2        62
                 3         1
                 4        19
                 5         9
Name: utterance, dtype: int64

In [None]:
scenes_per_episode = (
    df.groupby(["season", "episode"])["scene"]
    .nunique()
    .reset_index(name="num_scenes")
)
scenes_per_episode

Unnamed: 0,season,episode,num_scenes
0,1,1,15
1,1,2,11
2,1,3,14
3,1,4,16
4,1,5,16
...,...,...,...
231,10,14,13
232,10,15,13
233,10,16,14
234,10,17,14


In [9]:
# For each episode, check if utterance numbers are strictly increasing
episode_continuity = (
    df.groupby("EPISODE_ID")["utterance"]
    .apply(lambda x: x.is_monotonic_increasing)
    .reset_index(name="is_monotonic")
)

# Count how many are True / False
print(episode_continuity["is_monotonic"].value_counts())

is_monotonic
False    236
Name: count, dtype: int64


In [10]:
# List the problematic ones
broken_episodes = episode_continuity[~episode_continuity["is_monotonic"]]
print(broken_episodes)

    EPISODE_ID  is_monotonic
0        S10E1         False
1       S10E10         False
2       S10E11         False
3       S10E12         False
4       S10E13         False
..         ...           ...
231       S9E5         False
232       S9E6         False
233       S9E7         False
234       S9E8         False
235       S9E9         False

[236 rows x 2 columns]


In [11]:
avg_scenes_per_season = (
    df.groupby("season")["scene"]
    .nunique()
    .groupby(df["season"])
    .mean()
)
print(avg_scenes_per_season)

season
1    19.5
Name: scene, dtype: float64


In [12]:
print("Min scenes in an episode:", scenes_per_episode["num_scenes"].min())
print("Max scenes in an episode:", scenes_per_episode["num_scenes"].max())
print("Average scenes per episode:", scenes_per_episode["num_scenes"].mean())

Min scenes in an episode: 2
Max scenes in an episode: 29
Average scenes per episode: 13.165254237288135


In [13]:
import pandas as pd

# First, get the number of scenes per episode
scenes_per_episode = (
    df.groupby("EPISODE_ID")["scene"]
    .nunique()
    .reset_index(name="num_scenes")
)

# Then, count how many episodes fall into each "num_scenes" value
scene_distribution = (
    scenes_per_episode["num_scenes"]
    .value_counts()
    .sort_index()
    .reset_index()
    .rename(columns={"index": "num_scenes", "num_scenes": "episode_count"})
)

print(scene_distribution)

    episode_count  count
0               2      1
1               4      4
2               6      6
3               7      4
4               8      6
5               9      6
6              10      7
7              11     18
8              12     29
9              13     48
10             14     30
11             15     30
12             16     28
13             17      8
14             18      3
15             19      2
16             20      2
17             21      2
18             26      1
19             29      1


In [16]:
# Calculate number of scenes per episode
scenes_per_episode = (
    df.groupby("EPISODE_ID")["scene"]
    .nunique()
    .reset_index(name="num_scenes")
)

# Define suspicious scene counts
suspicious_counts = [2, 4, 6, 7, 8, 9, 26, 29]

# Filter the episodes and sort
suspicious_eps = (
    scenes_per_episode[
        scenes_per_episode["num_scenes"].isin(suspicious_counts)
    ]
    .sort_values("num_scenes", ascending=True)
    .reset_index(drop=True)
)

print(suspicious_eps)

   EPISODE_ID  num_scenes
0        S9E8           2
1       S10E4           4
2       S9E18           4
3        S3E2           4
4        S8E9           4
5       S5E19           6
6        S2E6           6
7        S3E9           6
8       S7E21           6
9        S8E1           6
10       S8E6           6
11      S8E18           7
12       S2E5           7
13     S10E10           7
14      S4E16           7
15      S1E10           8
16       S6E9           8
17       S7E8           8
18      S2E22           8
19      S1E18           8
20      S4E19           8
21      S10E8           9
22       S1E6           9
23      S2E10           9
24       S4E8           9
25      S1E19           9
26       S5E1           9
27      S4E24          26
28      S4E21          29


In [30]:
# Episodes with unusually few or many scenes
too_few_or_many = scenes_per_episode[
    (scenes_per_episode["num_scenes"] < 3) | 
    (scenes_per_episode["num_scenes"] > 25)
]

print(too_few_or_many)

    EPISODE_ID  num_scenes
104      S4E21          29
107      S4E24          26
234       S9E8           2


In [32]:
# Filter for episode S9E8
ep = df[df["EPISODE_ID"] == "S9E8"]

# Mask for text containing ":" and speaker not being "Scene Directions"
mask = ep["text"].str.contains(":", na=False) & (ep["speaker"] != "Scene Directions")

# Apply filter
ep_with_colon = ep[mask].sort_values("utterance")

# Display relevant columns
print(ep_with_colon[["speaker", "text", "scene", "utterance"]])


              speaker  \
57408   Phoebe Buffay   
57160  Joey Tribbiani   
57167   Woman At Door   
57190       Amy Green   
57191       Amy Green   
57257       Amy Green   
57286    Rachel Green   
57302   Monica Geller   
57320   Monica Geller   
57323   Phoebe Buffay   
57331   Chandler Bing   
57372   Monica Geller   

                                                                                                                                                                                                                                                                                                                                                     text  \
57408                                                                                                                                                                                                                                                                                                                  No problem! Ne

In [33]:
# Filter for episode S9E8
ep = df[df["EPISODE_ID"] == "S9E8"].sort_values("utterance")

# Save the entire episode to a new CSV file
ep.to_csv("S9E8_full.csv", index=False, encoding="utf-8")

print(f"✅ Saved full episode {ep['EPISODE_ID'].iloc[0]} with {len(ep)} lines to S9E8_full.csv")


✅ Saved full episode S9E8 with 271 lines to S9E8_full.csv


In [35]:
# 1️⃣ Filter episode S9E8 in original order
ep = df[df["EPISODE_ID"] == "S9E8"].copy()

# 2️⃣ Mark lines that likely need manual splitting
ep["needs_split"] = ep["text"].str.contains(":", na=False) & (ep["speaker"] != "Scene Directions")

# 3️⃣ Save clean copy
ep.to_csv("S9E8_full_original_marked.csv", index=False, encoding="utf-8")

# 4️⃣ Show quick summary
n_split = ep["needs_split"].sum()
print(f"✅ Saved S9E8_full_original_marked.csv — {n_split} lines flagged for manual editing.")


✅ Saved S9E8_full_original_marked.csv — 12 lines flagged for manual editing.
