In [4]:
import pandas as pd

In [5]:
# Read the raw script and tag every row with a 1-based running identifier.
df = pd.read_csv("Data/FRIENDS_SCRIPT.csv").assign(
    line_id=lambda frame: range(1, len(frame) + 1)
)

# Put the identifier first; the remaining columns keep their original order.
df = df[["line_id", *df.columns.drop("line_id")]]

# Write the ID-tagged copy to disk so later cells can reload it.
df.to_csv("friends_script_with_id.csv", index=False)


In [10]:
import pandas as pd

df = pd.read_csv("friends_script_with_id.csv")

# Rows whose text contains a colon (missing text counts as "no colon").
has_colon = df["text"].str.contains(":", na=False)

# Rows attributed to the scene-directions pseudo-speaker, case-insensitively.
is_scene_direction = df["speaker"].str.lower() == "scene directions"

# Keep colon-bearing lines that are not scene directions.
filtered = df[has_colon & ~is_scene_direction]

# Write the selection out to its own CSV.
filtered.to_csv("friends_lines_with_colon.csv", index=False)

print(f"Saved {filtered.shape[0]} lines with colon (excluding scene directions).")


Saved 284 lines with colon (excluding scene directions).


In [None]:
# Load the ID-tagged script and the evaluation file.
# NOTE(review): the evaluation file is assumed to carry `line_id` plus an
# `evaluation` column only — confirm, since any other shared column names
# would come out of the merge with _x/_y suffixes.
df = pd.read_csv("friends_script_with_id.csv")
eval_df = pd.read_csv("friends_lines_evaluated.csv")

# Left-join on line_id so every script line is kept; lines without an
# evaluation get NaN. validate="one_to_one" makes pandas raise a MergeError
# if line_id is repeated in either file, instead of silently duplicating
# script rows in the merged output.
merged = df.merge(eval_df, on="line_id", how="left", validate="one_to_one")

# Save the merged dataset
merged.to_csv("friends_script_with_eval.csv", index=False)

print("✅ Merged file saved as friends_script_with_eval.csv")


✅ Merged file saved as friends_script_with_eval.csv


In [None]:

# Reload both inputs and report the facts that decide whether a merge on
# line_id can match: key dtypes, row counts, and how many script ids also
# appear in the evaluation file.
df = pd.read_csv("friends_script_with_id.csv")
eval_df = pd.read_csv("friends_lines_evaluated.csv")

print(f"Script file line_id type: {df['line_id'].dtype}")
print(f"Eval file line_id type: {eval_df['line_id'].dtype}")
print(f"Total lines in script: {len(df)}")
print(f"Total lines evaluated: {len(eval_df)}")
print(f"Overlap count: {df['line_id'].isin(eval_df['line_id']).sum()}")


Script file line_id type: int64
Eval file line_id type: int64
Total lines in script: 67373
Total lines evaluated: 284
Overlap count: 284


In [3]:
# Preview the first ten merged rows that actually carry an evaluation.
merged.dropna(subset=["evaluation"]).head(10)

Unnamed: 0,line_id,text,speaker,season,episode,scene,utterance,EPISODE_ID,evaluation
39,40,"Oh really, so that hysterical phone call I got...",Monica Geller,1,1,1,40,S1E1,ok
55,56,"Oh God... well, it started about a half hour b...",Rachel Green,1,1,1,56,S1E1,ok
85,86,"Oh God, is it 6:30? Buzz him in!",Monica Geller,1,1,2,28,S1E1,ok
88,89,Maybe. Joey: Wait. Your 'not a real date' toni...,Monica Geller,1,1,2,31,S1E1,not ok
331,332,"Yeah, well, word of advice: Bring back the com...",Rachel Green,1,2,1,9,S1E2,ok
671,672,Listen. As someone who's seen more than her fa...,Paula,1,3,4,2,S1E3,ok
724,725,Fantastic! I have one question: How is that po...,Monica Geller,1,3,6,4,S1E3,ok
892,893,"Hey! Here's the birthday boy! Ross, check it o...",Joey Tribbiani,1,4,2,15,S1E4,ok
1112,1113,"Okay, Monica: Right foot red.",Ross Geller,1,4,16,1,S1E4,ok
1116,1117,"Okay, Pheebs: Right hand blue. Good.",Ross Geller,1,4,16,5,S1E4,ok


In [4]:
# Write the merged frame to disk without the positional index column.
merged.to_csv(path_or_buf="friends_script_with_eval.csv", index=False)

In [None]:
# Reload the saved merge and print its column index followed by the first
# five rows, to confirm what was written to disk.
df = pd.read_csv("friends_script_with_eval.csv")
print(df.columns, df.head(n=5), sep="\n")

Index(['line_id', 'text', 'speaker', 'season', 'episode', 'scene', 'utterance',
       'EPISODE_ID', 'evaluation'],
      dtype='object')
   line_id                                               text  \
0        1  There's nothing to tell! He's just some guy I ...   
1        2  C'mon, you're going out with the guy! There's ...   
2        3  All right Joey, be nice. So does he have a hum...   
3        4                           Wait, does he eat chalk?   
4        5                         (They all stare, bemused.)   

            speaker  season  episode  scene  utterance EPISODE_ID evaluation  
0     Monica Geller       1        1      1          1       S1E1        NaN  
1    Joey Tribbiani       1        1      1          2       S1E1        NaN  
2     Chandler Bing       1        1      1          3       S1E1        NaN  
3     Phoebe Buffay       1        1      1          4       S1E1        NaN  
4  Scene Directions       1        1      1          5       S1E1        NaN

In [8]:
df = pd.read_csv("friends_script_with_eval.csv")

# Select the rows whose evaluation is exactly 'not ok'.
not_ok = df.loc[df["evaluation"].eq("not ok")]

# Lift pandas' display limits so every row and the full text are printed.
# These options are global and stay in effect for later cells.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

# Print only the columns needed to read each flagged line.
print(not_ok.loc[:, ["line_id", "speaker", "text"]])


       line_id            speaker  \
88          89      Monica Geller   
2200      2201        Ross Geller   
2423      2424      Chandler Bing   
3672      3673      Fireman No. 2   
4659      4660       Rachel Green   
4661      4662       Rachel Green   
4679      4680        Ross Geller   
4756      4757      Chandler Bing   
4790      4791          Aunt Iris   
4853      4854       Rachel Green   
4924      4925       Rachel Green   
6474      6475       Rachel Green   
10022    10023     Joey Tribbiani   
29626    29627        Ross Geller   
33994    33995       The Croupier   
56131    56132      Monica Geller   
56193    56194      Chandler Bing   
56473    56474        Judy Geller   
56489    56490     Joey Tribbiani   
56634    56635              David   
56636    56637              David   
56683    56684              Sandy   
57049    57050     Joey Tribbiani   
57081    57082      Phoebe Buffay   
57167    57168      Woman At Door   
57190    57191          Amy Green   
5