## Pip Installs and Imports

In [2]:
# pip install convokit

In [18]:
from convokit import Corpus, download
import pandas as pd

## Exploritive Data Anlyses

In [19]:
corpus = Corpus(filename=download("movie-corpus"))

Downloading movie-corpus to /Users/francescasalute/.convokit/downloads/movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


In [20]:
corpus.print_summary_stats()

Number of Speakers: 9035
Number of Utterances: 304713
Number of Conversations: 83097


In [21]:
utterances = corpus.get_utterances_dataframe()

In [22]:
utterances.head()

Unnamed: 0_level_0,timestamp,text,speaker,reply_to,conversation_id,meta.movie_id,meta.parsed,vectors
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
L1045,,They do not!,u0,L1044,L1044,m0,"[{'rt': 1, 'toks': [{'tok': 'They', 'tag': 'PR...",[]
L1044,,They do to!,u2,,L1044,m0,"[{'rt': 1, 'toks': [{'tok': 'They', 'tag': 'PR...",[]
L985,,I hope so.,u0,L984,L984,m0,"[{'rt': 1, 'toks': [{'tok': 'I', 'tag': 'PRP',...",[]
L984,,She okay?,u2,,L984,m0,"[{'rt': 1, 'toks': [{'tok': 'She', 'tag': 'PRP...",[]
L925,,Let's go.,u0,L924,L924,m0,"[{'rt': 0, 'toks': [{'tok': 'Let', 'tag': 'VB'...",[]


In [23]:
utterances.info()

<class 'pandas.core.frame.DataFrame'>
Index: 304713 entries, L1045 to L666256
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   timestamp        0 non-null       object
 1   text             304713 non-null  object
 2   speaker          304713 non-null  object
 3   reply_to         221616 non-null  object
 4   conversation_id  304713 non-null  object
 5   meta.movie_id    304713 non-null  object
 6   meta.parsed      304713 non-null  object
 7   vectors          304713 non-null  object
dtypes: object(8)
memory usage: 20.9+ MB


In [24]:
conversations = corpus.get_conversations_dataframe()

In [25]:
conversations.head()

Unnamed: 0_level_0,vectors,meta.movie_idx,meta.movie_name,meta.release_year,meta.rating,meta.votes,meta.genre
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L1044,[],m0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
L984,[],m0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
L924,[],m0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
L870,[],m0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
L866,[],m0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"


In [26]:
conversations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 83097 entries, L1044 to L666256
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   vectors            83097 non-null  object
 1   meta.movie_idx     83097 non-null  object
 2   meta.movie_name    83097 non-null  object
 3   meta.release_year  83097 non-null  object
 4   meta.rating        83097 non-null  object
 5   meta.votes         83097 non-null  object
 6   meta.genre         83097 non-null  object
dtypes: object(7)
memory usage: 5.1+ MB


In [27]:
conversations.describe()

Unnamed: 0,vectors,meta.movie_idx,meta.movie_name,meta.release_year,meta.rating,meta.votes,meta.genre
count,83097,83097,83097,83097,83097.0,83097,83097
unique,1,617,617,88,62.0,614,290
top,[],m289,casino,1999,7.2,111223,['drama']
freq,83097,338,338,6646,3694.0,338,5027


## Choosing only one meta.genre for movie

In [28]:
# Downloading conversations as an excel 
with pd.ExcelWriter('movies_data.xlsx') as writer:
    conversations.to_excel(writer, sheet_name='Conversations', index=False)

print("Data saved to movies_data.xlsx")

# From here the dataset can be modified manually choosing only one genre from the list in mets.genre
# The choice can be more accurate because one genre is often prominent in a movie and a human can detect it but time consuming

Data saved to movies_data.xlsx


In [29]:
# Other way to proceed is automatization so to choose only first genre
# Function to extract the first genre

def extract_first_genre(meta_genre):
    if isinstance(meta_genre, str) and meta_genre.startswith('[') and meta_genre.endswith(']'):
        genres = meta_genre.strip("[]").replace("'", "").split(", ") # Creates a proper python list
        if genres:
            return genres[0] # Takes only the first element of the list
    return meta_genre

# Apply the function to the 'meta.genre' column
conversations['meta.genre'] = conversations['meta.genre'].apply(extract_first_genre)

conversations.head()

Unnamed: 0_level_0,vectors,meta.movie_idx,meta.movie_name,meta.release_year,meta.rating,meta.votes,meta.genre
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L1044,[],m0,10 things i hate about you,1999,6.9,62847,comedy
L984,[],m0,10 things i hate about you,1999,6.9,62847,comedy
L924,[],m0,10 things i hate about you,1999,6.9,62847,comedy
L870,[],m0,10 things i hate about you,1999,6.9,62847,comedy
L866,[],m0,10 things i hate about you,1999,6.9,62847,comedy


In [31]:
conversations.describe()

Unnamed: 0,vectors,meta.movie_idx,meta.movie_name,meta.release_year,meta.rating,meta.votes,meta.genre
count,83097,83097,83097,83097,83097.0,83097,83097
unique,1,617,617,88,62.0,614,18
top,[],m289,casino,1999,7.2,111223,action
freq,83097,338,338,6646,3694.0,338,18998


## Merge the Dataframes

In [33]:
# Merge the DataFrames on 'conversation_id' from utterances and 'id' from conversations
merged_df = pd.merge(utterances, conversations, left_on='conversation_id', right_on='id', suffixes=('_utterance', '_conversation'))
merged_df.head()

Unnamed: 0,timestamp,text,speaker,reply_to,conversation_id,meta.movie_id,meta.parsed,vectors_utterance,vectors_conversation,meta.movie_idx,meta.movie_name,meta.release_year,meta.rating,meta.votes,meta.genre
0,,They do not!,u0,L1044,L1044,m0,"[{'rt': 1, 'toks': [{'tok': 'They', 'tag': 'PR...",[],[],m0,10 things i hate about you,1999,6.9,62847,comedy
1,,They do to!,u2,,L1044,m0,"[{'rt': 1, 'toks': [{'tok': 'They', 'tag': 'PR...",[],[],m0,10 things i hate about you,1999,6.9,62847,comedy
2,,I hope so.,u0,L984,L984,m0,"[{'rt': 1, 'toks': [{'tok': 'I', 'tag': 'PRP',...",[],[],m0,10 things i hate about you,1999,6.9,62847,comedy
3,,She okay?,u2,,L984,m0,"[{'rt': 1, 'toks': [{'tok': 'She', 'tag': 'PRP...",[],[],m0,10 things i hate about you,1999,6.9,62847,comedy
4,,Let's go.,u0,L924,L924,m0,"[{'rt': 0, 'toks': [{'tok': 'Let', 'tag': 'VB'...",[],[],m0,10 things i hate about you,1999,6.9,62847,comedy
