## Pip Installs and Imports

In [1]:
pip install convokit

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install seaborn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
from convokit import Corpus, download
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import time

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import Binarizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
# Fetch the NLTK resources used further down: tokenizer models for
# word_tokenize, WordNet for the lemmatizer and the English stop-word list.
# 'punkt_tab' is the tokenizer resource that newer NLTK releases look up;
# requesting it next to 'punkt' keeps word_tokenize working on old and new
# NLTK versions alike.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

OSError: [WinError 126] The specified module could not be found. Error loading "C:\Users\joda\AppData\Roaming\Python\Python311\site-packages\torch\lib\shm.dll" or one of its dependencies.

In [None]:
# Import of the movie-corpus
# Fetch the "movie-corpus" dataset through ConvoKit's download helper and
# load it from the location that helper returns.
movie_corpus_path = download("movie-corpus")
corpus = Corpus(filename=movie_corpus_path)

## Exploratory Data Analysis

We save the data in pandas DataFrames and explore it. We drop data that is not labeled correctly.

In [None]:
# Print the corpus' built-in summary statistics.
corpus.print_summary_stats()

In [None]:
# Utterance-level view of the corpus as a DataFrame.
utterances = corpus.get_utterances_dataframe() 

In [None]:
# Preview the first utterance rows.
utterances.head()

In [None]:
# Column names, dtypes and non-null counts of the utterance table.
utterances.info()

In [None]:
# Conversation-level view of the corpus as a DataFrame.
conversations = corpus.get_conversations_dataframe() 

In [None]:
# Preview the first conversation rows.
conversations.head()

In [None]:
# Column names, dtypes and non-null counts of the conversation table.
conversations.info()

In [None]:
# Descriptive statistics of the conversation table.
conversations.describe()

In [None]:
# Speaker-level view of the corpus as a DataFrame (one row per speaker).
characters = corpus.get_speakers_dataframe() 

In [None]:
# Preview the first speaker rows.
characters.head()

In [None]:
# Frequency of each raw gender label, before any normalisation.
characters['meta.gender'].value_counts()

In [None]:
# Normalise the gender labels to lower case so that labels differing only in
# letter case end up in the same category, then show the new frequencies.
gender_lowercased = characters['meta.gender'].str.lower()
characters['meta.gender'] = gender_lowercased
characters['meta.gender'].value_counts()

In [None]:
# List the distinct gender labels currently present in the speaker table.
print(characters['meta.gender'].unique())

In [None]:
# Remove the speakers whose gender label is the placeholder '?', then show
# the remaining label frequencies.
is_unknown_gender = characters['meta.gender'] == '?'
rows_to_drop = characters.index[is_unknown_gender]
characters = characters.drop(index=rows_to_drop)
characters['meta.gender'].value_counts()

## Preparation of the Data


In [None]:
# Merging the dataframes
merged_df = pd.merge(utterances, conversations, left_on='conversation_id', right_on='id', suffixes=('_utterance', '_conversation'))
merged_df.head()

In [None]:
# Attach the speaker (character) columns to every utterance.
# A left join keeps all utterances; utterances whose speaker is missing from
# `characters` get NaN in the character columns.
final_df = merged_df.merge(
    characters,
    how='left',
    left_on='speaker',
    right_on='id',
    suffixes=('_merged', '_character'),
)
final_df.head()

In [None]:
# Number of rows (lines) per speaker.
final_df['speaker'].value_counts() 

In [None]:
# Adding a column with the number of lines spoken by each character
final_df['line_count'] = final_df.groupby('speaker')['speaker'].transform('size')
final_df.head()

In [None]:
# (rows, columns) of the merged frame.
final_df.shape

In [None]:
# Number of missing values per column.
print(final_df.isnull().sum())

In [None]:
# Dropping rows with missing values in the 'meta.gender' column
final_df = final_df.dropna(subset=['meta.gender'])
final_df['meta.gender'].isnull().sum()

In [None]:
# Number of rows (lines) per gender label.
final_df['meta.gender'].value_counts()

In [None]:
# Drop irrelevant / empty columns
final_df = final_df.drop(columns=['timestamp'])
final_df = final_df.drop(columns=['reply_to'])
print(final_df.isnull().sum()) 

In [None]:
# Number of rows (lines) per raw credit-position value.
final_df['meta.credit_pos'].value_counts()

In [None]:
# Label empty values as '10+'
final_df['meta.credit_pos'] = final_df['meta.credit_pos'].apply(lambda p: '10+' if not p in ['1', '2', '3', '4', '5', '6', '7', '8', '9'] else p) 
final_df['meta.credit_pos'].value_counts()

In [None]:
# Adding a column with the length of lines
final_df['lineLength'] = final_df['text'].str.len()             
final_df['wordCountLine'] = final_df['text'].str.count(' ') + 1 
final_df.head()

In [None]:
# Per-speaker medians of the two length features, broadcast back onto every
# row of that speaker.
per_speaker = final_df.groupby('speaker')
final_df['medianLineLength'] = per_speaker['lineLength'].transform('median')
final_df['medianWordCountLine'] = per_speaker['wordCountLine'].transform('median')

In [None]:
# Convert the movie metadata columns to numeric dtypes; values that cannot be
# parsed become missing (errors='coerce').
for numeric_column in ('meta.rating', 'meta.votes'):
    final_df[numeric_column] = pd.to_numeric(final_df[numeric_column], errors='coerce')

# The release year additionally uses the nullable integer dtype so that
# missing years do not force the column to float.
release_year = pd.to_numeric(final_df['meta.release_year'], errors='coerce')
final_df['meta.release_year'] = release_year.astype('Int64')

final_df.info()

In [None]:
# (rows, columns) before the classes are balanced.
final_df.shape

In [None]:
# Correcting Labeling
m_rows = final_df[final_df['meta.gender'] == 'm']

non_m_rows = final_df[final_df['meta.gender'] != 'm']

m_rows_sampled = m_rows.sample(n=70768, random_state=42)

final_df = pd.concat([m_rows_sampled, non_m_rows])

In [None]:
# (rows, columns) after the classes were balanced.
final_df.shape

In [None]:
# Number of rows (lines) per gender label after balancing.
final_df['meta.gender'].value_counts()

## Visualizations of the Data

In [None]:
# Plot gender differences in line count and word count per line
gender_stats = final_df.groupby('meta.gender').agg(
    total_lines=('line_count', 'sum'),
    avg_word_count=('wordCountLine', 'mean')
).reset_index()

print(gender_stats)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

axes[0].bar(gender_stats['meta.gender'], gender_stats['total_lines'], color=['blue', 'pink'])
axes[0].set_title('Total Number of Lines per Gender')
axes[0].set_xlabel('Gender')
axes[0].set_ylabel('Total Number of Lines')

axes[1].bar(gender_stats['meta.gender'], gender_stats['avg_word_count'], color=['blue', 'pink'])
axes[1].set_title('Average Word Count per Line per Gender')
axes[1].set_xlabel('Gender')
axes[1].set_ylabel('Average Word Count per Line')

plt.tight_layout()
plt.show()

In [None]:
# Plot gender differences in line length
plt.figure(figsize=(14, 8))
sns.boxplot(x='meta.gender', y='lineLength', data=final_df, palette={"m": "pink", "f": "blue"})
plt.title('Line Length Distribution by Gender', fontsize=16)
plt.xlabel('Gender', fontsize=14)
plt.ylabel('Line Length', fontsize=14)
plt.ylim(0, 500)  
plt.show()

In [None]:
# Plotting the count of characters
posCredit_gender_counts = final_df.groupby(['meta.credit_pos', 'meta.gender']).size().unstack(fill_value=0)

print(posCredit_gender_counts)

posCredit_gender_counts.plot(kind='bar', figsize=(10, 6), color=['blue', 'pink'])
plt.title('Count of Characters by posCredit and Gender')
plt.xlabel('posCredit')
plt.ylabel('Count of Characters')
plt.legend(title='Gender')
plt.xticks(rotation=0)  
plt.tight_layout()
plt.show()

In [None]:
# Plotting the ratings
plt.figure(figsize=(30, 15))
sns.histplot(final_df['meta.rating'], bins=20, kde=True)
plt.title('Distribution of Movie Ratings')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Plotting the frequences
conversation_lengths = final_df['conversation_id'].value_counts()

plt.figure(figsize=(20, 12))
sns.histplot(conversation_lengths, bins=30, kde=True)
plt.title('Distribution of Conversation Lengths')
plt.xlabel('Number of Utterances per Conversation')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Pairplotting the numerical features
sns.pairplot(final_df[['line_count', 'lineLength',	'wordCountLine']])
plt.show()

## Preprocessing of the Data

In [None]:
# Encode the labels
label_encoder = LabelEncoder()

final_df['gender_encoded'] = label_encoder.fit_transform(final_df['meta.gender'])

category_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(category_mapping)

In [None]:
# Preprocess the text
stop_words = set(stopwords.words('english'))
stop_words.update([',', '.', '?', ':', ';', "'", '“', '”', '!', '’', '...', '....', '--', '_'])

lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one line of dialogue for bag-of-words vectorisation.

    Steps: lower-case, remove digits, replace every run of punctuation,
    underscores and whitespace by a single space, tokenise, drop stop words
    and lemmatise the remaining tokens.

    Punctuation is replaced by a space rather than deleted. Deleting it would
    fuse words that are separated only by punctuation ("wait--what" ->
    "waitwhat") and would turn contractions into tokens the stop-word list
    does not contain ("don't" -> "dont"). Split at the apostrophe instead,
    the parts ("don", "t") are matched against the stop-word list
    individually.

    Parameters
    ----------
    text : str
        Raw text of the line. Anything that is not a string (e.g. a missing
        value) is treated as an empty line.

    Returns
    -------
    str
        The remaining lemmatised tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    # \W also matches whitespace, so this collapses whitespace runs as well.
    text = re.sub(r'[\W_]+', ' ', text)
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

In [None]:
# Run the preprocessing on every line; the cleaned text is stored next to the
# raw text in 'processed_text'.
final_df['processed_text'] = final_df['text'].map(preprocess_text)

## Pre - Models

Vectorization

In [None]:
# Vectorize the text
vectorizer = TfidfVectorizer(max_features=5000) 
X_text = vectorizer.fit_transform(final_df['processed_text'])

Encoding

In [None]:
# Target vector: integer-encoded gender label of every line.
le = LabelEncoder()
encoded_gender = le.fit_transform(final_df['meta.gender'])
final_df['gender_label'] = encoded_gender
y = final_df['gender_label']

In [None]:
# Integer-encode the credit-position bucket.
# NOTE(review): LabelEncoder numbers the classes in sorted order, and for
# strings that is presumably lexicographic ('1', '10+', '2', ...), so the
# codes would not follow the numeric order of the positions - confirm whether
# that matters for the models below.
credit_pos_encoder = LabelEncoder()
encoded_positions = credit_pos_encoder.fit_transform(final_df['meta.credit_pos'])
final_df['encoded_credit_pos'] = encoded_positions

Train the Model

In [None]:
# Shuffle the rows once with a fixed seed and renumber them.
final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)

# The TF-IDF matrix and the label vector were created before this shuffle, so
# their rows follow the old row order. Rebuild both from the shuffled frame;
# otherwise the text features, the additional features and the labels of one
# matrix row would belong to different lines.
X_text = vectorizer.transform(final_df['processed_text'])
y = final_df['gender_label']

# Additional numeric features, taken from the same (shuffled) frame.
X_other = final_df[['line_count', 'encoded_credit_pos', 'medianLineLength', 'medianWordCountLine']]

# Stack text features and additional features column-wise into one sparse
# feature matrix.
X_other_sparse = csr_matrix(X_other.values)
X_combined = hstack([X_text, X_other_sparse])

In [None]:
# Hold out 20 % of the lines for testing (fixed seed for reproducibility).
# NOTE(review): the split is made per line, so lines of the same speaker can
# land in both parts while some features are per-speaker values
# (line_count, medians) - confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42,
)

## Naive Bayes

In [None]:
# Naive Bayes Classifier
start_time = time.time()
nb_clf = MultinomialNB()
nb_clf.fit(X_train, y_train)

nb_y_pred = nb_clf.predict(X_test)

accuracy = accuracy_score(y_test, nb_y_pred)

print(f"Accuracy: {accuracy:.2f}")

end_time = time.time()

total_time = end_time - start_time
print(f"Total Execution Time: {total_time} seconds")

In [None]:
# Fit a fresh TF-IDF vectorizer on the current final_df, so the matrix rows
# follow the frame's present row order.
vectorizer = TfidfVectorizer(max_features=5000)
X_text_1 = vectorizer.fit_transform(final_df['processed_text'])

# Turn the TF-IDF weights into 0/1 "term present" indicators.
# Binarizer works on sparse matrices directly, so the matrix is not expanded
# with .toarray(); a dense rows x 5000 array would need a lot of memory
# without changing the result.
binarizer = Binarizer()
X_binary = binarizer.fit_transform(X_text_1)

In [None]:
# Feature matrix for the Bernoulli model: binarized term indicators plus the
# additional numeric features.
X_other_1 = final_df[['line_count', 'encoded_credit_pos', 'medianLineLength', 'medianWordCountLine']]

X_other_sparse_1 = csr_matrix(X_other_1.values)
# Use the binarized text features (X_binary); previously the raw TF-IDF
# matrix was stacked and X_binary was never used. csr_matrix() accepts dense
# as well as sparse input, so this works for either form of X_binary.
X_combined_1 = hstack([csr_matrix(X_binary), X_other_sparse_1])

# Read the labels from the same frame the features were built from, so that
# features and labels are guaranteed to be in the same row order.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X_combined_1, final_df['gender_label'], test_size=0.2, random_state=42
)

In [None]:
# Bernoulli Naive Bayes Classifier
start_time = time.time()

bnb = BernoulliNB()

bnb.fit(X_train_1, y_train_1)

y_pred = bnb.predict(X_test_1)

accuracy = accuracy_score(y_test_1, y_pred)

print(f"Accuracy: {accuracy:.2f}")

end_time = time.time()

total_time = end_time - start_time
print(f"Total Execution Time: {total_time} seconds")

## Logistic Regression

In [None]:
# Loogistic Regression Classifier
start_time = time.time()

lr_clr = LogisticRegression(solver = 'lbfgs', n_jobs=-1) 
lr_clr.fit(X_train, y_train)

lr_y_pred = lr_clr.predict(X_test)

accuracy = accuracy_score(y_test, lr_y_pred)

print(f"Accuracy: {accuracy:.2f}")

end_time = time.time()

total_time = end_time - start_time
print(f"Total Execution Time: {total_time} seconds")

## Random Forest

In [None]:
# Random Forest Classifier
start_time = time.time()

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

rf_y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, rf_y_pred)

print(f"Accuracy: {accuracy:.2f}")

end_time = time.time()

total_time = end_time - start_time
print(f"Total Execution Time: {total_time} seconds")

## Cross-Validation

In [None]:
# Cross-validation
nb_scores = cross_val_score(nb_clf, X_combined, y, cv=5, scoring='accuracy')
print("Cross-validation scores for Multinomial Naive Bayes:", nb_scores)
print("Mean accuracy:", nb_scores.mean())

bnb_scores = cross_val_score(bnb, X_combined_1, y, cv=5, scoring='accuracy')
print("Cross-validation scores for Bernoulli Naive Bayes:", bnb_scores)
print("Mean accuracy:", bnb_scores.mean())

lr_scores = cross_val_score(lr_clr, X_combined, y, cv=5, scoring='accuracy')
print("Cross-validation scores for Logistic Regression:", lr_scores)
print("Mean accuracy:", lr_scores.mean())

rf_scores = cross_val_score(rf, X_combined, y, cv=5, scoring='accuracy')
print("Cross-validation scores for Random Forest:", rf_scores)
print("Mean accuracy:", rf_scores.mean())

plt.figure(figsize=(12, 6))
plt.plot(range(1, 6), nb_scores, label='Multinomial Naive Bayes', marker='o')
plt.plot(range(1, 6), bnb_scores, label='Bernoulli Naive Bayes', marker='o')
plt.plot(range(1, 6), lr_scores, label='Logistic Regression', marker='o')
plt.plot(range(1, 6), rf_scores, label='Random Forest', marker='o')
plt.title('Cross-validation Scores')
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.legend()
plt.grid()
plt.show()




In [None]:
# Plotting the cross-validation scores
all_scores = [nb_scores, bnb_scores, lr_scores, rf_scores]

plt.figure(figsize=(10, 6))
plt.boxplot(all_scores, labels=['Multinomial Naive Bayes', 'Bernoulli Naive Bayes', 'Logistic Regression', 'Random Forest'])
plt.title('Cross-validation Scores')
plt.xlabel('Classifier')
plt.ylabel('Accuracy')
plt.grid()
plt.show()