In [None]:
#Installing Packages
!pip install vaderSentiment

In [None]:
#Import modules
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#Add the project's data directory to the module search path.
#NOTE(review): no cell visible here imports a module from that directory - confirm this is still needed
sys.path.append('character_relationship_analysis/data')

#Global seaborn style: white grid background with a black axes border of width 1
sns.set_style('whitegrid', {'axes.linewidth': 1, 'axes.edgecolor':'black'})

In [None]:
#Relationship sentiment data, one CSV per coreference model
#(the "new" model is labelled Char-Coref and the "old" one Long-Doc Coref in the plots below)
new_df = pd.read_csv(
  'character_relationship_analysis/data/sentiments_new_model.csv'
)
old_df = pd.read_csv(
  'character_relationship_analysis/data/sentiments_old_model.csv'
)

#Shared-sentence counts computed without coreference resolution
shared_no_coref = pd.read_csv(
  'character_relationship_analysis/data/shared_sentences_no_coref.csv'
)

In [None]:
def num_interactions(new_coref_df, old_coref_df, no_coref_df):
  '''
  Create dataframe of total interactions captured for each book
  with and without coreference resolution.

  Parameters:
    new_coref_df: frame with 'title' and 'pair' columns (rows labelled 'Char-Coref')
    old_coref_df: frame with 'title' and 'pair' columns (rows labelled 'Long-Doc Coref')
    no_coref_df: frame with 'book' and 'shared_sentences' columns (rows labelled 'No Coref')

  Returns:
    A new frame with columns 'Title', 'Interactions', 'Type'. The coref rows come
    first (two per title found in new_coref_df), followed by the no-coref rows,
    which keep their original index. None of the inputs is modified.
  '''
  coref_sources = (
    (new_coref_df, 'Char-Coref'),
    (old_coref_df, 'Long-Doc Coref'),
  )

  #NOTE(review): titles are matched exactly against new_coref_df's titles. The
  #old-model plot cells handle a title 'Harry Potter Book 1 - old'; if old_coref_df
  #really uses that label, its count here will be 0 for that book - confirm.
  records = []
  for title in np.unique(new_coref_df['title']):
    for coref_df, label in coref_sources:
      pairs = coref_df.loc[coref_df['title'] == title, 'pair']
      #One interaction per row that has a pair (missing pairs are not counted)
      records.append({'Title': title, 'Interactions': pairs.count(), 'Type': label})

  coref_counts = pd.DataFrame(records, columns=['Title', 'Interactions', 'Type'])

  #Built as a separate frame so the caller's no_coref_df does not gain extra columns
  no_coref_counts = pd.DataFrame({
    'Title': no_coref_df['book'],
    'Interactions': no_coref_df['shared_sentences'],
    'Type': 'No Coref',
  })

  return pd.concat([coref_counts, no_coref_counts])

In [None]:
#Total interactions per book for each coreference setting
interaction_df = num_interactions(
  new_coref_df=new_df,
  old_coref_df=old_df,
  no_coref_df=shared_no_coref,
)

In [None]:
#Plotting interaction counts
#Theme is configured before drawing: sns.set() re-applies seaborn's default style
#and palette, so it runs first, and the figure size/font scale only affect figures
#created afterwards
sns.set(rc={"figure.figsize":(8, 6)}, font_scale=1.4)
sns.set_style('whitegrid', {'axes.linewidth': 1, 'axes.edgecolor':'black'})
sns.set_palette(sns.color_palette("tab10"))

bar_plot = sns.barplot(data=interaction_df, x='Title', y='Interactions', hue='Type')
bar_plot.set_xlabel('Book', weight='bold')
bar_plot.set_ylabel('Interactions', weight='bold')
bar_plot.set_title('Interaction Count - Coref vs No Coref', weight='bold')

#Short display names replace the full titles on the x axis.
#NOTE(review): assumes exactly these five books, in the order the bars are drawn - confirm
bar_plot.set_xticklabels(['Ch. & the Choc. Fac', 'Dracula', 'Harry Potter', 'Peter Pan', 'Winnie the Pooh'],
                         rotation=45, horizontalalignment='right')
plt.show()

In [None]:
def cumulative_sentiment(data):
  '''
  Calculate cumulative sentiment scores and the number of
  interactions per character pair.

  The frame is de-duplicated on ('sentence', 'pair', 'sub_sent_vader') and sorted
  by ('pair', 'sent_loc') IN PLACE, then these columns are added:
    vader_sub_sent_cumsum  - running total of 'sub_sent_vader' within each pair
    vader_sentence_cumsum  - running total of 'sent_vader' within each pair
    vader_verb_cumsum      - running total of 'verb_vader' within each pair
    num_ints               - number of remaining rows for the row's pair

  Returns the same (modified) frame that was passed in.
  '''
  data.drop_duplicates(subset=['sentence', 'pair', 'sub_sent_vader'], inplace=True)
  data.sort_values(by=['pair', 'sent_loc'], inplace=True)

  #Source score column -> cumulative column (sub-sentence, sentence, verb)
  cumulative_columns = (
    ('sub_sent_vader', 'vader_sub_sent_cumsum'),
    ('sent_vader', 'vader_sentence_cumsum'),
    ('verb_vader', 'vader_verb_cumsum'),
  )
  for score_col, cumsum_col in cumulative_columns:
    data[cumsum_col] = data.groupby('pair')[score_col].cumsum()

  #Character pair interaction count. map() looks each pair up in the counts and
  #yields NaN for a missing pair instead of raising KeyError
  data['num_ints'] = data['pair'].map(data['pair'].value_counts())

  return data

In [None]:
def top_5(data):
  '''
  Indicator function for determining whether a relationship is
  in the top 5 in terms of most interactions for a given book.

  Adds a boolean 'top_5' column to `data` (in place) and returns `data`.
  A row is flagged only when its pair is among the five most frequent pairs
  of that row's own 'title', so a pair name that also occurs in another book
  is judged separately for each book.
  '''
  top_pairs = set()
  for title, pairs in data.groupby('title')['pair']:
    #value_counts() is ordered by descending frequency
    top_pairs.update((title, pair) for pair in pairs.value_counts().index[:5])

  #Membership is tested on (title, pair) rather than on the pair name alone
  data['top_5'] = [key in top_pairs for key in zip(data['title'], data['pair'])]

  return data

In [None]:
#Processing dataframes: cumulative sentiment columns first, then the top-5 flag
new_df = top_5(cumulative_sentiment(new_df))
old_df = top_5(cumulative_sentiment(old_df))

In [None]:
#Creating dataframe for joint plot of sentiment model distributions
vader_sent_scores = list(new_df['sub_sent_vader'])
roberta_sent_scores = list(new_df['sub_sent_score'])

#One label per score. New lists are built by concatenation so the per-model
#score lists above keep only their own values
models = ['Vader'] * len(vader_sent_scores) + ['RoBERTa'] * len(roberta_sent_scores)
scores = vader_sent_scores + roberta_sent_scores

#Long-format frame: 'Model' is the hue column and 'score' the x column of the
#distribution plot (no leftover empty 'model' column)
sentiment_models = pd.DataFrame({'Model': models, 'score': scores})

In [None]:
#Distribution plots for sentiment analysis model scores
#Theme is configured before drawing: sns.set() re-applies seaborn's default style
#and palette, so it runs first, and the figure size/font scale only affect figures
#created afterwards
sns.set(rc={"figure.figsize":(8, 6)}, font_scale=1.4)
sns.set_style('whitegrid', {'axes.linewidth': 1, 'axes.edgecolor':'black'})
sns.set_palette(sns.color_palette("tab10"))

model_dist = sns.histplot(data=sentiment_models, x='score', hue='Model')
model_dist.set_xlabel('Sentiment Score', weight='bold')
model_dist.set_ylabel('Count', weight='bold')
model_dist.set_title('Sentiment Score Distribution', weight='bold')
plt.show()

In [None]:
#New Model Plots - Sub-sentences
titles = list(np.unique(new_df['title']))

#Theme is configured once, before any drawing: sns.set() re-applies seaborn's
#default style and palette, so the overrides come after it
sns.set(font_scale = 1.4)
sns.set_palette(sns.color_palette("tab10"))
sns.set_style('whitegrid', {'axes.linewidth': 1, 'axes.edgecolor':'black'})

for title in titles:
  #Rows of this book's top-5 pairs. sort_values returns a new frame, so the
  #filtered selection is never sorted in place
  in_book_top_5 = (new_df['title'] == title) & new_df['top_5']
  plot_data = new_df[in_book_top_5].sort_values(by=['pair', 'sent_loc'])

  #Cumulative sentiment plots
  model_dist = sns.lineplot(data=plot_data, x='sent_loc', y='vader_sub_sent_cumsum', hue='pair')
  plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
  model_dist.set(xlabel='Sentence Index', ylabel='Cumulative Sentiment (sub-sentence)', title = f'{title} Relationships - Char-Coref')
  plt.show()

In [None]:
#New Model Plots - Sentences
titles = list(np.unique(new_df['title']))

#Theme is configured once, before any drawing: sns.set() re-applies seaborn's
#default style and palette, so the overrides come after it
sns.set(font_scale = 1.4)
sns.set_palette(sns.color_palette("tab10"))
sns.set_style('whitegrid', {'axes.linewidth': 1, 'axes.edgecolor':'black'})

for title in titles:
  #Rows of this book's top-5 pairs. sort_values returns a new frame, so the
  #filtered selection is never sorted in place
  in_book_top_5 = (new_df['title'] == title) & new_df['top_5']
  plot_data = new_df[in_book_top_5].sort_values(by=['pair', 'sent_loc'])

  #Cumulative sentiment plots
  model_dist = sns.lineplot(data=plot_data, x='sent_loc', y='vader_sentence_cumsum', hue='pair')
  plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
  model_dist.set_xlabel('Sentence Index', weight='bold')
  model_dist.set_ylabel('Cumulative Sentiment (sentence)', weight='bold')
  model_dist.set_title(f'{title} Relationships - Char-Coref', weight='bold')
  plt.show()

In [None]:
#Old Model Plots - Sub-Sentences
titles = list(np.unique(old_df['title']))

#Theme is configured once, before any drawing: sns.set() re-applies seaborn's
#default style and palette, so the overrides come after it
sns.set(font_scale = 1.4)
sns.set_palette(sns.color_palette("tab10"))
sns.set_style('whitegrid', {'axes.linewidth': 1, 'axes.edgecolor':'black'})

for title in titles:
  #Rows of this book's top-5 pairs. sort_values returns a new frame, so the
  #filtered selection is never sorted in place
  in_book_top_5 = (old_df['title'] == title) & old_df['top_5']
  plot_data = old_df[in_book_top_5].sort_values(by=['pair', 'sent_loc'])

  #Display name only: the ' - old' suffix on this title is left out of the plot heading
  display_title = 'Harry Potter Book 1' if title == 'Harry Potter Book 1 - old' else title

  #Cumulative sentiment plots
  model_dist = sns.lineplot(data=plot_data, x='sent_loc', y='vader_sub_sent_cumsum', hue='pair')
  plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
  model_dist.set(xlabel='Sentence Index', ylabel='Cumulative Sentiment (sub-sentence)', title = f'{display_title} Relationships - Long-Doc Coref')
  plt.show()

In [None]:
#Old Model Plots - Sentences
titles = list(np.unique(old_df['title']))

#Theme is configured once, before any drawing: sns.set() re-applies seaborn's
#default style and palette, so the overrides come after it
sns.set(font_scale = 1.4)
sns.set_palette(sns.color_palette("tab10"))
sns.set_style('whitegrid', {'axes.linewidth': 1, 'axes.edgecolor':'black'})

for title in titles:
  #Rows of this book's top-5 pairs. sort_values returns a new frame, so the
  #filtered selection is never sorted in place
  in_book_top_5 = (old_df['title'] == title) & old_df['top_5']
  plot_data = old_df[in_book_top_5].sort_values(by=['pair', 'sent_loc'])

  #Display name only: the ' - old' suffix on this title is left out of the plot heading
  display_title = 'Harry Potter Book 1' if title == 'Harry Potter Book 1 - old' else title

  #Cumulative sentiment plots
  model_dist = sns.lineplot(data=plot_data, x='sent_loc', y='vader_sentence_cumsum', hue='pair')
  plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
  model_dist.set(xlabel='Sentence Index', ylabel='Cumulative Sentiment (sentence)', title = f'{display_title} Relationships - Long-Doc Coref')
  plt.show()