In [None]:
import sys
sys.path.append('long-doc-coref/src')
sys.path.append('NLP_CW')
sys.path.append('character_relationship_analysis/data')

import pandas as pd
import numpy as np
import spacy
from spacy.tokens import Span
import nltk
import re
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging
import tqdm
import json
import utils
import pickle
nltk.download('punkt')
nltk.download('omw-1.4')

# This will also download the SpanBERT model finetuned for Coreference (by Joshi et al, 2020) from Huggingface
from inference.inference import Inference
from  inference.tokenize_doc import *
from transformers import BertTokenizerFast
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
#BERT tokenizer: fast tokenizer loaded from the pretrained 'bert-base-cased' checkpoint
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [None]:
#Semantic role labelling model: AllenNLP predictor built from the public BERT-based SRL archive (2020.12.15).
#The archive is referenced by URL, so it is presumably fetched over the network on first use -- TODO confirm caching
srl_model = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz")

In [None]:
#Sentiment analysis models:
# - sentiment: AllenNLP predictor built from the Stanford Sentiment Treebank RoBERTa archive (queried by get_sentiment)
# - sent_analyser: VADER SentimentIntensityAnalyzer (queried when the *_vader columns are computed)
sentiment = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/stanford-sentiment-treebank-roberta.2021-03-11.tar.gz")
sent_analyser = SentimentIntensityAnalyzer()

In [None]:
def get_sentiment(string):
  '''
  Scores a string with the notebook-level `sentiment` predictor.

  The first entry of the predictor's 'probs' output is rescaled linearly
  from [0, 1] to [-1, +1]; the notebook treats -1 as negative and +1 as
  positive.

  Args:
    string: text to score.

  Returns:
    The rescaled score as a float.
  '''
  first_class_prob = sentiment.predict(string)['probs'][0]

  #Shift to be centred on zero, then stretch to the [-1, +1] range
  return (first_class_prob - 0.5) * 2

In [None]:
#Semantic role labelling
def perform_srl(coref_output, char_pairs, pair_ids):
  '''
  Runs semantic role labelling over the shared sentences of each character pair.

  Args:
    coref_output: mapping of pair ID -> sequence of entries, where entry[0] is
      stored as the sentence location and entry[1] is the text to label.
    char_pairs: mapping of pair ID -> character list for that pair.
    pair_ids: iterable of pair IDs to process.

  Returns:
    DataFrame with columns 'pair_id', 'char_list', 'srl_output' and 'sent_id',
    one row per labelled entry.

  Raises:
    KeyError: if a pair ID is missing from char_pairs or coref_output.
  '''
  #Parallel lists, one element per labelled sentence
  char_pair_ids = []
  char_lists = []
  srl_outputs = []

  #Index of where first word of sentence occurs in overall document
  sent_ids = []

  for pair_key in pair_ids:
    char_list = char_pairs[pair_key]

    #Iterate through shared sentences of the argument that was passed in
    #(not a notebook-level global), so the function works on any input
    for entry in coref_output[pair_key]:
      sent_idx = entry[0]
      shared_sent = entry[1]

      #Only the first sentence found by the sentence tokenizer is labelled
      sent_tok = nltk.sent_tokenize(shared_sent)
      if not sent_tok:
        #Empty or whitespace-only text: nothing to label
        continue

      char_pair_ids.append(pair_key)
      char_lists.append(char_list)
      sent_ids.append(sent_idx)
      srl_outputs.append(srl_model.predict(sent_tok[0]))

  srl_df = pd.DataFrame(
    {
      'pair_id': char_pair_ids,
      'char_list': char_lists,
      'srl_output': srl_outputs,
      'sent_id': sent_ids,
    },
    columns=['pair_id', 'char_list', 'srl_output', 'sent_id'],
  )

  return srl_df

In [None]:
def args_present_1(tag_list):
  '''
  Checks a sequence of Semantic Role Labelling tags for the ARG0, verb and
  ARG1 begin-tags.

  Args:
    tag_list: tags of one SRL frame.

  Returns:
    True when 'B-ARG0', 'B-ARG1' and 'B-V' all occur in tag_list, else False.
  '''
  #Sequence tags to search for
  required_tags = ('B-ARG0', 'B-ARG1', 'B-V')

  return all(tag in tag_list for tag in required_tags)

In [None]:
def args_present_2(tag_list):
  '''
  Checks a sequence of Semantic Role Labelling tags for the ARG0, verb and
  ARG2 begin-tags.

  Args:
    tag_list: tags of one SRL frame.

  Returns:
    True when 'B-ARG0', 'B-ARG2' and 'B-V' all occur in tag_list, else False.
  '''
  #Sequence tags to search for
  required_tags = ('B-ARG0', 'B-ARG2', 'B-V')

  return all(tag in tag_list for tag in required_tags)

In [None]:
def _replace_regex_breaking_chars(value):
  '''
  Replaces round brackets with '-' and '*', '/' and backslash with a space.
  '''
  for old, new in (('(', '-'), (')', '-'), ('*', ' '), ('/', ' '), ('\\', ' ')):
    value = value.replace(old, new)
  return value


def extract_sentiment(text, char_list, word_list, arg_1_present, sent_loc):
  '''
  Given one frame of Semantic Role Labelling output and the tokenized sentence,
  determines whether characters are present in both the subject (ARG0) and the
  object (ARG1 or ARG2) parts of the text. If so, scores the sentence, the
  sub-sentence spanning subject/verb/object, and the verb.

  Args:
    text: frame description containing bracketed chunks such as
      "[ARG0: ...] [V: ...] [ARG1: ...]".
    char_list: character names to look for (compared case-insensitively).
    word_list: tokens of the sentence the frame belongs to.
    arg_1_present: True to read the object from ARG1, False to read it from ARG2.
    sent_loc: sentence location, copied into the 'sent_loc' column.

  Returns:
    DataFrame with columns 'sent_loc', 'subject', 'object', 'sub_sentence',
    'verb', 'sentence', 'sub_sent_score', 'verb_score' and 'sent_score'.
    It holds one row per ordered pair of distinct characters (char_1, char_2)
    where char_1 occurs in the subject text and char_2 in the object text.
    The frame is empty when no such pair exists, when a required chunk is
    missing from text, when subject and object start at the same position,
    or when no sub-sentence can be located.

  Raises:
    ValueError: if an extracted chunk cannot be found in the joined tokens.
  '''
  columns = ['sent_loc', 'subject', 'object', 'sub_sentence', 'verb', 'sentence', 'sub_sent_score', 'verb_score', 'sent_score']

  #Extracting arguments. Each label is anchored on its opening bracket so that
  #'V: ' cannot match inside '[ARGM-ADV: ...]' and 'ARG0: ' cannot match
  #'[R-ARG0: ...]' / '[C-ARG0: ...]'
  object_label = 'ARG1' if arg_1_present else 'ARG2'
  arg_0_match = re.search(r'\[ARG0: (.+?)\]', text)
  arg_1_match = re.search(r'\[' + object_label + r': (.+?)\]', text)
  verb_match = re.search(r'\[V: (.+?)\]', text)

  if arg_0_match is None or arg_1_match is None or verb_match is None:
    return pd.DataFrame(columns=columns)

  arg_0 = arg_0_match.group(1)
  arg_1 = arg_1_match.group(1)
  verb = verb_match.group(1)

  arg_0_search = arg_0.lower()
  arg_1_search = arg_1.lower()

  #Only extract if both subject and object are characters
  matched_pairs = [
    (char_1, char_2)
    for char_1 in char_list
    for char_2 in char_list
    if char_1.lower() != char_2.lower()
    and char_1.lower() in arg_0_search
    and char_2.lower() in arg_1_search
  ]

  if not matched_pairs:
    return pd.DataFrame(columns=columns)

  words = " ".join(word_list)
  words_lower = " ".join(x.lower() for x in word_list)

  #Character offsets of the first occurrence of each part in the sentence.
  #NOTE(review): these are plain substring searches, so a short verb can be
  #located inside an earlier, longer word
  arg_0_id = words_lower.index(arg_0_search)
  arg_1_id = words_lower.index(arg_1_search)
  verb_tok_id = words.index(verb)

  if arg_0_id == arg_1_id:
    return pd.DataFrame(columns=columns)

  #Sub-sentence runs from whichever part occurs first to whichever occurs last
  order_dict = {arg_0: arg_0_id, arg_1: arg_1_id, verb: verb_tok_id}
  start = min(order_dict, key=order_dict.get)
  end = max(order_dict, key=order_dict.get)

  #The same character replacement is applied to the sentence and to both
  #anchors so they still line up; the anchors are then escaped so any other
  #regex metacharacter ('?', '+', '[', '.', ...) is matched literally
  sentence = _replace_regex_breaking_chars(words)
  search_str = (
    re.escape(_replace_regex_breaking_chars(start))
    + '.+?'
    + re.escape(_replace_regex_breaking_chars(end))
  )

  sub_sent_match = re.search(search_str, sentence, re.IGNORECASE)
  if sub_sent_match is None:
    #e.g. the end anchor only occurs inside the start anchor
    return pd.DataFrame(columns=columns)
  sub_sent = sub_sent_match.group(0)

  #Sentiment Analysis on sub-sentence, verb and sentence. The texts are the
  #same for every matched pair, so each one is scored only once
  sub_sent_score = get_sentiment(sub_sent)
  verb_score = get_sentiment(verb)
  sent_score = get_sentiment(sentence)

  num_pairs = len(matched_pairs)
  char_int_df = pd.DataFrame(
    {
      'sent_loc': [sent_loc] * num_pairs,
      'subject': [pair[0] for pair in matched_pairs],
      'object': [pair[1] for pair in matched_pairs],
      'sub_sentence': [sub_sent] * num_pairs,
      'verb': [verb] * num_pairs,
      'sentence': [sentence] * num_pairs,
      'sub_sent_score': [sub_sent_score] * num_pairs,
      'verb_score': [verb_score] * num_pairs,
      'sent_score': [sent_score] * num_pairs,
    },
    columns=columns,
  )

  return char_int_df

In [None]:
def process_srl(char_pair_list, char_pair_ids, srl_df):
  '''
  Given character pair IDs and a dataframe of SRL outputs, returns a dataframe
  of all relevant character interactions and sentiment scores for each
  interaction.

  Args:
    char_pair_list: list of character pairs; accepted for interface
      compatibility but not read by this function.
    char_pair_ids: iterable of pair IDs to process.
    srl_df: DataFrame with columns 'pair_id', 'char_list', 'srl_output' and
      'sent_id'. Each 'srl_output' is a dict with a 'words' entry and a
      'verbs' list whose items carry 'tags' and 'description'.

  Returns:
    DataFrame concatenating the extract_sentiment results of every SRL frame
    that has ARG0 + verb + ARG1, or failing that ARG0 + verb + ARG2. Row
    indices are not reset.
  '''
  #Character interaction dataframe template; keeps the column layout stable
  #even when no interaction is found
  frames = [pd.DataFrame(columns=['subject', 'object', 'sub_sentence', 'verb', 'sentence', 'sub_sent_score', 'verb_score', 'sent_score'])]

  for char_pair_id in char_pair_ids:

    #Select the rows of this pair once instead of once per column
    pair_rows = srl_df[srl_df['pair_id'] == char_pair_id]

    if len(pair_rows) == 0:
      continue

    #The character list of the first row is used for the whole pair
    char_pair = pair_rows['char_list'].iloc[0]

    #Iterate through SRL output for each character pair
    for sent_loc, srl_output in zip(pair_rows['sent_id'], pair_rows['srl_output']):
      words = srl_output['words']

      for frame in srl_output['verbs']:
        tags = frame['tags']
        text = frame['description']

        #Check that ARG0 and ARG1 are present
        if args_present_1(tags):
          frames.append(extract_sentiment(text, char_pair, words, True, sent_loc))

        #If ARG1 not present, check for ARG0 and ARG2
        elif args_present_2(tags):
          frames.append(extract_sentiment(text, char_pair, words, False, sent_loc))

  #Single concatenation at the end avoids re-copying the accumulated frame on
  #every iteration
  interaction_df = pd.concat(frames)

  return interaction_df

In [None]:
def remove_unicode(data):
  '''
  Removes common unicode escape sequences from text.

  The sequences are the literal backslash-u spellings (a backslash, the
  letter u, then hex digits), not the decoded characters. For the generic
  '\\u00' prefix only the prefix is removed; the two hex digits that follow
  it stay in the text.

  Args:
    data: text to clean.

  Returns:
    The text with every listed sequence removed.
  '''
  #Applied in this order: left/right double quote, right single quote, em dash, then the \u00 prefix
  escape_sequences = ('\\u201c', '\\u201d', '\\u2019', '\\u2014', '\\u00')

  for sequence in escape_sequences:
    data = data.replace(sequence, '')

  return data

In [None]:
#Books to process; each name is used as a directory name in the input paths below
book_list = ['chocolate_factory', 'dracula', 'harry potter book 1', 'peter_pan', 'winnie_the_pooh'] 

#Choose model type (selects the input directory that is read for every book)
model = 'new_model'

#Dataframe to store character relationship data for all books
char_sent_df = pd.DataFrame(columns=['book', 'subject', 'object', 'sub_sentence', 'verb', 'sentence', 'sub_sent_score', 'verb_score', 'sent_score'])

for book in tqdm.tqdm(book_list):
  #Read character sentence pairs.
  #json.load is expected to return a string here (JSON text stored inside a JSON string):
  #re.sub, remove_unicode and json.loads are all applied to its result below
  with open(f'character_relationship_analysis/data/final/shared sentences/{model}/{book}/pair_replace.json') as f:
    coref_doc = json.load(f)
  #Flatten newlines to spaces before the text is parsed
  coref_doc = re.sub('\n', ' ', coref_doc)

  #Remove common unicode escape sequences
  coref_doc = remove_unicode(coref_doc)

  #Reading as dict (keys are the relationship pair IDs, see pair_ids below)
  coref_doc = json.loads(coref_doc)

  #Read character pair IDs (same string-inside-JSON handling as above)
  with open(f'character_relationship_analysis/data/final/shared sentences/{model}/{book}/encoding.json') as f:
    pair_id = json.load(f)
  pair_id = re.sub('\n', ' ', pair_id)

  #Remove common unicode escape sequences
  pair_id = remove_unicode(pair_id)

  #Reading as dict; passed to perform_srl as the pair ID -> character list lookup
  char_pair_dict = json.loads(pair_id)

  #Relationship Pair IDs
  pair_ids = coref_doc.keys()

  #SRL Dataframe: one row per labelled shared sentence
  srl_df = perform_srl(coref_doc, char_pair_dict, pair_ids)

  #List of all character pairs
  char_pairs = np.unique(srl_df['char_list']).tolist()

  #Interaction dataframe
  int_df = process_srl(char_pairs, pair_ids, srl_df)

  #Remove duplicate rows
  int_df.drop_duplicates(inplace=True)

  #Tag every interaction with the book it came from

  int_df['book'] = book

  #Append this book's interactions to the results for all books
  char_sent_df = pd.concat([char_sent_df, int_df])

In [None]:
#Vader sentiment scores: store the VADER 'compound' score of the sub-sentence,
#the full sentence and the verb in three new columns
char_sent_df['sub_sent_vader'] = char_sent_df['sub_sentence'].apply(lambda x: sent_analyser.polarity_scores(x)['compound'])
char_sent_df['sent_vader'] = char_sent_df['sentence'].apply(lambda x: sent_analyser.polarity_scores(x)['compound'])
char_sent_df['verb_vader'] = char_sent_df['verb'].apply(lambda x: sent_analyser.polarity_scores(x)['compound'])

In [None]:
#Sets of character pairs: 'pair' holds the unordered {subject, object} set of each row,
#so rows for (A, B) and (B, A) carry an equal value.
#NOTE(review): the sorted() in the first step has no lasting effect, because the list
#is converted to a set in the second step
char_sent_df['pair'] = char_sent_df.apply(lambda x: sorted([x['subject'], x['object']]),axis=1)
char_sent_df['pair'] = char_sent_df['pair'].apply(lambda x: set(x))

In [None]:
#Map file name to book title.
#The lookup is title_dict[x], so a 'book' value missing from title_dict raises KeyError
title_dict = {'dracula':'Dracula', 'chocolate_factory': 'Charlie and the Chocolate Factory', 'winnie_the_pooh':'Winnie the Pooh', 'peter_pan':'Peter Pan',
              'harry potter book 1':'Harry Potter Book 1'}
char_sent_df['title'] = char_sent_df['book'].apply(lambda x: title_dict[x])

In [None]:
#Remove duplicates and sort: rows count as duplicates when they share the same
#sub_sentence, verb, subject and object; the remaining rows are then ordered by
#title, subject, object and sentence location
char_sent_df = char_sent_df.drop_duplicates(subset=['sub_sentence', 'verb', 'subject', 'object'])
char_sent_df.sort_values(by=['title', 'subject', 'object', 'sent_loc'], inplace=True)

In [None]:
#Save the combined results as CSV without the index column; the file name starts with the model type
char_sent_df.to_csv(f'character_relationship_analysis/data/{model}_all_sentiment_results.csv', index=False)