In [None]:
!git clone https://github.com/jblack97/long-doc-coref.git
!pip install  allennlp==2.4.0 allennlp-models==2.4.0
!pip install spacy==3.2
!python -m spacy download en_core_web_lg

In [None]:
import sys
sys.path.append('long-doc-coref/src')
sys.path.append('NLP_CW')
sys.path.append('character_relationship_analysis/scripts')

import pandas as pd
import numpy as np
import spacy
from spacy.tokens import Span
import nltk
from nltk.tokenize import sent_tokenize
import re
from google.colab import drive
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging
import tqdm
import json
import utils
import pickle
drive.mount('/content/gdrive')
nltk.download('punkt')
nltk.download('omw-1.4')

# This will also download the SpanBERT model finetuned for Coreference (by Joshi et al, 2020) from Huggingface
from inference.inference import Inference
from  inference.tokenize_doc import *
from transformers import BertTokenizerFast

In [None]:
#Text files to analyse, one per book (names are relative to data/texts)
file_list = [
    'peter_pan.txt',
    'winnie_the_pooh.txt',
    'harry_potter_1.txt',
    'dracula.txt',
    'charlie_and_the_chocolate_factory.txt',
]

In [None]:
#Lookup from text file name to the human-readable book title
file_to_title = {
    'dracula.txt': 'Dracula',
    'charlie_and_the_chocolate_factory.txt': 'Charlie and the Chocolate Factory',
    'winnie_the_pooh.txt': 'Winnie the Pooh',
    'peter_pan.txt': 'Peter Pan',
    'harry_potter_1.txt': 'Harry Potter Book 1',
}

In [None]:
#Bert tokenizer (cased 'bert-base-cased' vocabulary, fast implementation).
#It is handed to get_tokenized_doc inside process_chars to split each
#character name into tokens.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [None]:
#Loading spacy model (the en_core_web_lg pipeline downloaded in the setup cell)
nlp = spacy.load("en_core_web_lg")
#Raise the pipeline's maximum document length to 5,000,000 characters, since
#each book is passed to nlp() as one single string further down
nlp.max_length = 5000000

In [None]:
#Book file names, in the order they were processed
books = []

#Top-20 PERSON entities per book, aligned index-for-index with `books`
char_lists = []

for book in tqdm.tqdm(file_list):

  books.append(book)

  #Load the raw text and put it on a single line
  with open(f'character_relationship_analysis/data/texts/{book}') as f:
    doc = f.read()
  doc = doc.replace('\n', ' ')

  #Named entity extraction over the whole book
  ner = nlp(doc)

  #One row per entity mention: text, label and character offsets
  ner_df = pd.DataFrame(
      [(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in ner.ents],
      columns=['entity', 'entity_type', 'start_idx', 'end_idx'])

  #Title-case entity names (e.g. 'PETER' -> 'Peter', "Peter's" -> "Peter'S")
  ner_df['entity'] = ner_df['entity'].map(str.title)

  #20 most frequently mentioned PERSON entities, most frequent first
  person_mentions = ner_df.loc[ner_df['entity_type'] == 'PERSON', 'entity']
  char_list = person_mentions.value_counts().head(20).index.tolist()

  char_lists.append(char_list)

In [None]:
#Long-format table: one row per (book file, top-20 character) pair
char_df = pd.DataFrame(columns=['book', 'character'])
char_book = []
char_name = []

#books and char_lists are parallel lists, so walk them together
for book, book_chars in zip(books, char_lists):
  char_book.extend([book] * len(book_chars))
  char_name.extend(book_chars)

char_df['book'] = char_book
char_df['character'] = char_name

In [None]:
def process_name_tuple(name_tuple):
  '''
  Join the tokens of a character name back into a single string, resolving
  issues such as possessives being attached to the end of a character name.

  Rules applied while joining:
    - an apostrophe (' or ’) followed by 'S' ends the name, dropping the
      possessive and anything after it;
    - no space is inserted before '-' or '.', nor after '-';
    - every other token is followed by a single space.

  Parameters:
    name_tuple: sequence of string tokens making up one name.

  Returns:
    The processed name with surrounding whitespace stripped. An empty
    sequence yields '' instead of raising IndexError.
  '''
  if not name_tuple:
    return ''

  last = len(name_tuple) - 1
  pieces = []
  for i, tok in enumerate(name_tuple):
    if i == last:
      #Final token never needs a trailing separator
      pieces.append(tok)
      break

    nxt = name_tuple[i + 1]
    #Resolve possessives: stop at the apostrophe of a trailing 'S
    if tok in ("’", "'") and nxt == 'S':
      break
    #Ensure space not added before '-' or '.', nor after '-'
    if nxt in ('-', '.') or tok == '-':
      pieces.append(tok)
    else:
      pieces.append(tok + ' ')

  return ''.join(pieces).strip()

In [None]:
def process_chars(char_df, books):
  '''
  Given a dataframe of NER output, reduce some of the issues with that output,
  such as having multiple entities for one character.

  Parameters:
    char_df: DataFrame with a 'book' column (text file name) and a
      'character' column (entity name). A 'name_tuple' column holding the
      token tuple of each name is added to it in place.
    books: iterable of file names to process. Each one must be a key of the
      module-level file_to_title mapping.

  Returns:
    DataFrame with columns 'file', 'character' and 'book' (the title looked
    up in file_to_title), one row per unique cleaned name per file. Within a
    file, names keep the order in which they were first seen.

  Raises:
    KeyError: if a processed file name is missing from file_to_title.
  '''
  titles = ["Dr", "Mr", "Mr.", "Ms", "Ms.", "Miss", "Mrs.", "Mrs", "Monsieur", "Madame"]
  #Token tuple for every name; presumably utils.inv_map merges the tokenizer's
  #word pieces back into whole-word tokens -- TODO confirm against utils
  char_df['name_tuple'] =  char_df['character'].apply(lambda x: tuple(utils.inv_map(flatten(get_tokenized_doc(x, tokenizer)['sentences']))))
  new_char_list = []
  new_book_list = []

  for book in books:
    chars = list(char_df[char_df.book == book]['name_tuple'])

    #Replacing forename-surname with forename, if forenames match and forename
    #is included in character list (e.g. 'Charles Darnay' -> 'Charles').
    #Entries are only replaced by index here, never removed, so the list
    #keeps its length while it is being walked.
    for i in range(len(chars)):
      char = chars[i]
      if len(char) == 1 and char[0] not in titles:
        for j, ref_char in enumerate(chars):
          if i != j and ref_char[0] == char[0]:
            chars[j] = (char[0], '')

    #Removing instances of single letter characters (e.g. K. and I.).
    #Done as a separate filtering pass: calling chars.remove() inside the
    #loop above would shift the remaining items and skip the entry that
    #follows each removed one.
    chars = [c for c in chars
             if not (len(c) == 2 and len(c[0]) == 1 and c[1] == '.')]

    #dict.fromkeys deduplicates while keeping first-seen order, so the
    #output is the same on every run (set() order varies between runs)
    unique_chars = list(dict.fromkeys(process_name_tuple(x) for x in chars))
    new_char_list.extend(unique_chars)
    new_book_list.extend([book] * len(unique_chars))

  new_char_df = pd.DataFrame(list(zip(new_book_list, new_char_list)),
                      columns=['file', 'character'])
  new_char_df['book'] = new_char_df['file'].apply(lambda x: file_to_title[x])
  return new_char_df

In [None]:
#Processing NER output
#Deduplicate the file names with dict.fromkeys instead of set(): the iteration
#order of a set of strings changes between kernel restarts, which would
#shuffle the book order of the resulting dataframe (and its CSV) on every run
unique_books = list(dict.fromkeys(char_book))
new_char_df = process_chars(char_df, unique_books)

In [None]:
#Counting, per book, the sentences that mention at least two characters.
#A character counts as mentioned when its name is a substring of the sentence.
#Scoring per sentence:
#  2 characters  -> 1 interaction
#  3 characters  -> 3 interactions (one per pair)
#  >3 characters -> 1 interaction only; as per the literature, such sentences
#                   are not counted as interactions between each pair of the
#                   mentioned characters

files = []
shared_sents = []
for book in tqdm.tqdm(file_list):

  #Load the raw text and put it on a single line
  with open(f'character_relationship_analysis/data/texts/{book}') as f:
    doc = f.read()
  doc = doc.replace('\n', ' ')

  #Cleaned character names belonging to this book
  chars = list(new_char_df[new_char_df['file']==book]['character'])

  num_shared = 0

  for sent in sent_tokenize(doc):
    num_chars = sum(1 for char in chars if char in sent)
    if num_chars == 3:
      num_shared += 3
    elif num_chars > 1:
      num_shared += 1

  files.append(book)
  shared_sents.append(num_shared)

shared_sent_df = pd.DataFrame(list(zip(files, shared_sents)),
                      columns=['file', 'shared_sentences'])
shared_sent_df['book'] = shared_sent_df['file'].apply(lambda name: file_to_title[name])

In [None]:
#Persist both result tables as CSV files without the dataframe index:
#per-book shared-sentence counts, and the cleaned character list per book
shared_sent_df.to_csv('character_relationship_analysis/data/shared_sentences_no_coref.csv', index=False)
new_char_df.to_csv('character_relationship_analysis/data/book_characters.csv', index=False)