In [None]:
import json
import os
import pickle
import re
import subprocess
from collections import Counter
from os import path

import numpy as np
import pandas as pd

# implementation

## set up

In [None]:
!pip install booknlp

In [None]:
# Download the small English spaCy model; it is loaded further down with
# spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

In [None]:
from booknlp.booknlp import BookNLP

# Settings handed to BookNLP: the annotation pipeline to run and the model size to use.
model_params = dict(
  pipeline="entity,quote,event,coref",
  model="small",
)

# Single English BookNLP instance; the processing cells below call booknlp.process() on it.
booknlp = BookNLP("en", model_params)

In [None]:
# Unpack the corpus archive quietly (-q) into the current working directory.
# NOTE(review): later cells read from /content/light_novel, so the archive presumably
# contains a top-level light_novel/ folder and the notebook runs from /content — confirm.
!unzip -q /content/light_novel_original.zip

In [None]:
# Get the list of series in the corpus, skipping hidden entries (names starting with ".").
# os.listdir() returns names in arbitrary order, so the list is sorted to keep the
# processing order (and the row order of the gathered results) reproducible.
series_list = sorted(
  entry for entry in os.listdir("/content/light_novel") if not entry.startswith(".")
)

## methods

In [None]:
import re

In [None]:
def find_last_digit(string):
  # Return the last number that appears in a string, as an int.
  # The number may have any amount of digits (a whole run of digits is read as one
  # number, so "Chapter 100" gives 100 and not 0).
  # e.g. find_last_digit("string12") = 12
  #      find_last_digit("string2") = 2
  # Raises ValueError when the string contains no digit at all.

  numbers = re.findall(r"[0-9]+", string)
  if not numbers:
    raise ValueError(f"no number found in {string!r}")
  return int(numbers[-1])

In [None]:
def sort_chapters(chapter_list):
  # Return a new list with the chapter names in ascending order of the number that
  # find_last_digit() extracts from each name; the input list itself is not modified.

  ordered = list(chapter_list)
  ordered.sort(key=find_last_digit)
  return ordered

In [None]:
def proc(filename):
  # Load a JSON file and return the parsed Python object (dict/list/...).
  # The file is decoded as UTF-8 explicitly so the result does not depend on the
  # platform's default encoding.

  with open(filename, encoding="utf-8") as file:
    return json.load(file)

In [None]:
def check_in_mention(current_mention, target_mention):
  # Match a character against the target characters by name.
  # current_mention: list of mention dicts; the "n" entry of each is the mention text
  # target_mention:  dict {target key: collection of mention texts for that target}
  # Returns the key of the first target that contains one of the current mentions
  # (current mentions are tried in order), or -1 when nothing matches.

  mention_names = [mention["n"] for mention in current_mention]
  for name in mention_names:
    for key, aliases in target_mention.items():
      if any(name == alias for alias in aliases):
        return key
  return -1

In [None]:
def get_counter_from_dependency_list(dep_list):
  # Return a Counter in the form of token: token counts.
  # Each element of dep_list is a dict whose "w" entry holds the token text; no other
  # entry is needed (the previously read but unused "i" entry is no longer required).
  return Counter(token["w"] for token in dep_list)

In [None]:
def make_lemma(word):
  # Run the text through the module-level spaCy pipeline `nlp` and return the lemmas
  # of all resulting tokens joined by single spaces ("" when there are no tokens).
  return " ".join(token.lemma_ for token in nlp(word))

In [None]:
def get_agent_patient(data,target_mentions):
  # Get agent and patient information of a list of target characters.
  #
  # data:            parsed BookNLP ".book" structure; data["characters"] is iterated
  # target_mentions: dict {character name: list of proper mentions of that character}
  # returns:         list with one dict per matching character:
  #                  {"name": matched key of target_mentions,
  #                   "agent": {lemma: count}, "patient": {lemma: count}}
  #
  # Only characters that have at least one proper mention and that match one of the
  # targets (see check_in_mention) are returned.

  def count_by_lemma(dep_list):
    # Lemmatize every word and add up the counts per lemma. Several surface forms can
    # share one lemma (e.g. "said" and "says"); their counts are summed instead of the
    # later form overwriting the earlier one.
    lemma_counts = {}
    for word, count in get_counter_from_dependency_list(dep_list).most_common():
      lemma = make_lemma(word)
      lemma_counts[lemma] = lemma_counts.get(lemma, 0) + count
    return lemma_counts

  character_agent_patient = []
  for character in data["characters"]:
    proper_mentions = character["mentions"]["proper"]
    if len(proper_mentions) == 0:
      continue

    # look the character up once; -1 means it is not one of the target characters
    name = check_in_mention(proper_mentions, target_mentions)
    if name == -1:
      continue

    character_agent_patient.append({
      "name": name,
      "agent": count_by_lemma(character["agent"]),
      "patient": count_by_lemma(character["patient"]),
    })

  return character_agent_patient

In [None]:
def calculate_power(character):
  # Calculate the power of a character using the power frames lexicon `agency_power`
  # (module-level table with a "verb" column and a "power" column).
  #
  # input:  a dictionary "character_information" (as in the get_agent_patient function)
  #         with "agent" and "patient" entries of the form {verb: count}
  # output: (normalized_power, agent, patient) where normalized_power is the summed
  #         power divided by the total verb count, or "/" when there are no verbs
  #
  # Scoring per occurrence of a verb found in the lexicon:
  #   character is agent:   "power_agent" -> +1, "power_theme" -> -1
  #   character is patient: "power_agent" -> -1, "power_theme" -> +1

  agent = character["agent"]
  patient = character["patient"]

  # Build the verb -> label lookup once per call instead of converting the column to a
  # list for every verb. If a verb is listed twice, the first entry wins.
  power_label = {}
  for verb, label in zip(agency_power["verb"], agency_power["power"]):
    power_label.setdefault(verb, label)

  length = 0
  power = 0

  # agent_sign is +1 for verbs the character performs and -1 for verbs done to them
  for verb_counts, agent_sign in ((agent, 1), (patient, -1)):
    for verb, count in verb_counts.items():
      length += count
      label = power_label.get(verb)
      if label == "power_agent":
        power += agent_sign * count
      elif label == "power_theme":
        power -= agent_sign * count

  try:
    normalized_power = power/length # normalize the power
  except ZeroDivisionError:
    # no agent/patient verbs at all for this character
    normalized_power = "/"

  return normalized_power,agent,patient

## For each volume, get the power scores for the major characters from agent and patient verbs, and store them in a dataframe along with general information about the volume and the characters

In [None]:
# File names of the five pieces each volume's full text is split into; the same names
# reappear in the BookNLP output directories of the parts ("output_<part name>").
five_parts = [
  "splitted_textaa",
  "splitted_textab",
  "splitted_textac",
  "splitted_textad",
  "splitted_textae",
]

In [None]:
import spacy
# Load the small English spaCy pipeline. make_lemma() reads this module-level `nlp`,
# so this cell has to run before any cell that calls make_lemma().
nlp = spacy.load("en_core_web_sm")

In [None]:
import re
# Compiled pattern for text that contains "output_splitted" followed by anything.
# NOTE(review): `r` is not referenced by any other cell visible in this notebook —
# likely a leftover; confirm before removing.
r = re.compile(r'output_splitted.*')

In [None]:
# Per-series metadata; the processing loop reads my_dict[series]["genre"] from it.
# NOTE: unpickling can execute arbitrary code — only load pickle files you trust.
with open('/content/all_dict.pickle', 'rb') as pickle_file:
  my_dict = pickle.load(pickle_file)

# Power frames lexicon; calculate_power() uses its "verb" and "power" columns.
agency_power = pd.read_csv("/content/agency_power_lemma.csv")

In [None]:
def _list_chapter_files(volume_dir, series, volume):
  # Return the names of the chapter files in a volume directory.
  # Hidden entries are skipped, and so is everything this cell itself writes into the
  # directory (full_text, the splitted_text* parts, the BookNLP output directories and
  # the power_df CSV files); otherwise re-running the cell would feed its own output
  # back into the concatenated text or crash in sort_chapters().
  chapter_files = []
  for entry in os.listdir(volume_dir):
    if entry.startswith('.'):
      continue
    if not os.path.isfile(os.path.join(volume_dir, entry)):
      continue
    if entry == "full_text" or entry.startswith("splitted_text"):
      continue
    if entry.startswith(f"{series}_{volume}_power_df"):
      continue
    chapter_files.append(entry)
  return chapter_files


def _major_character_ids(characters, top_n=9):
  # Return the ids of the characters with the largest mention counts, considering only
  # characters that have at least one proper mention. Characters tied with one of the
  # top_n counts are all included, so more than top_n ids can be returned.
  counts = {}
  for character in characters:
    if len(character["mentions"]["proper"]) > 0:
      counts[character["id"]] = character["count"]
  top_counts = set(sorted(counts.values())[-top_n:])
  return {character_id for character_id, count in counts.items() if count in top_counts}


def _describe_major_characters(characters, major_ids):
  # For every major character return its name and referential gender, plus a dict
  # {name: list of all proper mentions} used to find the character again in the parts.
  # The name is the first proper mention (per the original notebook comment, the proper
  # mention that is used most often).
  character_list = []
  mentions = {}
  for character in characters:
    proper = character["mentions"]["proper"]
    if len(proper) == 0 or character["id"] not in major_ids:
      continue

    # start from "unknown" for every character so that a missing gender annotation
    # never inherits the gender of the previously visited character
    gender = "unknown"
    gender_info = character["g"]
    if gender_info is not None and gender_info != "unknown":
      gender = gender_info["argmax"] # referential gender with the highest score

    name = proper[0]["n"]
    character_list.append({"name": name, "gender": gender})
    mentions[name] = [mention["n"] for mention in proper]
  return character_list, mentions


# power1..power5, agent1..agent5, patient1..patient5 (one column per part and kind)
score_columns = [f"{kind}{part_no}"
                 for kind in ("power", "agent", "patient")
                 for part_no in range(1, len(five_parts) + 1)]

for series in series_list:

  series_dir = f"/content/light_novel/{series}"
  volume_list = [i for i in os.listdir(series_dir) if i.startswith('Volume')]

  print(f"processing {series}")

  for volume in volume_list:
    print(f"processing {series} {volume}")
    volume_dir = f"{series_dir}/{volume}"

    # fail early (before the expensive BookNLP runs) if the series has no metadata
    genre = str(my_dict[f"{series}"]["genre"])

    # write the full text (whole book) into a file, chapters in ascending order
    fulltext = ""
    for chapter in sort_chapters(_list_chapter_files(volume_dir, series, volume)):
      with open(f"{volume_dir}/{chapter}", encoding="utf-8") as f:
        fulltext += f.read()

    full_text_path = f"{volume_dir}/full_text"
    with open(full_text_path, 'w', encoding="utf-8") as f:
      f.write(fulltext)

    # split the full text into five nearly equal parts without breaking lines
    # (creates splitted_textaa ... splitted_textae). The arguments are passed as a list,
    # so series/volume names containing spaces are handled correctly.
    subprocess.run(["split", "-n", "l/5", full_text_path, f"{volume_dir}/splitted_text"], check=True)

    # process the whole book with booknlp
    outputDir = f"{volume_dir}/{series}_{volume}_output"
    idd = f"{series}_{volume}"
    booknlp.process(full_text_path, outputDir, idd)

    # get the characters with the largest number of mention counts (the major characters)
    data = proc(f"{outputDir}/{idd}.book")
    major_ids = _major_character_ids(data["characters"])
    character_list, mentions = _describe_major_characters(data["characters"], major_ids)

    # if all major characters are from the same gender (or there are none), skip this
    # volume as it does not give any comparisons
    if len({character["gender"] for character in character_list}) <= 1:
      continue

    # one row per major character; the score columns are filled part by part
    rows = [dict(character) for character in character_list]

    for k, part in enumerate(five_parts, start=1):
      inputFile = f"{volume_dir}/{part}"
      outputDir = f"{volume_dir}/output_{part}"
      idd = f"{series}_{volume}_{part}"

      # reuse the BookNLP output of a part if it already exists from an earlier run
      if not os.path.exists(outputDir):
        booknlp.process(inputFile, outputDir, idd)

      data = proc(f"{outputDir}/{idd}.book")
      power_list = {}
      for character in get_agent_patient(data, mentions):
        score, agent, patient = calculate_power(character)
        power_list[character["name"]] = [score, agent, patient]

      for row in rows:
        if row["name"] in power_list:
          score, agent, patient = power_list[row["name"]]
          row[f"power{k}"] = score
          row[f"agent{k}"] = str(agent)
          row[f"patient{k}"] = str(patient)
        else:
          # the character does not occur in this part
          row[f"power{k}"] = "/"
          row[f"agent{k}"] = "/"
          row[f"patient{k}"] = "/"

    # build the dataframe in one go, then add information about the volume and series
    power_df = pd.DataFrame(rows, columns=["name", "gender"] + score_columns)
    power_df["series_name"] = f"{series}"
    power_df["volume_name"] = f"{volume}"
    power_df["genre"] = genre

    print(power_df)
    power_df.to_csv(f'{volume_dir}/{series}_{volume}_power_df_1024.csv', index = False)

In [None]:
# !zip -r /content/light_novel_0701.zip /content/light_novel

In [None]:
# Empty dataframe that will collect the results of all volumes in the corpus:
# name/gender, then power1-5, agent1-5 and patient1-5, then series/volume/genre.
power = pd.DataFrame(
  columns=["name", "gender"]
  + [f"{prefix}{part}" for prefix in ("power", "agent", "patient") for part in range(1, 6)]
  + ["series_name", "volume_name", "genre"]
)

In [None]:
# Gather the per-volume result files written by the processing loop into `power`.
volume_frames = []

for series in series_list:

  volume_list = os.listdir(f"/content/light_novel/{series}")
  volume_list = [i for i in volume_list if i.startswith('Volume')]

  print(f"processing {series}")

  for volume in volume_list:
    print(f"processing {series} {volume}")

    try:
      # the file name has to match the one the processing loop writes
      # ("..._power_df_1024.csv"); reading "..._power_df_new.csv" found nothing
      mycsv = pd.read_csv(f"/content/light_novel/{series}/{volume}/{series}_{volume}_power_df_1024.csv")
    except FileNotFoundError:
      # no result file: the processing loop skips volumes whose major characters all
      # have the same gender. Any other read error is left to surface.
      continue
    volume_frames.append(mycsv)

# Concatenate once instead of growing the dataframe inside the loop; this also makes the
# cell safe to re-run (rows are not appended to an already filled `power`).
if volume_frames:
  power = pd.concat(volume_frames, ignore_index = True, axis = 0)
else:
  power = power.iloc[0:0]

In [None]:
# Save the gathered power scores of all volumes (without the row index).
power.to_csv('/content/power_normalized_new.csv', index = False)

## get modifiers and possessions of the major characters

In [None]:
def get_mod_pos(series,volume,name):
  # Collect the "poss" and "mod" words of one character from the BookNLP output of a
  # whole volume.
  #
  # input:  series name, volume name, name of character (compared with the first proper
  #         mention of every character in the volume)
  # output: (poss_dict, mod_dict), each of the form {lemma: count}; all words are kept
  #         (no top-N cut-off). Both dicts are empty when no character has that name.
  data = proc(f"/content/light_novel/{series}/{volume}/{series}_{volume}_output/{series}_{volume}.book")

  def count_by_lemma(dep_list):
    # Lemmatize every word and add up the counts per lemma, so that several surface
    # forms with the same lemma are summed instead of overwriting each other.
    lemma_counts = {}
    for word, count in get_counter_from_dependency_list(dep_list).most_common():
      lemma = make_lemma(word)
      lemma_counts[lemma] = lemma_counts.get(lemma, 0) + count
    return lemma_counts

  # defined up front so the function also returns cleanly when nothing matches
  poss_dict = {}
  mod_dict = {}

  for character in data["characters"]:
    proper_mentions = character["mentions"]["proper"]
    if len(proper_mentions) == 0 or proper_mentions[0]["n"] != name:
      continue

    # if several characters share the name, the last one in the file wins
    poss_dict = count_by_lemma(character["poss"])
    mod_dict = count_by_lemma(character["mod"])

  return poss_dict,mod_dict

In [None]:
# write the modifier and possession information to the dataframe

# Collect the values first and assign whole columns afterwards: creating the columns
# with the integer 0 and then writing strings into them cell by cell relies on a dtype
# change that pandas has deprecated, and .loc[i] only works with a 0..n-1 index.
poss_column = []
mod_column = []

character_keys = zip(power["series_name"], power["volume_name"], power["name"])
for i, (series_name, volume_name, character_name) in enumerate(character_keys):
  print(i)
  poss, mod = get_mod_pos(series_name, volume_name, character_name)
  poss_column.append(str(poss))
  mod_column.append(str(mod))

power["poss"] = poss_column
power["mod"] = mod_column

In [None]:
# Save the final table (power scores plus "poss"/"mod" columns).
# NOTE(review): unlike the earlier export, index=False is not passed here, so the row
# index is written as an extra first column — confirm this is intended.
power.to_csv('/content/output_1025.csv')