## Character Annotation Code for Chapter 5

<p style='text-align: justify;'> This notebook contains some simple scripts that I used to identify character names and replace each with a unique identifier for subsequent analysis.</p>

### Import Libraries

In [2]:
#### imports

import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from scipy import stats
import os
import sys
import re
import MeCab  #CHECK "MECABRC" FILE TO SEE WHICH DICTIONARY YOU ARE USING
mecab = MeCab.Tagger("")  #using unidic
import collections
import operator
import nltk
import math

### Load Fiction Metadata

In [4]:
#LOAD in metadata for the fiction corpus
# folder_path is the absolute form of "CharacterAnnotationCode/../../" taken relative to
# the current working directory, i.e. the directory one level above the working directory.
folder_path = os.path.abspath("CharacterAnnotationCode" + "/../../")

# Build the spreadsheet path with os.path.join rather than a hand-written "\Data\..." literal:
# "\D" and "\F" are invalid escape sequences, and backslash separators only work on Windows.
fic_meta = os.path.join(folder_path, "Data", "Fiction_Meta.xlsx")
df = pd.read_excel(fic_meta, sheet_name='Sheet1')

#select works where you want to annotate characters
df = df[df['FILTER'] == 'char_annot']
df.shape

(63, 38)

### Identify Character Names in Texts using Frequency Tables

In [5]:
# Collect the WORK_ID of every work selected for annotation as a plain Python list;
# the next cell picks one of these ids to inspect.
file_ids = df['WORK_ID'].tolist()
print(file_ids)

[43622, 45269, 1743, 10000224, 46576, 10000230, 1101, 45061, 159, 33197, 1875, 1320, 4512, 46636, 2246, 10000231, 46488, 10000202, 10000227, 50899, 2100, 10000213, 2012, 52236, 10000232, 1502, 10000201, 10000200, 156, 10000203, 45485, 3585, 10000214, 1418, 10000225, 2117, 10000212, 10000205, 10000204, 3370, 3527, 10000215, 1398, 10000216, 10000217, 2929, 10000218, 10000206, 10000226, 10000207, 10000228, 3369, 10000220, 10000209, 10000211, 10000229, 10000208, 10000222, 10000221, 10000219, 10000223, 10000210, 2277]


In [None]:
# Build a word-frequency table for one tokenized text so that the most frequent
# non-stopword tokens (candidate character names) can be inspected by hand.

#set path where tokenized files are stored
# os.path.join supplies the separator between folder_path and "Corpora" (it was missing
# when the path was built by plain concatenation); the trailing "" keeps a trailing
# separator so file names can still be appended with "+".
CORPUS_PATH = os.path.join(folder_path, "Corpora", "AozoraFictionTokenized", "")

#how many of the top-ranked rows are scanned for stopwords / shown at the end of the cell
STOPWORD_SCAN_ROWS = 500
TOP_ROWS_SHOWN = 50

#grab a single text from list in previous cell
text_id = file_ids[0]  #analyzing the first text in list
source = CORPUS_PATH + str(text_id) + '.txt'

#read in text; the context manager closes the file handle once the text is read
with open(source, encoding="utf-8", errors="ignore") as corpus_file:
    text = corpus_file.read()

#split the text on single spaces and discard the empty strings in one pass
tokens = [token for token in text.split(' ') if token != '']

total_tokens = len(tokens)  #keep track of total tokens to calculate rel freq later

#produce the frequency list
fdist = nltk.FreqDist(tokens)
sort_freq_pairs = sorted(fdist.items(), key=lambda x: x[1], reverse=True)  #sort by decreasing frequency

#create a dictionary to store word-frequency pairs (insertion order = decreasing frequency)
word_freqs = dict(sort_freq_pairs)

freqs_df = DataFrame.from_dict(word_freqs, orient='index')  #convert dict to dataframe
freqs_df = freqs_df.rename(columns={0:'frequency'})    #rename column
freqs_df = freqs_df.sort_values(by='frequency', ascending=False)   #sort by frequency

#build stop-words list: one stopword per line of stopwords.txt
stoplist_dir = os.path.join(folder_path, 'WordLists', '')
with open(stoplist_dir + "stopwords.txt", encoding='utf-8') as stopword_file:
    jp_stopwords = stopword_file.read().split('\n')

#add punctuation
puncs = ['、','。','「','」','…','！','――','？','ゝ','『','』','（','）','／','＼','々','ーーー','］','・','ゞ','［','<','〔','〕',
         '＃','△','※','＊','—','(',')','.','．']
jp_stopwords = jp_stopwords + puncs

#eliminate stopwords from top of list
# Only the first STOPWORD_SCAN_ROWS rows are scanned; the bound is clamped to the table
# length so that a text with fewer distinct tokens does not raise IndexError.
stopword_set = set(jp_stopwords)
scan_rows = min(STOPWORD_SCAN_ROWS, len(freqs_df))
drop_labels = [label for label in freqs_df.index[:scan_rows] if label in stopword_set]
freqs_df = freqs_df.drop(drop_labels)  #drop all the stopwords found among the top rows

#compute relative frequencies
freqs_df['rel_freq'] = freqs_df.frequency / total_tokens
freqs_df = freqs_df.reset_index()

#inspect top hits to extract most common character names in text
freqs_df[0:TOP_ROWS_SHOWN]

### Annotate Character Names

In [21]:
# create dictionary that identifies major and minor characters in each text to be annotated and assigns unique ID
# these characters were hand-identified using previous cell
# character names are as they appear in UNIDIC tokenized text
# keys are WORK_IDs as strings; values are the character names for that work (possibly empty)
# NOTE: the dictionary must stay ONE literal -- it was previously closed after the '45061'
# entry, which made the four entries that follow a syntax error.

char_names = {'10000224':['コーレ','ピラ'], 
              '10000219':['李 艶','奎 栄','鳳 琴','潤 芝','董 翠 花','チュルガン','金 毓桂','慶 亭'],
              '10000215':['申 重 欽'],
              '10000210':['寿 善','北原'],
              '10000202':['栄 策'],
              '10000230':['伊東','柏年'],
              '10000203':[],
              '10000232':['碧雲'],
              '10000200':['お久 さん'],
              '10000214':['張'],
              '10000211':['八 吉'],
              '10000220':['清吉','健','シ ノブ','しづ子','黒井'],
              '10000231':['秀 梅','遠矢','細君'],
              '10000205':['金 、 太郎','母親','父親','乾 爺 さん'],
              '10000212':['姉','露助','ジナイーダ','ズナ'],
              '10000229':['周長 乾 老人','周長 坤','弟','父親','叔父','兄'],
              '10000221':['どん げん'],
              '10000227':['順','雪子','加代','老母'],
              '10000207':['朴','朴 泰 民'],
              '10000228':['劉 石 虎','楊 名声'],
              '10000225':['田中','母','父'],
              '10000213':['フユ','父親','母親','フデ'],
              '10000206':['先生','坊主','本多'],
              '10000222':['マリ ヤン','H 氏'],
              '10000223':['ノー カナ','コック 長','ボーイ'],
              '10000201':['操','文吉'],
              '10000204':['童 伊','許 生 員','蓬 坪'],
              '10000216':['張','李','リベカ'],
              '10000217':['王','丸 焼'],
              '10000218':['祝','真吉','吉村','祝 廉 天'],
              '10000226':['采 雲','楊','母'],
              '10000208':['先生','秀 東'],
              '10000209':['エップニ','姉'],
              '45269':['ラシイヌ','レザール 氏','ダン チョン 画家','張 教仁','紅玉'],
              '43622':['ジョン','紋 太夫','ホーキン 氏'],
              '3370':['怪 塔 王','兵曹 長','小浜','塩田 大尉','青江 三 空曹','大利根 博士'],
              '3527':['川上','杉田','リット 提督'],
              '46576':['清','フー ラー','武田 博士'],
              '2117':['呉羽','轟 氏'],
              '2100':['バード ・ ストーン'],
              '33197':[],
              '45485':['叔父','源兵衛'],
              '3369':['竹見','ハルク','ノルマン','ポー ニン','モロ'],
              '1320':['マヌエラ','カーク','ヤン'], #think these are gorillas, actually
              '50899':['参 木','甲谷','お 杉','宮子','山口','オルガ','お 柳'],
              '1502':['丑松','銀之助','志保','瀬川','蓮太郎'],
              '2246':['矢代','千鶴子','久慈','真紀子','東野'],
              '1418':['山崎','幹太郎','中津','高取','小山'],
              '2012':['伸子','素子','蜂谷'],
              '1743':['マターファ','スティヴンスン','ファニイ'],
              '3585':['光一','手塚','チビ 公','千三','阪井','巌','文子'],
              '2929':['玄竜','田中','大村'],
              '1398':['春雄','山田','李'],
              '2277':['周 さん','津田','藤野'],
              '52236':['ゆき子','富岡','加野'],
              '46488':['富士 男','ドノバン','ゴルドン','次郎','イバン ス','サービス','バクスター','モ コウ'],
              '156':['穂積 中佐','将軍','田口 一 等 卒'],
              '1101':['俊寛','成経','有王','康頼'],
              '45061':['ガルーダ を ぢ さん','首領'],
              '159':['俊寛',' 女 '], 
              '1875':['阿賀 妻'],
              '4512':['駒井','米 友','神尾','お松','岩倉 三 位'],
              '46636':['キューネ','ハチロウ','ナエーア']}

#assign unique ids to all annotated character names
# all_char_ids maps each work id to a list of (character name, character id) tuples, in the
# order the names are listed above. An id is '0' + work id + '0000' + the 1-based position
# of the name within its work.
all_char_ids = {}
for key, names in char_names.items():
    all_char_ids[key] = [(char_name, '0' + key + '0000' + str(i))
                         for i, char_name in enumerate(names, start=1)]

### Create New Versions of Texts with Character Names replaced by Unique IDs

<p style='text-align: justify;'>The resulting texts will be in lemmatized form to allow searching of significant semantic clusters using words generated from Cluster Detection analysis.</p>

In [28]:
#1. Import unidic text
#2. Replace character names with the unique ID for that character
#3. Strip spaces and then re-tokenize into lemma form
#4. Replace character IDs with name-title combo and output to new directory

#work on unidic tokenized corpus
# os.path.join avoids the invalid "\C" / "\A" escape sequences of a hand-written path literal;
# the trailing "" keeps a trailing separator so file names can be appended with "+".
CORPUS_PATH = os.path.join(folder_path, "Corpora", "AozoraFictionTokenized", "")
OUTPUT_PATH = os.path.join(folder_path, "Corpora", "CharAnnot", "")

#index of the field read as the lemma in the comma-separated MeCab feature string
LEMMA_FIELD = 7

#iterate through all character annotated texts and substitute character names with ids
for k in df.index:
    #normalise the work id once so the input path, the dictionary key and the output path agree
    work_id = str(int(df.WORK_ID[k]))

    #get the tokenized text (the context manager closes the file handle)
    with open(CORPUS_PATH + work_id + ".txt", encoding="utf-8") as source_file:
        raw = source_file.read()

    #use work_id to access the char_ids dictionary
    char_ids = all_char_ids[work_id]

    #replace character names with unique character id
    # Longest names go first so that a short name which is a prefix of a longer one
    # (e.g. '朴' and '朴 泰 民') does not consume the longer name before it can match.
    # Plain str.replace is used because names are literal text, not regex patterns.
    for char_name, char_id in sorted(char_ids, key=lambda pair: len(pair[0]), reverse=True):
        raw = raw.replace(char_name, char_id)

    #strip spaces and re-tokenize into lemma form
    raw = re.sub(r'\s', '', raw)

    lemma_tokens = []
    node = mecab.parseToNode(raw)
    node = node.next  #skip the first node returned by the parser (presumably MeCab's BOS node)

    while node:
        features = node.feature.split(',')
        if len(features) > LEMMA_FIELD:  #some words don't have a lemma form
            lemma_tokens.append(features[LEMMA_FIELD])
        else:   #if not, just add the plain token
            lemma_tokens.append(node.surface)
        node = node.next

    #merge lemma tokens
    new_text = ' '.join(lemma_tokens)

    #replace character ids with an easier to interpret tag: <name without spaces>_<title without spaces>
    # str.replace keeps any backslash in a name or title from being read as a regex template escape
    title = df.WORK_TITLE[k].replace(" ", "")
    for char_name, char_id in char_ids:
        new_id = char_name.replace(" ", "") + '_' + title
        new_text = new_text.replace(char_id, new_id)

    #now print the revised text back out to a file
    with open(OUTPUT_PATH + work_id + ".txt", "w", encoding="utf-8") as f:
        f.write(new_text)