## Code for Running Analysis on Aozora Fiction Corpus

In [1]:
#### import libraries

import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from scipy import stats
import sys
import re
import MeCab
#mecab = MeCab.Tagger('-Ochasen')
mecab = MeCab.Tagger("")  #using unidic
import collections
import operator
import nltk

In [2]:
#########################################
# Functions used for extracting features
#########################################

#strip_chap_titles should only be used on non-tokenized texts; it cleans them up for the tokenization step
def strip_chap_titles(raw):
    """Clean an untokenized text so it is ready for tokenization.

    Removes numeral-only chapter headings, normalizes double corner
    brackets to single ones, and strips all whitespace and any byte-order
    mark.

    NOTE: the heading patterns are not anchored to the start of a line, so
    any run of numerals sitting directly before a newline is removed, even
    when it ends an ordinary line of text.

    Args:
        raw: Untokenized text as a single string.

    Returns:
        The cleaned text, containing no whitespace characters.
    """
    # chapter headings written with kanji numerals, with or without
    # surrounding full-width parentheses
    raw = re.sub(r'（*([一二三四五六七八九十])+(）)*\n', '', raw)

    # chapter headings written with full-width digits; the range starts at
    # ０ so that headings such as １０ or ２０ are removed as well
    raw = re.sub(r'[０-９]+\n', '', raw)
    raw = re.sub(r'[第弐拾章参壱一二三四五六七八九十]+\n', '', raw)

    # normalize all quotation marks to single corner brackets
    raw = raw.replace('『', '「')
    raw = raw.replace('』', '」')

    # \s covers newlines, carriage returns and spaces in a single pass
    raw = re.sub(r'\s', '', raw)

    # drop any byte-order mark left over from the source file
    raw = raw.replace('\ufeff', '')
    return raw

# Punctuation marks and symbols that remove_punc deletes from a text.
puncs = [
    '、', '。', '「', '」', '…', '！', '――', '？', 'ゝ', '『', '』', '（', '）', '／',
    '＼', '々', 'ーーー', '］', '・', 'ゞ', '［', '<', '〔', '〕', '＃', '△', '※', '＊',
]


def remove_punc(text):
    """Delete every entry of ``puncs`` from text and collapse whitespace.

    Entries are removed one after another in list order, then each run of
    whitespace is replaced by a single space.
    """
    stripped = text
    for mark in puncs:
        stripped = stripped.replace(mark, '')
    # collapse runs of whitespace left behind by the removals
    return re.sub(r'\s+', ' ', stripped)

def remove_stopwords(tokens, stopwords):
    """Return a new list of the tokens that are not in stopwords, in order."""
    kept = []
    for token in tokens:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

def get_stopwords(path):
    """Read a UTF-8 stopword file and return its lines as a list.

    Args:
        path: Location of a text file with one stopword per line.

    Returns:
        List of strings obtained by splitting the file on newlines. A file
        that ends with a newline yields a trailing empty string.
    """
    # the context manager closes the file handle once it has been read
    with open(path, encoding='utf-8') as f:
        words = f.read()
    return words.split('\n')

def bracket_cleaner(raw):
    """Remove full-width bracketed annotations and collapse whitespace.

    Any non-empty span enclosed in ［ ］ is deleted, after which each run
    of whitespace is replaced by a single space.
    """
    without_notes = re.sub(r'［[^］]+］', '', raw)
    return re.sub(r'\s+', ' ', without_notes)

def count_kanji(raw):
    """Return (kanji_count, not_kanji_count) for the characters in raw.

    A character counts as kanji when it lies in U+4E00..U+9FEF; every
    other character, whitespace included, counts as non-kanji.
    """
    kanji_total = sum(1 for _ in re.finditer(r'[\u4E00-\u9FEF]', raw))
    # every character is either inside the range or outside it
    other_total = len(raw) - kanji_total
    return kanji_total, other_total

def count_punc(raw):
    """Count the occurrences of 、 。 ！ and ？ in raw."""
    return sum(1 for _ in re.finditer(r'[、。！？]', raw))

def count_tags(raw):
    """Count part-of-speech categories and tokens in a text with MeCab.

    The text is parsed with the module-level ``mecab`` tagger after ASCII
    spaces have been removed.

    Args:
        raw: Text to analyze.

    Returns:
        Tuple ``(nouns, verbs, adj, tokens)`` where ``nouns`` and ``verbs``
        count nodes whose first feature field is 名詞 / 動詞, ``adj`` counts
        nodes tagged 形容詞, 副詞 or 連体詞, and ``tokens`` counts every
        parsed node except the sentence-boundary nodes and nodes whose
        surface form is listed in ``puncs``.
    """
    adj_tags = ('形容詞', '副詞', '連体詞')
    puncs = frozenset([
        '、', '。', '「', '」', '…', '！', '――', '？', 'ゝ', '『', '』', '（', '）', '／',
        '＼', '々', 'ーーー', '］', '・', 'ゞ', '［', '<', '〔', '〕', '＃', '△', '※', '＊',
    ])
    # BOS/EOS nodes mark sentence boundaries and carry no text
    boundary_stats = (MeCab.MECAB_BOS_NODE, MeCab.MECAB_EOS_NODE)

    nouns = 0
    verbs = 0
    adj = 0
    tokens = 0

    raw = raw.replace(' ', '')

    node = mecab.parseToNode(raw)
    while node:
        if node.stat not in boundary_stats:
            head_tag = node.feature.split(',', 1)[0]
            if head_tag == "名詞":
                nouns += 1
            elif head_tag == "動詞":
                verbs += 1
            elif head_tag in adj_tags:
                adj += 1

            # exclude punctuation from the token count; the check is on the
            # surface form because puncs holds glyphs, not POS tags
            # NOTE(review): multi-character entries such as '――' only match
            # when MeCab emits them as a single token - confirm with the
            # dictionary in use
            if node.surface not in puncs:
                tokens += 1

        # go to next item
        node = node.next

    return nouns, verbs, adj, tokens

## Load Metadata and Select Texts to Analyze

In [4]:
# load the metadata sheet that describes every work in the corpus
df = pd.read_excel(
    r'C:\Users\Hoyt\Dropbox\CodeDataForBook\Chapter2\Data\Corpus_Metadata_Clean.xlsx',
    sheet_name='metadata',
)

# keep only rows flagged as fiction whose source is Aozora, then renumber
# the rows from zero
df = df[(df['FICTION_CORPUS'] == True) & (df['SOURCE'] == 'aozora')].reset_index(drop=True)
df.shape

(1829, 33)

In [5]:
#add new columns to store collected features
for feature_column in ['KANJI_COUNT', 'NON_KANJI_COUNT', 'PUNCT', 'NOUNS',
                       'VERBS', 'ADJS', 'N_RATIO', 'MVR']:
    df[feature_column] = Series('', index=df.index)

#point to where all tokenized texts are stored
CORPUS_PATH = r"C:\Users\Hoyt\Dropbox\JapanCorpusTokenized\\"

for k in df.index:
    source_text = CORPUS_PATH + str(df.at[k, 'WORK_ID']) + ".txt"
    #the context manager closes each file once it has been read
    with open(source_text, encoding="utf-8") as raw_text:
        raw = raw_text.read()

    raw = bracket_cleaner(raw)

    #write through df.at[row, column]: assigning via df.COLUMN.at[k] is a
    #chained assignment and does not update df under pandas Copy-on-Write
    df.at[k, 'PUNCT'] = count_punc(raw)

    #exclude punctuation before counting Kanji
    no_punc_raw = remove_punc(raw)
    kanji_count, non_kanji_count = count_kanji(no_punc_raw)
    df.at[k, 'KANJI_COUNT'] = kanji_count
    df.at[k, 'NON_KANJI_COUNT'] = non_kanji_count

    #counts POS tags on raw text
    nouns, verbs, adj, tokens = count_tags(raw)

    df.at[k, 'NOUNS'] = nouns
    df.at[k, 'VERBS'] = verbs
    df.at[k, 'ADJS'] = adj
    #a text with no tokens or no verbs gets NaN instead of aborting the run
    df.at[k, 'N_RATIO'] = 100 * (nouns / tokens) if tokens else float('nan')
    df.at[k, 'MVR'] = 100 * (adj / verbs) if verbs else float('nan')

    #progress indicator, overwritten in place
    print(str(k), end="\r", flush=True)

1830

In [7]:
#export results to an excel file
import xlsxwriter
import openpyxl

#ExcelWriter.save() was removed in pandas 2.0; the context manager writes
#and closes the workbook when the block exits
with pd.ExcelWriter(r'C:\Users\Hoyt\Dropbox\JAPAN_CORPUS\Counts.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet1')