# Feature Extraction Code for Chapter 4

<p style='text-align: justify;'> This notebook is used to extract specific passage-level features from Japanese language corpora used in Chapter 4. In contrast to the previous chapter, this code extracts features across discrete-sized chunks of text. This includes all of the passages designated as stream-of-consciousness passages, as well as all passages across two larger corpora, storing the results as data frames for further analysis in R.</p>

In [1]:
##### Import libraries

import numpy as np
from numpy import random

from __future__ import division
import nltk, re, pprint
import pandas as pd
import operator
from pandas import Series, DataFrame
from scipy import stats
from numpy.random import permutation, shuffle
import string

import sys, os
import MeCab  #CHECK "MECABRC" FILE TO SEE WHICH DICTIONARY YOU ARE USING
mecab = MeCab.Tagger("")  #using unidic

### Import word lists for use in feature extraction

In [2]:
#######################
#import stopwords list
#######################
#raw string keeps the Windows-style backslash literal (no reliance on "\j" being an unknown escape);
#the with-block closes the file handle as soon as the contents have been read
with open(r"VocabLists\jp_stopwords.txt", encoding="utf-8") as raw:
    text = raw.read()
stopwords = text.split('\n')  #split on newline and turn into a list

##############################
#build onomatopoeia word list
##############################
#grab the tags for these words from JEdict, build a list, and then do a simple check
#grab the xml file, and specify which tag you want
# context = etree.iterparse('c:\Users\Hoyt\Dropbox\SOC_PROJECT_JAPAN\JMdict_e.xml', events=('end',))#, tag='entry')
# #open a text file for writing
# out = open(r'c:\Users\Hoyt\Dropbox\SOC_PROJECT_JAPAN\onom.txt', 'w')
# #iterate through the tree, taking only the elements you need, and deleting everything else along the way
# for event, elem in context:
#     if elem.tag == 'reb':
#         word = elem.text
#     if elem.tag == 'misc' and elem.text[0:4] == 'onom':
#         out.write('%s\n' % word.encode('utf-8'))
#     elem.clear()
#     #while elem.getprevious() is not None:
#     #    del elem.getparent()[0]
# out.close()

#import the onomatopoeia list from already created text file
with open(r"VocabLists\onom.txt", encoding="utf-8") as raw:
    text = raw.read()
#split on newline, de-duplicate, and sort: the binary-search lookup (bi_contains) requires a sorted list
onom_list = sorted(set(text.split('\n')))

### Load feature extraction functions

In [3]:
#function to normalize quotation marks
def normalize_quotation(text):
    """Return `text` with double corner brackets (『 』) rewritten as single corner brackets (「 」)."""
    for double_mark, single_mark in ((r'『', r'「'), (r'』', r'」')):
        text = re.sub(double_mark, single_mark, text)
    return text

def bracket_cleaner(raw):
    """Strip full-width bracketed annotations (［...］) from `raw`, then collapse each whitespace run to one space."""
    without_notes = re.sub(r'［[^］]+］', '', raw)    #annotations need at least one character between the brackets
    return re.sub(r'\s+', ' ', without_notes)

#list of punctuations marks to exclude if needed
puncs = ['、','。','「','」','…','！','――','？','ゝ','『','』','（','）','／','＼','々','ーーー','］','・','ゞ','［','-','─','<',
         '＃','△','※','＊','〔','〕']

#remove punctuation marks from text with word boundaries indicated
def remove_punc(text):
    """Delete every mark listed in `puncs` from `text` (in list order), then collapse whitespace runs to one space."""
    stripped = text
    for mark in puncs:
        stripped = stripped.replace(mark, '')    #every entry is a plain literal, so no regex is needed
    return re.sub(r'\s+', ' ', stripped)          #get rid of double spaces left behind by the deletions

#remove stopwords from text with word boundaries indicated; return in same form
def remove_stopwords(text, stopwords):
    """Drop every space-delimited token of `text` that appears in `stopwords`.

    text      -- string whose tokens are separated by single spaces
    stopwords -- collection of tokens to discard
    Returns the surviving tokens re-joined with single spaces.
    """
    #a list/tuple would be rescanned for every token; hash it once so each lookup is constant-time
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)
    return ' '.join(token for token in text.split(' ') if token not in stopwords)

#basic type/token ratio using all words in chunk
def tt_ratio(chunk):
    """Return the type/token ratio (unique tokens over all tokens) of a space-tokenized chunk.

    Punctuation listed in `puncs` is removed first. Returns 0.0 when no tokens remain.
    """
    #split() with no separator discards the empty strings that leading/trailing spaces would
    #otherwise produce, so they are not counted as a token (and a type)
    tokens = remove_punc(chunk).split()
    if not tokens:
        return 0.0
    return len(set(tokens))/len(tokens)   #compute TTR (unique types over all word tokens)

#returns type/token ratio without stopwords
def tt_ratio_no_stopwords(chunk):
    """Return the type/token ratio of a space-tokenized chunk after removing punctuation and stopwords.

    Uses the module-level `stopwords` list. Returns 0.0 when no tokens remain.
    """
    chunk = remove_punc(chunk)
    chunk = remove_stopwords(chunk, stopwords)
    #split() with no separator never yields empty strings, so an all-stopword chunk gives no tokens
    tokens = chunk.split()
    if not tokens:
        return 0.0
    return len(set(tokens))/len(tokens)

#returns type/token ratio without proper nouns or stopwords (as determined by MeCab tags)
#takes a chunk that has already been pos-tagged
def tt_ratio_no_pn(pos_chunk):
    """Return the type/token ratio of a POS-tagged chunk, ignoring proper nouns, punctuation and stopwords.

    pos_chunk -- list of sentences, each a list of [surface, tag1, tag2] triples
    Returns 0.0 when no tokens remain.
    """
    #keep the surface form of every item whose second tag is not 固有名詞 (proper noun);
    #the last element of each sentence is skipped because it is the EOS marker
    kept = [item[0] for sent in pos_chunk for item in sent[:-1] if item[2] != "固有名詞"]
    chunk = remove_punc(' '.join(kept))
    chunk = remove_stopwords(chunk, stopwords)
    #split() with no separator drops empty-string artifacts so they are not counted as tokens
    tokens = chunk.split()
    if not tokens:
        return 0.0
    return len(set(tokens))/len(tokens)

#identify onomatopoeia in chunk based on Jedict list
def onom(chunk):
    """Return the share of tokens in a space-tokenized chunk that appear in the sorted `onom_list`.

    Punctuation is removed first. Returns 0.0 when the chunk holds no tokens.
    """
    #split() with no separator drops empty strings, which would otherwise inflate the token count
    #and could match an empty entry in the word list
    words = remove_punc(chunk).split()
    if not words:    #in case we get a dud sentence
        return 0.0
    onom_in_chunk = sum(1 for word in words if bi_contains(onom_list, word))
    return onom_in_chunk/len(words)

#function to return median sentence length of chunk
def median_sent_length(chunk, genre):
    """Return the median sentence length, in characters, of a space-tokenized chunk.

    chunk -- tokenized text; all whitespace is stripped before sentence segmentation
    genre -- for any genre other than "SOC" the first and last sentences are discarded
    Returns NaN when no sentences are left to measure.
    """
    chunk = re.sub(r'\s', '', chunk)
    #the pattern is built from adjacent raw-string pieces; a backslash-newline inside a single raw string
    #would put a literal newline plus indentation into the regex and disable the (？」と...) alternative
    sent_pattern = (r'((「)*[^！？。(――)(——)\(\)]+([^」{5,}?]」[^とを])*(」を.*)*(」と[^。]*)*(」、と[^。]*)*'
                    r'(？」と[^。]*)*[！？。」(……)]*)')
    chunk_sents = re.findall(sent_pattern, chunk)
    if genre != "SOC":
        chunk_sents = chunk_sents[1:-1]    #first and last sentences are likely fragments
    #findall returns one tuple per sentence; element 0 is the whole sentence
    sent_length = [len(sent[0]) for sent in chunk_sents]
    if not sent_length:
        return float('nan')    #same value np.median gives for an empty list, without the RuntimeWarning
    return np.median(sent_length)

#function to find ellipses in passage
def find_ellip(chunk):
    """Return the number of '…' characters in `chunk` divided by the length of `chunk` (0.0 if empty)."""
    length = len(chunk)
    if length == 0:
        return 0.0
    return chunk.count("…")/length    #ratio expressing # of ellipses as function of length of passage

#function to find potential neologisms, here defined as any word appearing in katakana or english
def neo(chunk):
    """Return the share of unique words in a space-tokenized chunk that look like neologisms.

    A word qualifies when it is longer than two characters and contains at least one katakana
    or Latin (half- or full-width) character. Returns 0.0 when the chunk holds no words.
    """
    #split() with no separator drops empty strings so they do not count as a unique word
    vocabulary = set(remove_punc(chunk).split())
    if not vocabulary:
        return 0.0
    #limit to longer words, to avoid things like サ、ホラ、ヶ
    neologisms = {word for word in vocabulary
                  if len(word) > 2 and re.search(r'[゠-ヿＡ-Ｚａ-ｚA-Za-z]', word)}
    #ratio of unique katakana and romaji words over all unique words
    #this avoids over-counting character names (esp. in translated texts)
    return len(neologisms)/len(vocabulary)

#this function returns pos-tagged chunks as a list of sentence-level triples (the word + first two POS tags)
def pos_tagger(chunk):
    """Segment a space-tokenized chunk into sentences and POS-tag each one with the module-level MeCab tagger.

    Returns a list with one entry per sentence; each entry is a list of
    [surface, first POS field, second POS field] triples, one per MeCab node after the first.
    """
    chunk_sents = []
    #segment the chunk into sentences
    chunk = re.sub(r'\s', '', chunk)
    #the pattern is built from adjacent raw-string pieces; a backslash-newline inside a single raw string
    #would put a literal newline plus indentation into the regex and disable the (？」と...) alternative
    sent_pattern = (r'((「)*[^！？。(――)(——)\(\)]+([^」{5,}?]」[^とを])*(」を.*)*(」と[^。]*)*(」、と[^。]*)*'
                    r'(？」と[^。]*)*[！？。」(……)]*)')
    sent_list = re.findall(sent_pattern, chunk)

    #now pos tag by sentence and store as triples
    for match in sent_list:
        sent_triples = []
        sent = match[0]  #element 0 of each findall tuple is the whole sentence
        sent = re.sub(r'[\(\)]', '', sent)      #eliminate parantheses
        sent = re.sub(r'…', '', sent)           #eliminate ellipses
        sent = re.sub(r'[「」〔〕]', '', sent)   #eliminate quotation marks
        node = mecab.parseToNode(sent)          #parse the sentence
        node = node.next                        #skip the first node returned by the parser
        while node:
            pos_tags = node.feature.split(',')  #turn string of features into a list
            #append the triples for each element of this chunk
            sent_triples.append([node.surface, pos_tags[0], pos_tags[1]])
            node = node.next
        chunk_sents.append(sent_triples)
    return chunk_sents

#returns the percentage of sentences in chunk that end with noun
def noun_ending(pos_chunk, genre):
    """Return the fraction of sentences (longer than two tags) whose final content tag is a noun.

    pos_chunk -- list of sentences, each a list of [surface, tag1, tag2] triples
    genre     -- for any genre other than "SOC" the first and last sentences are discarded
    Returns 0.0 when no sentence qualifies for counting.
    """
    #a list of sentence endings that are marked as nouns, but are actually grammatical function words
    exception_list = ["候", "つた", "ところ", "もの", "サ", "ナア", "こと", "事", "時", "とき", "けた", "云々", "だい"]

    #first and last items are likely fragments outside of SOC passages
    sentences = pos_chunk if genre == "SOC" else pos_chunk[1:-1]

    noun_final = 0   #sentences ending in a (non-exception) noun
    counted = 0      #sentences considered at all
    for sent in sentences:
        if len(sent) <= 2:          #don't look at obvious non-sentences
            continue
        counted += 1
        #walk backwards past EOS markers and sentence-final punctuation (third item in the triple);
        #if everything after the first tag is skipped, the first tag is used as-is
        final_tag = sent[0]
        for candidate in reversed(sent[1:]):
            if not (candidate[1] == 'BOS/EOS' or candidate[2] == '句点'):
                final_tag = candidate
                break
        if final_tag[1] == '名詞' and final_tag[0] not in exception_list:
            noun_final += 1
    return noun_final/counted if counted else 0.0     #normalize
    
#function to check for verbless sentences
def verbless_sents(pos_chunk, genre):
    """Return the fraction of sentences (longer than two tags) that contain no tag marked 動詞 (verb).

    pos_chunk -- list of sentences, each a list of [surface, tag1, tag2] triples
    genre     -- for any genre other than "SOC" the first and last sentences are discarded
    Returns 0.0 when no sentence qualifies for counting.
    """
    #first and last items are likely fragments outside of SOC passages
    sentences = pos_chunk if genre == "SOC" else pos_chunk[1:-1]

    candidates = [sent for sent in sentences if len(sent) > 2]   #don't bother with non-sentences
    if not candidates:
        return 0.0
    without_verb = sum(1 for sent in candidates if not any(tag[1] == '動詞' for tag in sent))
    return without_verb/len(candidates)

#returns ratio of personal pronouns (daimeishi) per sentence, averaged over entire chunk
def per_pronoun_use(pos_chunk, genre):
    """Return the mean per-sentence ratio of pronoun tags to non-punctuation tags.

    pos_chunk -- list of sentences, each a list of [surface, tag1, tag2] triples
    genre     -- for any genre other than "SOC" the first and last sentences are discarded
    A tag counts as a pronoun when its first POS field is 代名詞 or its surface is 自分.
    Returns 0.0 when no sentence contributes a ratio.
    """
    if genre != "SOC":
        pos_chunk = pos_chunk[1:-1]  #eliminate first and last items, since these are likely fragments

    sent_ratios = []
    for sent in pos_chunk:
        if len(sent) <= 2:
            continue
        per_pronouns = sum(1 for tag in sent if tag[1] == "代名詞" or tag[0] == "自分")
        #don't include punctuation in your tag totals
        total_tags = sum(1 for tag in sent if tag[1] != "BOS/EOS" and tag[1] != "補助記号")
        if total_tags != 0:
            sent_ratios.append(per_pronouns/total_tags)

    #check for the empty case up front: np.mean([]) returns NaN and emits a RuntimeWarning
    if not sent_ratios:
        return 0.0
    return np.mean(sent_ratios)

#function that calculates number of sentences per chunk starting with person pronoun (daimeishi)
def per_pronoun_head(pos_chunk, genre):
    """Return the fraction of sentences (longer than two tags) whose first non-punctuation tag is a pronoun.

    pos_chunk -- list of sentences, each a list of [surface, tag1, tag2] triples
    genre     -- for any genre other than "SOC" the first and last sentences are discarded
    A tag counts as a pronoun when its first POS field is 代名詞 or its surface is 自分.
    Returns 0.0 when no sentence qualifies for counting.
    """
    if genre != "SOC":
        pos_chunk = pos_chunk[1:-1]  #eliminate first and last items, since these are likely fragments

    prp_sents = 0
    total_sents = 0
    for sent in pos_chunk:
        if len(sent) <= 2:
            continue
        total_sents += 1
        #first tag that is not punctuation; None when the sentence is punctuation only
        #(an unbounded index walk would run off the end of such a sentence)
        head = next((tag for tag in sent if tag[1] != "補助記号"), None)
        if head is not None and (head[1] == "代名詞" or head[0] == "自分"):
            prp_sents += 1
    if total_sents == 0:
        return 0.0
    return prp_sents/total_sents

#function to identify likely FreeIndirectDiscourse passages, or interior monologue
#takes a chunk that has already been tokenized (spaces inserted)
def fid_finder(chunk):
    """Return the ratio of likely free-indirect-discourse sentences to all non-dialogue sentences.

    chunk -- space-tokenized text; whitespace and quoted dialogue (「...」) are removed before analysis
    A sentence counts once as FID when it ends in a question or exclamation mark, or contains one of
    the sentence-ending phrases below. A final sentence with no terminal punctuation is treated as a
    fragment and left out of the total. Returns 0.0 when there is nothing to count.
    """
    #create a list of all likely FID sentence ending phrases
    fid_phrases = ["ない。","れ。","め。","ろ。","け。","よ。","か。","な。","のだ。","なんだ。","んです。","んだ。","のだった。",
               "何だ。","ところだ。","べきだ。","からだ。","のだから。","ものだ。","のでしょう。","らう。","ろう。","のです。","い。",
               "所だ。","筈だ。","はずだ。","筈です。","はずです。","ことだ。","らしい。","忘れぬ。","にちがいなかった。","わ。",
               "に違いなかった。","ことにした。","らしかった。","ながら。","かしら。","らうか。","ろうか。","気もする。","たら。",
               "てみる。","のやうに。","のように。","かもしれません。","かも知れません。","心地もする。"]

    chunk = re.sub(r'\s', '', chunk)                     #de-tokenize the chunk so we can search for strings later
    chunk = normalize_quotation(chunk)
    chunk = re.sub(r'「[^」]*」', '', chunk)         #eliminate all dialogue passages

    #sent tokenize
    #the pattern is built from adjacent raw-string pieces; a backslash-newline inside a single raw string
    #would put a literal newline plus indentation into the regex and disable the (？」と...) alternative
    sent_pattern = (r'((「)*[^！？。(――)(——)\(\)]+([^」{5,}?]」[^とを])*(」を.*)*(」と[^。]*)*(」、と[^。]*)*'
                    r'(？」と[^。]*)*[！？。」(……)]*)')
    chunk_sents = [match[0] for match in re.findall(sent_pattern, chunk)]   #element 0 is the whole sentence

    #nothing to analyse (empty chunk, or a chunk made only of terminators)
    if not chunk_sents:
        return 0.0

    total_sents = len(chunk_sents)

    def looks_like_fid(sent):
        #question/exclamation endings (the texts use full-width marks; ASCII '?' kept for safety),
        #or any of the FID phrases; a sentence matching both is still counted only once
        return sent[-1:] in ('？', '?', '！') or any(phrase in sent for phrase in fid_phrases)

    num_fid_sents = sum(1 for sent in chunk_sents[:-1] if looks_like_fid(sent))

    #the last sentence may be cut off by the chunk boundary (non-SOC texts)
    last_sent = chunk_sents[-1]
    if last_sent[-1:] in ('。', '？', '?', '！'):
        #properly terminated, so judge it like every other sentence
        if looks_like_fid(last_sent):
            num_fid_sents += 1
    else:
        total_sents = total_sents - 1   #last sentence is fragment, so discount from total num of sentences

    #calculate and return the ratio of FID sentences vs. total number of non-dialogue sentences
    if total_sents == 0:    #in case we get a dud sentence
        return 0.0
    return num_fid_sents/total_sents
    
#an efficient way to look up strings in a large list
from bisect import bisect_left
#the list here needs to be sorted
def bi_contains(lst, item):
    """ efficient `item in lst` for sorted lists

    lst  -- list sorted in ascending order (may be empty)
    item -- value to look for
    Returns True if `item` occurs in `lst`, otherwise False.
    """
    # bisect_left gives the position where item would be inserted; if the item is present
    # it sits exactly there. An index equal to len(lst) means the item is larger than every
    # element (or the list is empty), so there is nothing to compare against.
    index = bisect_left(lst, item)
    return index < len(lst) and lst[index] == item

### Prepare Data Frames to Analyze all Text Chunks for a given Genre

In [4]:
#Read in corpus information for SOC texts and store as DataFrame
#(the keyword is "sheet_name"; the older spelling "sheetname" is no longer accepted by pandas)
df = pd.read_excel(r'Data\SOC_TEXTS_METADATA.xlsx', sheet_name='Sheet1')
df = df.dropna(subset=['WORK_ID'])   #drop any empty rows at end of spreadsheet
df['chunk'] = Series('',index=df.index)    #add column for text chunks
df['chunk_id'] = Series('1',index=df.index)    #not needed, but keeping for consistency with realism corpus
#Label SOC as the "0" class -- all other classes will be labeled "1"
df['class_labels'] = Series('0',index=df.index)

#generate filepaths for each text we have and load in the texts
corpus_path = "Texts\\"

for i in df.index:
    filepath = corpus_path + str(df.WORK_ID[i]) + ".txt"  #assign filepath
    #the with-block closes each text file as soon as it has been read
    with open(filepath, encoding="utf-8") as text:
        raw = text.read()
    raw = re.sub(r'[\u3000\ufeff]', '', raw)    #strip ideographic spaces and byte-order marks
    #now tokenize the chunk and store as tokenized text
    node = mecab.parseToNode(raw)
    node = node.next    #skip the first node returned by the parser
    tokens = []
    while node:
        tokens.append(node.surface)
        node = node.next
    df.at[i, 'chunk'] = ' '.join(tokens)   #store tokenized text as tokens joined by space

#drop the columns we don't need
df.drop(['TRANS', 'SOURCE', 'NATIONALITY','PUBLISHER'], axis=1, inplace=True)
df.shape

(120, 9)

In [10]:
#label-based lookup; .ix has been removed from pandas
df.loc[75].chunk

'可視 的 な もの の 避け 難き 形態 。 それ 以上 で ない に し て も 、 少く とも さう 俺 の 眼 を 通し て 考へ た 。 俺 が 此 處 で 讀む ところ の 總て の もの の 署名 、 海 の 產物 、 海 の 漂流 物 、 近よる 上げ潮 、 あの 錆び た ボオト 。 靑洟 色 、 靑 い 銀色 、 錆 。 彩ら れ た サイン 。 透明 なる もの の 限界 。 然し 彼 は つけ 加へる 、 肉 體 に 於 て 。 で は 彼 は 彩ら れる 前 に それ 等 肉 體 を 知 つ て ゐ た の だ 。 如何に し て ？ 彼 の 頭 を それ 等 に 打付ける こと に よつ て 確か に 。 氣 安く 進む が よい 。 禿頭 で そして 百 萬 長者 で 彼 は あつ た 、 顏 の 識れ た 例 の 先生 。 透明 な もの の 限界 が 來る 。 なぜ 來る か ？ 透明 な 透明 に 。 若し 君 が 五 本 の 指 を 通す こと が 出來る なら 、 それ は 戶 で ない に し て も 、 確か に 門 だ 。 眼 を 閉 ぢ て 、 そして 見ろ 。 ステ イヴン は 彼 の 靴 が 漂流 物 と 貝 殼 を 踏み 碎く 音 を 聞く ため に 眼 を 閉 ぢ た 。 君 は 兎に角 、 それ を 步き とほし て ゐる 俺 は 一 度 に 一 跨ぎ だ 。 空間 の 極く 短い 間 を 通し て の 時間 の 極く 短い 間 を 。 五、六 。 順繰り に 。 正確 に 。 そして それ が 聞える ところ の もの の 避く べから ざる 形態 な の だ 。 眼 を 開け 。 否 。 イエス よ 。 若し 俺 が 海洋 の 上 に 突き出し て ゐる 所 の 斷崖 から 落ちる なら ば 、 不可避 的 に 相 並ん で だ 。 俺 は 闇 の 中 を うまく 進ん で ゆく 。 俺 の アツシユ の 劍 は 俺 の 腰 に 下 つ て ゐる 。 それ で 叩け 。 それ で 宜い 。 靴 を 穿い た 俺 の 二 つ の 足 は 相 並ん で 脚 の 端 に ある 。 創造 の 神 の 槌 に よつ て 造ら れ た 、 この 堅固 な 響 。 俺 は サンデイ マウント の 濱 に 沿 ふて 永遠

In [6]:
#####################################################################
#create dataframe for JUNBUNGAKU, INOVEL, and POPULAR corpora chunks
#####################################################################

#(the keyword is "sheet_name"; the older spelling "sheetname" is no longer accepted by pandas)
temp_df = pd.read_excel(r'Data\ALL_TEXTS_METADATA.xlsx', sheet_name='Sheet1')
temp_df = temp_df.dropna(subset=['WORK_ID'])   #drop any empty rows at end of spreadsheet
#temp_df['num_chunks'] = Series('',index=temp_df.index)   #add column to store number of text chunks

temp_df.shape

#set your chunk_length (in characters)
chunk_length = 1500

#path where all files are kept
#NOTE(review): absolute, machine-specific path -- adjust before running elsewhere
corpus_path = r"C:\Users\Hoyt\Dropbox\JapanCorpusTokenized\\"

#one data frame per text is collected here and concatenated once after the loop
#(concatenating onto a growing frame inside the loop re-copies every earlier row each time)
chunk_frames = []

#iterate through and build a df for each text that contains all of the text chunks
for i in temp_df.index:
    #need to read in file, get number of chunks, and store that information first
    filepath = corpus_path + str(temp_df.WORK_ID[i]) + ".txt"
    #the with-block closes each text file as soon as it has been read
    with open(filepath, encoding="utf-8") as raw:
        text = raw.read()
    text = re.sub(r'[\u3000\ufeff]', '', text)    #strip ideographic spaces and byte-order marks
    text = normalize_quotation(text)
    text = bracket_cleaner(text)
    text = re.sub(r'」 「', '」 。 「', text)  #insert period for back-to-back quotes to help with sent tokenization
    num_chunks = len(text)//chunk_length        #calculate the number of whole chunks
    text = text[:(chunk_length*num_chunks)]     #get rid of trailing text at the end

    #slice and dice and tokenize!
    chunk_list = []       #initiate a master list to store novel chunks
    for j in range(num_chunks):
        text_chunk = text[(j*chunk_length):((j+1)*chunk_length)]
        #now tokenize the chunk and store as tokenized text
        node = mecab.parseToNode(text_chunk)
        node = node.next    #skip the first node returned by the parser
        tokens = []
        while node:
            tokens.append(node.surface)
            node = node.next
        chunk_list.append(' '.join(tokens))         #add to master list

    #now put the chunks and the (repeated) metadata items into a dictionary and create a data frame
    data_f = {'WORK_ID': [temp_df.WORK_ID[i]]*num_chunks,
              'TITLE': [temp_df.TITLE[i]]*num_chunks,
              'AUTH_LAST': [temp_df.AUTH_LAST[i]]*num_chunks,
              'AUTH_FIRST': [temp_df.AUTH_FIRST[i]]*num_chunks,
              'PUBL_DATE': [temp_df.PUBL_DATE[i]]*num_chunks,
              'GENRE': [temp_df.GENRE[i]]*num_chunks,
              'chunk': chunk_list,
              'chunk_id': range(num_chunks),
              'class_labels': [1]*num_chunks}
    chunk_frames.append(pd.DataFrame(data_f, columns=list(data_f)))

#merge the per-text frames into the master data frame
chunked_texts = pd.concat(chunk_frames, ignore_index=True)

#clean any empty or null chunks
mask = chunked_texts["chunk"].isin([''])
chunked_texts = chunked_texts[~mask]

mask = chunked_texts["chunk"].isnull()
chunked_texts = chunked_texts[~mask]

#merge these chunked texts with the SOC chunks
all_df = pd.concat([df, chunked_texts], ignore_index=True)
all_df.shape

(7497, 9)

In [4]:
##############################################
#create dataframe for 1925-1940 Fiction corpus
##############################################

#(the keyword is "sheet_name"; the older spelling "sheetname" is no longer accepted by pandas)
fic_df = pd.read_excel(r'Data\ALL_FIC_METADATA.xlsx', sheet_name='Sheet1')
fic_df = fic_df.dropna(subset=['WORK_ID'])   #drop any empty rows at end of spreadsheet
#temp_df['num_chunks'] = Series('',index=temp_df.index)   #add column to store number of text chunks

fic_df.shape

#set your chunk_length (in characters)
chunk_length = 1500

#path where all files are kept
#NOTE(review): absolute, machine-specific path -- adjust before running elsewhere
corpus_path = r"C:\Users\Hoyt\Dropbox\JapanCorpusTokenized\\"

#one data frame per text is collected here and concatenated once after the loop
#(concatenating onto a growing frame inside the loop re-copies every earlier row each time)
chunk_frames = []

#iterate through and build a df for each text that contains all of the text chunks
for i in fic_df.index:
    #need to read in file, get number of chunks, and store that information first
    filepath = corpus_path + str(fic_df.WORK_ID[i]) + ".txt"
    #the with-block closes each text file as soon as it has been read
    with open(filepath, encoding="utf-8") as raw:
        text = raw.read()
    text = re.sub(r'[\u3000\ufeff]', '', text)    #strip ideographic spaces and byte-order marks
    text = normalize_quotation(text)
    text = bracket_cleaner(text)
    text = re.sub(r'」 「', '」 。 「', text)  #insert period for back-to-back quotes to help with sent tokenization
    num_chunks = len(text)//chunk_length        #calculate the number of whole chunks
    text = text[:(chunk_length*num_chunks)]     #get rid of trailing text at the end

    #slice and dice and tokenize!
    chunk_list = []       #initiate a master list to store novel chunks
    for j in range(num_chunks):
        text_chunk = text[(j*chunk_length):((j+1)*chunk_length)]
        #now tokenize the chunk and store as tokenized text
        node = mecab.parseToNode(text_chunk)
        node = node.next    #skip the first node returned by the parser
        tokens = []
        while node:
            tokens.append(node.surface)
            node = node.next
        chunk_list.append(' '.join(tokens))         #add to master list

    #now put the chunks and the (repeated) metadata items into a dictionary and create a data frame
    data_f = {'WORK_ID': [fic_df.WORK_ID[i]]*num_chunks,
              'TITLE': [fic_df.TITLE[i]]*num_chunks,
              'AUTH_LAST': [fic_df.AUTH_LAST[i]]*num_chunks,
              'AUTH_FIRST': [fic_df.AUTH_FIRST[i]]*num_chunks,
              'PUBL_DATE': [fic_df.PUBL_DATE[i]]*num_chunks,
              'GENRE': [fic_df.GENRE[i]]*num_chunks,
              'chunk': chunk_list,
              'chunk_id': range(num_chunks),
              'class_labels': [1]*num_chunks}
    chunk_frames.append(pd.DataFrame(data_f, columns=list(data_f)))

#merge the per-text frames into the master data frame
chunked_texts = pd.concat(chunk_frames, ignore_index=True)

#clean any empty or null chunks
mask = chunked_texts["chunk"].isin([''])
chunked_texts = chunked_texts[~mask]

mask = chunked_texts["chunk"].isnull()
chunked_texts = chunked_texts[~mask]

#NOTE: no merge with the SOC chunks happens here -- all_df is just the fiction chunks,
#and it replaces any all_df built by an earlier cell
all_df = chunked_texts
all_df.shape

(32626, 9)

### Extract features from all chunks

In [None]:
#iterate through each chunk in the dataframe and calculate values for all features

#Add columns for each feature to be extracted
feature_columns = ['sent_length', 'noun_ending', 'verbless_sents', 'per_pronoun_use', 'per_pronoun_head',
                   'tt_ratio', 'tt_ratio_no_stopwords', 'tt_ratio_no_pn', 'onomatopoeia', 'neologisms',
                   'ellipses', 'fid_ratio']
for column in feature_columns:
    all_df[column] = Series('',index=all_df.index)

#Iterate through all texts and extract features
total_chunks = len(all_df)
#count positions separately from the index labels: after empty chunks are filtered out the labels
#can have gaps, so a label-based counter would not match the number of chunks processed
for position, k in enumerate(all_df.index, start=1):
    chunk = all_df.at[k, 'chunk']
    genre = all_df.at[k, 'GENRE']

    #create a POS tagged version of the chunk
    pos_chunk = pos_tagger(chunk)

    #pass each chunk through all the feature extraction functions
    all_df.at[k, 'sent_length'] = median_sent_length(chunk, genre)
    all_df.at[k, 'noun_ending'] = noun_ending(pos_chunk, genre)
    all_df.at[k, 'verbless_sents'] = verbless_sents(pos_chunk, genre)
    all_df.at[k, 'per_pronoun_use'] = per_pronoun_use(pos_chunk, genre)
    all_df.at[k, 'per_pronoun_head'] = per_pronoun_head(pos_chunk, genre)
    all_df.at[k, 'tt_ratio'] = tt_ratio(chunk)
    all_df.at[k, 'tt_ratio_no_stopwords'] = tt_ratio_no_stopwords(chunk)
    all_df.at[k, 'tt_ratio_no_pn'] = tt_ratio_no_pn(pos_chunk)
    all_df.at[k, 'onomatopoeia'] = onom(chunk)
    all_df.at[k, 'neologisms'] = neo(chunk)
    all_df.at[k, 'ellipses'] = find_ellip(chunk)
    all_df.at[k, 'fid_ratio'] = fid_finder(chunk)

    print("Processed " + str(position) + " of " + str(total_chunks) + " chunks...", end="\r")

all_df.shape

In [None]:
#r'((「)*[^！？。(――)(——)\(\)]+([^」{5,}?]」[^とを])*(」を.*)*(」と[^。]*)*(」、と[^。]*)*(？」と[^。]*)*[！？。」(……)]*)'
#this does a better job than what you had, but still can't handle quote directly followed by another quote (e.g., 1822)

#((「)*[^！？。」(――)(——)\(\)]+([^」{5,}?]」[^とを])*(」を.*)*(」と[^。]*)*(」、と[^。]*)*(？」と[^。]*)*[！？。」(……)]*)
#this works for 1822--need to test on others

#the solution to this is to replace every instance of back to back quotes with quote, period, quote -- this will make
#the sent_tokenizer far more accurate on the whole, and doesn't impact downstream measurements

#we should also calculate median sentence length after removing dialogue

In [39]:
#chunked_texts.loc[(chunked_texts['TITLE'] == 'Chijin no ai') & (chunked_texts['chunk_id'] == 160)]
#label-based lookup; .ix has been removed from pandas
re.sub(r'\s', '', chunked_texts.loc[3214].chunk)

'」。「いや、まあお聞きなさい、別れる時に熊谷が少し気の毒になったんで、「今夜は何処へ泊るんだい」ッてそう云うと、「泊る所なんか幾らもあるわよ。あたしこれから横浜へ行くわ」ッて、ちっともショゲてなんかいないで、そのままスタスタ新橋の方へ行くんだそうです。―――」。「横浜と云うのは、誰の所なんです？」。「そいつが奇妙なんですよ、いくらナオミさんが顔が広いッて、横浜なんかに泊る所はないだろうから、ああ云いながら多分大森へ帰ったんだろうと、そう熊谷が思っていると、明くる日の夕方電話が懸って、「エルドラドオで待っているから直ぐ来ないか」と云う訳なんです。それで行って見ると、ナオミさんが目の覚めるような夜会服を着て、孔雀の羽根の扇を持って、頸飾りだの腕環だのをギラギラさせて、西洋人だのいろんな男に囲まれながら、盛んにはしゃいでいるんだそうです」浜田の話を聞いているとあたかもビックリ箱のようで、「おやッ」と思うような事実がピョンピョン跳び出して来るのです。つまりナオミは、最初の晩は西洋人の所へ泊ったらしいのですが、その西洋人はウィリアム・マッカネルとか云う名前で、いつぞや私が始めてナオミとエルドラドオへダンスに行った時、紹介もなしに傍へ寄って来て、無理に彼女と一緒に踊った、あのずうずうしい、お白粉を塗った、にやけた男がそれだったのです。ところが更に驚くことには、―――これは熊谷の観察ですが、―――ナオミはあの晩泊りに行くまで、そのマッカネルと云う男とは何もそれほど懇意な仲ではなかったのだと云うのです。尤もナオミも、前から内々あの男に思し召しがあったらしい。何しろちょっと女好きのする顔だちで、すっきりとした、役者のような所があって、ダンス仲間で「色魔の西洋人」と云う噂があったばかりでなく、ナオミ自身も、「あの西洋人は横顔がいいわね、何処かジョン・バリに似てるじゃないの」―――ジョン・バリと云うのは亜米利加の俳優で、活動写真でお馴染のジョン・バリモーアのことなのです。―――と、そう云っていたくらいだから、確かにあれに眼を着けていたのだ。或はちょいちょい色眼ぐらいは使ったことがあるかも知れない。それでマッカネルの方でも、「此'

In [12]:
#export dataframe to excel file for analysis in R
import xlsxwriter
import openpyxl
#write a copy without the raw text rather than dropping "chunk" in place:
#the passage-collection code further down still looks passages up through all_df.chunk
#the with-block saves and closes the workbook on exit (ExcelWriter.save() no longer exists in current pandas)
with pd.ExcelWriter(r'Results\AllChunkFeatures.xlsx', engine='xlsxwriter') as writer:
    all_df.drop(['chunk'], axis=1).to_excel(writer, sheet_name='Sheet1')

### Collect and Output 1929 "SOC" Passages

In [28]:
#read in passage metadata
#(the keyword is "sheet_name"; the older spelling "sheetname" is no longer accepted by pandas)
soc_1929_df = pd.read_excel(r'Results\SOC_CHUNKS_1929.xlsx', sheet_name='Sheet1')
soc_1929_df = soc_1929_df.sort_values(['TITLE', 'chunk_id'], ascending=[True, True])

#open file for output; the with-block closes it even if a lookup below fails
with open("1929_SOC_Passages.txt", "w", encoding="utf-8") as f:
    #grab chunks from master data frame and write to file
    for i in soc_1929_df.index:
        row = soc_1929_df.loc[i]    #label-based lookup; .ix has been removed from pandas
        title = row.TITLE
        chunk_id = row.chunk_id
        score = row.score
        auth_last = row.AUTH_LAST
        auth_first = row.AUTH_FIRST

        #get the passage (needs all_df to still carry its "chunk" column; takes the first matching row)
        chunk = all_df.loc[(all_df['TITLE'] == title) & (all_df['chunk_id'] == chunk_id)].chunk.iloc[0]
        chunk = re.sub(r'\s','',chunk)    #de-tokenize before writing

        f.write(title + '(id= ' + str(chunk_id) + '), ' + auth_last + ' ' + auth_first + ': ' + str(score) + '\n\n')
        f.write(chunk + '\n\n\n')