# Feature Extraction Code for Chapter 3

<p style='text-align: justify;'> This notebook is used to extract specific features from Japanese language corpora used in Chapter 3. These include things like the number of pronouns, number of connective words, etc. It also includes basic code for extracting the most distinctive words between two corpora.</p>

### Import Python Libraries

In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from scipy import stats
import sys, operator, os, re
from shutil import copyfile
import collections
from collections import Counter
from scipy.stats import mannwhitneyu
import MeCab  #CHECK "MECABRC" FILE TO SEE WHICH DICTIONARY YOU ARE USING
mecab = MeCab.Tagger("")  #using unidic

### Load All Feature Extraction Functions

In [2]:
#get tokens from a corpus
def get_tokens(dataframe, path):
    """Collect the tokens of every text listed in a corpus table.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with a ``WORK_ID`` column; each value names a file ``<WORK_ID>.txt``.
    path : str
        Prefix prepended verbatim to every file name, so it has to end with a
        path separator.

    Returns
    -------
    list of str
        Tokens of all texts, concatenated in the order of ``dataframe.index``.
        The text is split on every single whitespace character, so runs of
        whitespace produce empty-string tokens.
    """
    all_tokens = []
    for k in dataframe.index:
        #get the tokenized text
        source_text = path + str(dataframe.WORK_ID[k]) + ".txt"
        #the context manager closes the file even if reading fails
        with open(source_text, encoding="utf-8") as raw_text:
            tokenized = raw_text.read()
        tokenized = normalize_quotation(tokenized)    #make dialogue markers consistent
        all_tokens.extend(re.split(r'\s', tokenized)) #split on white space, add to cumulative list
    return all_tokens

#function to normalize quotation marks
def normalize_quotation(text):
    """Return ``text`` with every 『 rewritten as 「 and every 』 rewritten as 」."""
    return text.replace('『', '「').replace('』', '」')

#list of punctuations marks to exclude if needed
puncs = ['、','。','「','」','…','！','――','？','ゝ','『','』','（','）','／','＼','々','ーーー','］','・','ゞ','［','-','─','<',
         '＃','△','※','＊']

#function to clean punctuation marks from text
def remove_punc(text):
    """Delete every mark listed in ``puncs`` from ``text`` and collapse whitespace.

    Each mark is removed as a literal string (not as a regular expression), so
    adding a regex metacharacter to ``puncs`` later cannot change the matching.
    Afterwards every run of whitespace is replaced by a single space.

    Returns the cleaned string.
    """
    for punc in puncs:
        text = text.replace(punc, '')
    return re.sub(r'\s+', ' ', text)    #get rid of double spaces

#function to compute percentage of text made up of dialogue
def percent_dialogue(text):
    """Return the fraction of characters in ``text`` that belong to dialogue.

    Dialogue is every span from 「 to the next 」, brackets included. The
    result is (characters removed with the dialogue) / (total characters).
    An empty ``text`` yields 0.0 instead of dividing by zero.
    """
    if not text:
        return 0.0
    no_quotes = re.sub(r'「[^」]*」', '', text)   #eliminate all dialogue passages
    return (len(text) - len(no_quotes)) / len(text)

#this function calculates the proportion of non-dialogue sentences that contain a first/third person singular pronoun
#must feed it a text that has been tokenized
def pronouns(text):
    """Return the share of non-dialogue sentences containing a listed pronoun.

    ``text`` must be tokenized (tokens separated by whitespace). Dialogue
    spans 「...」 are removed first, then the bigrams 彼 + ら/等/方 are removed
    so that they are not counted as the singular pronoun 彼. The remainder is
    cut into sentences with a regular expression, and a sentence counts when at
    least one of its whitespace-delimited tokens is in the pronoun list.

    Returns counted sentences / all sentences, or 0.0 when no sentence is found
    (previously this case raised ZeroDivisionError).
    """
    terms = {'私','自分','僕','俺','わたくし','あたし','わたし','己','吾','余','おれ','わし','我輩','吾輩','我','わが',
             'ぼく','予','彼','彼女','かれ','彼奴','彼れ'}

    no_dia = re.sub(r'「[^」]*」', '', text)

    #eliminate bigrams that contain 彼 but which are not of interest
    no_dia = re.sub(r'彼\s[ら等方]','',no_dia)

    #NOTE(review): inside [...] the parentheses are ordinary characters, so "(――)" excludes
    #the single characters ( ― ) rather than the two-character dash; the pattern is kept
    #unchanged so that sentence boundaries stay the same as before
    sents = re.findall(r'([^！？。(――)(——)\(\)]+(」を.*)*(」と[^。]*)*(」、と[^。]*)*(？」と[^。]*)*[！？。」(……)]*)', no_dia)

    if not sents:
        return 0.0

    #findall returns one tuple per sentence; element 0 is the whole sentence.
    #any overlap between the sentence tokens and the pronoun terms means a pronoun is found
    counter = sum(1 for sentence in sents
                  if not terms.isdisjoint(re.split(r'\s', sentence[0])))

    return(counter/len(sents))
    
#this function calculates the proportion of thought/feeling words in all lemma of non-dialogue part of text; 
#pass to it a text with spaces removed
def thought(text):
    """Return the share of MeCab nodes whose lemma is a thought/feeling word.

    Spaces are removed from ``text``, dialogue spans 「...」 are stripped, and
    the remaining prose is analysed with the module-level ``mecab`` tagger.
    Every node after the first one is counted; a node also counts as a
    thought word when field 7 of its comma-separated feature string is in the
    term list.

    Fixes compared with the earlier version:
    * the dialogue-free text is what gets parsed (before, the dialogue was
      stripped into a variable that was never used, so dialogue was included);
    * field 7 is only read when the feature string has at least 8 fields
      (the old ``> 6`` guard allowed an IndexError for 7-field features).

    Returns thought nodes / all counted nodes, or 0.0 if nothing was counted.
    """
    terms = ['感ずる','考える','心持ち','気分','心配','気持ち','思う']  #define lemma to search for
    text = re.sub(r' ','',text) #detokenize

    #strip dialogue
    no_dialogue = re.sub(r'「[^」]*」', '', text)

    lemma_count = 0
    thought_count = 0

    #tokenize and check lemma forms
    node = mecab.parseToNode(no_dialogue)
    node = node.next   #skip the first node returned by the parser
    while node:
        features = node.feature.split(',')
        #some words don't have a lemma form; those are counted without checking content
        #NOTE(review): field 7 is presumably the lemma in the UniDic feature layout - confirm against the dictionary in use
        if len(features) > 7 and features[7] in terms:
            thought_count += 1
        lemma_count += 1
        node = node.next

    if lemma_count == 0:
        return 0.0
    return(thought_count/lemma_count)

#this function calculates proportion of non-dialogue sentences that begin with one of the conjunctions or connecting words
#from Kisaka; pass to it a text with spaces removed
def conjuncts(text):
    """
    Finds ratio of coordinating conjunctions （接続詞語彙） in prose portions based on list by Kisaka.

    Dialogue spans 「...」 are removed, the rest is cut into sentences with a
    regular expression, and a sentence is counted once if it begins with any
    term of the list.

    Earlier, a sentence beginning with several overlapping terms (e.g. それでは
    also begins with それで) was counted once per matching term, because the
    loop over the terms was never left after a match. Each sentence now
    contributes at most 1.

    Returns counted sentences / all sentences, or 0.0 when no sentence is found.
    """
    terms = ['さうかと云って','さうしてまた','さうして','さうしたら','さうなると','さればとて','されば','さて','しかし','併し',
             '然らば','然しながら','然し','しかも尚','しかも','然るに','すると','するうち','それでなければ','それにもかかわらず',
             'それにも拘らず','それにしろ','それにまた','それにしても','それに','それから又','それだから','それから','それゆゑ',
             'それ故','それとても','それとも','それでは','それでも','それで','それだのに','それは','それが又','それが','それより',
             'そうかと云って','そうしたら','そうなると','そうして','そうすると','其の代り','そして又','そして','而して','そこで',
             '従って','それなら','であるのに','だから','ですから','即ち','つまりは','つまり','要するに','けれども','けれど','だが',
             'ところが','所で','所が','唯','但し','尤も','もっとも','又','然も','その上','且つ','更には','或は','何故と云へば',
             'これから','かうして','斯うして','こうして','猶且','では','ただ','また','と又','又一方'] #'と','で','が']
    prefixes = tuple(terms)   #str.startswith accepts a tuple of alternatives

    no_dia = re.sub(r'「[^」]*」', '', text)
    sents = re.findall(r'([^！？。(――)(——)\(\)]+(」を.*)*(」と[^。]*)*(」、と[^。]*)*(？」と[^。]*)*[！？。」(……)]*)', no_dia)

    if not sents:
        return 0.0

    #findall returns one tuple per sentence; element 0 is the whole sentence
    counter = sum(1 for sentence in sents if sentence[0].startswith(prefixes))

    return(counter/len(sents))
    
#function to calculate Yule's K, Yule's I, and Guiraud's C; text should be tokenized
def get_metrics(s):
    """ 
    Returns a tuple (K, I, C) with Yule's K, Yule's I and Guiraud's C.
    (cf. Oakes, M.P. 1998. Statistics for Corpus Linguistics.
    International Journal of Applied Linguistics, Vol 10 Issue 2)

    ``s`` is a tokenized text whose tokens are separated by single spaces.

    * I = M1^2 / (M2 - M1) and K = 10000 / I, where M1 is the number of tokens
      and M2 the sum of squared type frequencies. If there are no tokens both
      are NaN; if no type repeats (M2 == M1) I is infinite and K is 0.
    * C = (summed frequency of the 50 most frequent nouns) / (2 * number of
      noun tokens), where nouns are the nodes that the module-level ``mecab``
      tagger labels 名詞. NaN if no noun is found.

    The noun pass parses the argument ``s``; it used to read a global variable
    named ``tokenized`` and therefore only worked inside the extraction loop.
    """
    tokens = [token for token in s.split(' ') if token != '']  #remove blank spaces
    token_counter = collections.Counter(tokens)

    #calculate Yule's metrics
    m1 = sum(token_counter.values())
    m2 = sum(freq ** 2 for freq in token_counter.values())
    if m1 == 0:
        i = k = float('nan')
    elif m2 == m1:
        i = float('inf')
        k = 0.0
    else:
        i = (m1*m1) / (m2-m1)
        k = 1/i * 10000

    #calculate Guiraud's lexical concentration metric
    #NOTE(review): an earlier comment mentioned nouns, adjectives and verbs, but only 名詞 (noun) is collected
    noun_tokens = []
    node = mecab.parseToNode(s)
    node = node.next   #skip the first node returned by the parser

    while node:
        head_tag = node.feature.split(',')[0]
        if head_tag == "名詞":
            noun_tokens.append(node.surface)

        #go to next item
        node = node.next

    noun_counter = collections.Counter(noun_tokens)
    noun_total = sum(noun_counter.values())

    #use this code if you want to set at 50 for every text
    sum_top_50 = sum(count for _, count in noun_counter.most_common(50))  #sum freqs of top 50 words
    if noun_total == 0:
        C = float('nan')
    else:
        C = sum_top_50 / (2 * noun_total)  #divide by 2 * total words

    #use this code if you want the MAX to be a relative percent of vocabulary (e.g. 50%)
    #sorted_counts = noun_counter.most_common()
    #MAX = 10   #start at a reasonably low value

    #while the count of types from 1 to MAX is less than x% of the vocabulary...keep adding to MAX
    #while sum([item[1] for item in sorted_counts[0:MAX]]) < (noun_total * .5):
    #    MAX += 1
    #C = sum([item[1] for item in sorted_counts[0:MAX]]) / (2 * noun_total)

    return (k, i, C)

##############################################
# Functions for Most Distinctive Word Analysis
##############################################

def count_differences(one_tokens, two_tokens):
    """Return {word: relative frequency in corpus one - relative frequency in corpus two}.

    Both arguments are token sequences. The result has one entry for every
    word occurring in either sequence, ordered by first appearance (corpus one
    first, then words new in corpus two). A positive value means the word is
    relatively more frequent in ``one_tokens``.
    """
    #corpus sizes used to normalise the raw counts
    one_N = len(one_tokens)
    two_N = len(two_tokens)

    #per-corpus frequency tables
    one_counts = Counter(token for token in one_tokens)
    two_counts = Counter(token for token in two_tokens)

    #union vocabulary, keeping first-seen order
    vocab = dict.fromkeys(one_counts, 1)
    vocab.update(dict.fromkeys(two_counts, 1))

    #a Counter returns 0 for words missing from one of the corpora
    return {word: one_counts[word]/one_N - two_counts[word]/two_N for word in vocab}

def difference_of_proportions(one_tokens, two_tokens):
    """Print the 30 words most over-represented in each of two token lists.

    Words are ranked by the difference in relative frequency computed by
    ``count_differences``. "Corpus A" is ``one_tokens`` (largest positive
    differences first); "Corpus B" is ``two_tokens`` (most negative first).
    Nothing is returned.
    """
    #ascending order: Corpus B words at the front, Corpus A words at the back
    ranked = sorted(count_differences(one_tokens, two_tokens).items(),
                    key=lambda pair: pair[1])

    print ("More Corpus A:")
    for word, diff in ranked[-30:][::-1]:
        print ("%s\t%s" % (word, diff))

    print("\nMore Corpus B:")
    for word, diff in ranked[:30]:
        print ("%s\t%s" % (word, diff))

# convert a sequence of tokens into counts for each chunkLength-word window
def get_chunk_counts(tokens, chunkLength):
    """Split ``tokens`` into consecutive windows and count the words in each.

    Returns a list with one Counter per window of ``chunkLength`` tokens; the
    last window holds whatever tokens remain and may be shorter.
    """
    return [Counter(tokens[start:start + chunkLength])
            for start in range(0, len(tokens), chunkLength)]

# calculate mann-whitney test for each word in vocabulary
def mann_whitney(one_tokens, two_tokens, chunkLength=500):
    """Run a two-sided Mann-Whitney U test for every word of two token lists.

    Each list is cut into windows of ``chunkLength`` tokens (500 by default,
    the value that used to be fixed inside the function). For each word in
    the union vocabulary, its per-window counts in the first list are compared
    with its per-window counts in the second list.

    Returns a dict {word: p-value}.
    """
    one_chunks = get_chunk_counts(one_tokens, chunkLength)
    two_chunks = get_chunk_counts(two_tokens, chunkLength)

    # vocab is the union of terms in both sets
    vocab = {}
    for chunk in one_chunks + two_chunks:
        for word in chunk:
            vocab[word] = 1

    pvals = {}

    for word in vocab:
        # Note a and b can be different lengths (i.e., different sample sizes)
        #
        # See Mann and Whitney (1947), "On a Test of Whether one of Two Random
        # Variables is Stochastically Larger than the Other"
        # https://projecteuclid.org/download/pdf_1/euclid.aoms/1177730491

        # (This is part of their innovation over the case of equal sample sizes in Wilcoxon 1945)

        # a Counter yields 0 for windows in which the word does not occur
        a = [chunk[word] for chunk in one_chunks]
        b = [chunk[word] for chunk in two_chunks]

        statistic, pval = mannwhitneyu(a, b, alternative="two-sided")

        # We'll use the p-value as our quantity of interest.  [Note in the normal approximation
        # that Mann-Whitney uses to assess significance for large sample sizes, the significance
        # of the raw statistic depends on the number of ties in the data, so the statistic itself
        # isn't exactly comparable across different words]
        pvals[word] = pval

    return pvals
    
# calculate mann-whitney for each word in vocabulary and present the top 30 terms for each group
def mann_whitney_analysis(one_tokens, two_tokens):
    """Print the 30 most significant words for each of two token lists.

    Significance is the Mann-Whitney p-value from ``mann_whitney``. That value
    says how strongly a word's usage differs but not in which direction, so
    the sign of the difference in relative frequency from
    ``count_differences`` decides the group: positive means Corpus A
    (``one_tokens``), zero or negative means Corpus B (``two_tokens``).
    [Direction could also be read off the Mann-Whitney statistic by comparing
    it with the mean len(one_chunks)*len(two_chunks)*0.5.]
    Nothing is returned.
    """
    pvals = mann_whitney(one_tokens, two_tokens)
    differences = count_differences(one_tokens, two_tokens)

    #split the p-values by direction of the frequency difference
    more_in_a = {}
    more_in_b = {}
    for word in pvals:
        if differences[word] > 0:
            more_in_a[word] = pvals[word]
        elif differences[word] <= 0:
            more_in_b[word] = pvals[word]

    print("More Corpus A:\n")
    for word, pval in sorted(more_in_a.items(), key=lambda pair: pair[1])[:30]:
        print("%s\t%.15f" % (word, pval))

    print("\nMore Corpus B:\n")
    for word, pval in sorted(more_in_b.items(), key=lambda pair: pair[1])[:30]:
        print("%s\t%.15f" % (word, pval))

def get_counts(tokens):
    """Return a Counter with the number of occurrences of each item yielded by ``tokens``."""
    #the generator makes sure the items are counted one by one, whatever container is passed
    return Counter(token for token in tokens)

def chi_square(one_counts, two_counts, shared_only=False):
    """Print the words with the highest chi-square scores for two corpora.

    ``one_counts`` and ``two_counts`` map words to frequencies (e.g. the
    Counters returned by ``get_counts``). For each word a 2x2 table is formed
    (word vs. all other words, corpus one vs. corpus two) and scored with the
    simplified formula of Manning and Schuetze (1999),
    https://nlp.stanford.edu/fsnlp/promo/colloc.pdf, equation 5.7.

    With ``shared_only=True`` only words present in both mappings are scored.
    Words are ranked by score; a word is listed under "Corpus A" when its
    relative frequency is higher in ``one_counts``, otherwise under
    "Corpus B". The top 50 of each list are printed; nothing is returned.
    """
    #corpus totals (kept as floats) and the union vocabulary
    vocab = {}
    one_sum = 0.
    for word in one_counts:
        one_sum += one_counts[word]
        vocab[word] = 1
    two_sum = 0.
    for word in two_counts:
        vocab[word] = 1
        two_sum += two_counts[word]

    N = one_sum + two_sum
    restrict_to_shared = (shared_only == True)

    vals = {}
    for word in vocab:
        #optionally only analyze words held in common
        if restrict_to_shared and not (word in one_counts and word in two_counts):
            continue

        #observed 2x2 contingency table
        O11 = one_counts[word]
        O12 = two_counts[word]
        O21 = one_sum - one_counts[word]
        O22 = two_sum - two_counts[word]

        vals[word] = (N*(O11*O22 - O12*O21)**2)/((O11+O12)*(O11+O21)*(O12+O22)*(O21+O22))

    #highest score first, then assign each word to the corpus where it is relatively more frequent
    ranking = sorted(vals.items(), key=lambda pair: pair[1], reverse=True)
    one = []
    two = []
    for word, _ in ranking:
        target = one if one_counts[word]/one_sum > two_counts[word]/two_sum else two
        target.append(word)

    print ("Corpus A:\n")
    for word in one[:50]:
        print("%s\t%s" % (word, vals[word]))

    print ("\n\nCorpus B:\n")
    for word in two[:50]:
        print("%s\t%s" % (word, vals[word]))

### Create DataFrame

In [5]:
#Read in corpus title information and store as DataFrame
#(the sheet is loaded as-is; no columns are filtered at this point)
#os.path.join builds the relative path with the separator of the current OS,
#so the cell also runs outside Windows
df = pd.read_excel(os.path.join('Data', 'Ch3CorpusMetadata.xlsx'), sheet_name='Sheet1')
df.shape

(210, 14)

### Perform Most Distinctive Word Analysis

<p>This section uses code adapted from David Bamman's "Course repo for Applied Natural Language Processing": </p>

- https://github.com/dbamman/anlp19/blob/master/2.distinctive_terms/CompareCorpora.ipynb

In [5]:
#subset the main dataframe into each sub-corpus
inovel_df = df[df['GENRE'] == "SHISHOSETSU"]
popular_df = df[df['GENRE'] == "POPULAR"]
junbun_df = df[df['GENRE'] == "JUNBUNGAKU"]

#folder prefix for the tokenized texts; joining with '' appends the separator of the
#current OS (the earlier hard-coded backslashes only worked on Windows)
CORPUS_PATH = os.path.join('Texts', '')

# create lists of all tokens for each sub-corpus
inovel_tokens = get_tokens(inovel_df, CORPUS_PATH)
popular_tokens = get_tokens(popular_df, CORPUS_PATH)
junbun_tokens = get_tokens(junbun_df, CORPUS_PATH)

In [12]:
##############################################################
# 1. simple difference of proportions on two corpora
#    (Corpus A = inovel_tokens, Corpus B = popular_tokens)
##############################################################

difference_of_proportions(one_tokens=inovel_tokens, two_tokens=popular_tokens)

###########################################################################
# 2. Mann-Whitney rank-sum test to account for burstiness of language
#    (disabled; uncomment to run)
###########################################################################

#mann_whitney_analysis(inovel_tokens, popular_tokens)

###############################################################################
# 3. chi-square test analysis to get statistically significant differences
#    (disabled; uncomment to run)
###############################################################################

#inovel_counts = get_counts(inovel_tokens)
#popular_counts = get_counts(popular_tokens)
#junbun_counts = get_counts(junbun_tokens)

#chi_square(inovel_counts, popular_counts, shared_only=True)

More Corpus A:
た	0.008464677355388743
て	0.006348733243627211
に	0.00539732011647534
。	0.003565800953577933
も	0.003134316006738769
は	0.002908110455997044
私	0.00250026484079847
し	0.002369025702906877
自分	0.0021595991966951887
来	0.001963810225293574
い	0.0017961961325462916
彼	0.0016827352988106692
葉子	0.0014905658319116402
こと	0.0014085543137520875
さん	0.0013171134646960867
から	0.0011927711984728003
言っ	0.0011483140031405847
やう	0.0011224396509807028
な	0.0010997662683195682
岸本	0.0010863011813830958
彼女	0.0010756988275525102
家	0.0010287723885635328
を	0.0009676742981517515
時	0.000894351460903489
見	0.0008481075088779251
や	0.0008308829712061142
なかっ	0.0007923053309747867
さう	0.0007573782395027515
行っ	0.000731121440607054
たり	0.0007079315249316752

More Corpus B:
、	-0.025359910287495617
―	-0.003444489353164884
」	-0.0028542691337580823
「	-0.002777770822428416
…	-0.0023036701867795344
！	-0.0015075652594994339
が	-0.0014444628133618402
ます	-0.001185553164912914
と	-0.0011157303634891091
この	-0.0011040045505926826


In [14]:
#word-frequency tables for the three sub-corpora
inovel_counts, popular_counts, junbun_counts = (
    get_counts(corpus_tokens)
    for corpus_tokens in (inovel_tokens, popular_tokens, junbun_tokens)
)

#chi-square comparison of I-novel (Corpus A) and popular (Corpus B), restricted to words found in both
chi_square(inovel_counts, popular_counts, shared_only=True)

Corpus A:

葉子	3741.239886097401
自分	3456.3674316629695
やう	3163.2627825891313
私	3026.194237013213
岸本	2901.6842687501066
た	2755.4908197530517
さう	2143.1128902143737
言っ	2141.6441496370862
来	1980.0987123349432
彼	1938.3137604336773
いふ	1861.5259694500432
彼女	1817.2414585661218
三吉	1525.780062322704
て	1520.363199281394
う	1384.6787104347711
あつ	1365.7634129543894
伸子	1331.6143705216043
家	1190.650884222771
に	1110.5784590973012
さん	1107.820461253785
姉	1101.4741088148708
なつ	1056.9478080343765
子供	1012.0081548945462
かつ	1007.7519688871726
母	995.9967646566339
好い	969.8454879150677
も	946.9141647519618
つた	936.7628155286263
母親	931.7803450410457
たり	927.5449187514165
其	912.9679399219257
成っ	912.3537874524854
細君	908.4810266148512
い	817.3760798138065
心	806.7149115206053
叔父	781.0488942691899
居	738.8683554466102
島	689.0937401022709
し	670.5312312548826
や	660.3146400878035
行っ	648.9603887750385
日	638.5831944355741
父	630.3019017852066
行つ	625.8642350422926
見	623.848189880811
なぞ	607.9131084244331
時	607.1290384633511
こと	600.

### Extract Features
<p style='text-align: justify;'>The following cell calculates a portion of the total features contained in the overall model. The other features, particularly those related to lexical diversity and entropy, are calculated in a separate R file (GetFeatures.R). After running both, they must be joined manually into a single spreadsheet for analysis in R.</p>

In [None]:
#Add columns for each feature to be extracted
#drop any empty rows at end of the sheet; the explicit copy keeps the new columns
#on a frame that is independent of the one it was filtered from
df = df.dropna(subset=['WORK_ID']).copy()
FEATURE_COLUMNS = ['pronouns', 'thought', 'textlength', 'dialogue',   #'textlength' holds the text length feature
                   'YulesK', 'YulesI', 'GuiraudC', 'conjuncts']
for feature in FEATURE_COLUMNS:
    #object dtype lets the '' placeholder be overwritten with numbers through df.at
    df[feature] = Series('', index=df.index, dtype=object)

#Point to folder where tokenized texts are stored
#(joining with '' appends the path separator of the current OS)
CORPUS_PATH = os.path.join('Texts', '')

#Iterate through all texts and extract features
for k in df.index:
    #get the tokenized text; the context manager closes the file after reading
    source_text = CORPUS_PATH + str(df.WORK_ID[k]) + ".txt"
    with open(source_text, encoding="utf-8") as raw_text:
        tokenized = raw_text.read()

    #do some preprocessing
    tokenized = normalize_quotation(tokenized)  #make dialogue markers consistent

    #create untokenized version for certain functions
    untokenized = re.sub(r'\s', '', tokenized)

    #calculate text length
    df.at[k, 'textlength'] = len(untokenized)

    #calculate amount of dialogue
    df.at[k, 'dialogue'] = percent_dialogue(untokenized)

    #calculate use of first/third person singular pronouns per sentence
    df.at[k, 'pronouns'] = pronouns(tokenized)

    #calculate thought/feeling words
    df.at[k, 'thought'] = thought(untokenized)

    #calculate amount of conjunctions
    df.at[k, 'conjuncts'] = conjuncts(untokenized)

    #strip punctuation for the next metrics
    tokenized = remove_punc(tokenized)

    #calculate alternative metrics
    df.at[k, 'YulesK'], df.at[k, 'YulesI'], df.at[k, 'GuiraudC'] = get_metrics(tokenized)

#report the number of texts handled (the last index label is not a count)
print("Processed " + str(len(df.index)) + " files")

### Print Results to Excel File

In [52]:
import xlsxwriter
import openpyxl
#the context manager saves and closes the workbook on exit; an explicit writer.save()
#is not available in pandas 2.x
with pd.ExcelWriter(os.path.join('Results', 'python_extracted_features_temp.xlsx'), engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet1')

### Most Distinctive Words in Low versus High Entropy I-Novel Segments

In [6]:
#identify files where chunks are stored
lowent_file = os.path.join("Results", "LowEntJP.txt")
highent_file = os.path.join("Results", "HighEntJP.txt")

def _load_segment_tokens(filename):
    """Read one file of text segments and return its tokens with punctuation removed.

    The file content is cleaned with remove_punc and then split on single
    spaces. The file is closed as soon as it has been read.
    """
    with open(filename, encoding="utf-8") as handle:
        raw = handle.read()
    raw = remove_punc(raw)            #also collapses every whitespace run to one space
    raw = re.sub(r'\n\n', ' ', raw)   #kept from the original cleaning sequence
    return re.split(r' ', raw)

#clean and segment texts
lowent_tokens = _load_segment_tokens(lowent_file)
highent_tokens = _load_segment_tokens(highent_file)

In [5]:
#word-frequency tables for the low- and high-entropy segments
lowent_counts, highent_counts = (
    get_counts(segment_tokens) for segment_tokens in (lowent_tokens, highent_tokens)
)

#chi-square comparison of low-entropy (Corpus A) and high-entropy (Corpus B), restricted to words found in both
chi_square(lowent_counts, highent_counts, shared_only=True)

Corpus A:

自分	706.1165592733796
葉子	536.1075912981897
節子	413.0830488376388
岸本	386.2202708914681
こと	372.54725007840517
私	326.6784203403483
た	321.28330285422146
い	311.08704350919453
僕	298.1244569781743
て	245.09041418746446
いる	243.95155539634825
なかっ	225.0414831298667
倉地	223.72963627947811
それ	217.91662742723412
鶴	204.61934483093697
その	203.57436429299827
思っ	194.48633581315667
いっ	180.08960389765693
事	176.61025827786648
だっ	168.01504067535515
は	160.0586214236515
いう	142.82615325959543
柳沢	141.60228156517195
彼	141.11430961978004
庸三	123.14606248291788
宮	117.92719671265124
しかし	115.86096915291952
ない	114.85017469710183
に	114.16456581793706
よう	114.14766401850966
父	112.96121312004487
叔父	111.53520621139434
須山	110.29175255834028
あっ	109.63838289547652
然し	101.52968594351951
義雄	101.25784248947815
書い	95.61663819220952
児島	94.60439673365246
よこし	93.06872888415718
の	90.93068191525252
行っ	88.80795800119355
あなた	86.97916687552245
しまっ	84.8838988929875
ある	83.10674038362012
繁	79.17402996488215
岡	73.1111524448354
心	70.50

In [34]:
#calculate average length of passage
import numpy as np

lowent_file = r"Results/LowEntJP.txt"
CHARS_PER_PAGE = 400   #divisor for the second printed figure; presumably characters per manuscript page - confirm

#read the segments; the context manager closes the file after reading
with open(lowent_file, encoding="utf-8") as handle:
    raw = handle.read()

#passages are separated by blank lines; the spaces between tokens are not counted
chunks = re.split(r'\n\n', raw)
char_length = [len(re.sub(r' ', '', chunk)) for chunk in chunks]

mean_length = np.mean(char_length)
print(mean_length, mean_length/CHARS_PER_PAGE)

1482.41131498 3.70602828746
