In [1]:
#### imports

import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from scipy import stats
import os
import codecs
import sys
import re
from shutil import copyfile
from __future__ import division
#from __future__ import print_function
#sys.stdout = codecs.getwriter('utf_8')(sys.stdout)
#sys.stdin = codecs.getreader('utf_8')(sys.stdin)
import MeCab  #CHECK "MECABRC" FILE TO SEE WHICH DICTIONARY YOU ARE USING
mecab = MeCab.Tagger("")  #using unidic
#mecab = MeCab.Tagger("-Ochasen")  #using MeCab's ipadic
import collections
import operator
import nltk
import math

In [2]:
#some cleaning and pre-processing functions for Japanese corpus

#this first one should only be used for non-tokenized texts; basically cleans them for tokenization step
def strip_chap_titles(raw):
    #get rid of chapter titles that use Chinese numbers with or without surronding parantheses
    raw = re.sub(r'（*([一二三四五六七八九十])+(）)*\n', '', raw)
    #get rid of chapter titles that use utf-8 alpha-numeric numbers
    raw = re.sub(r'[１-９]+\n', '', raw)
    raw = re.sub(r'[第弐拾章参壱一二三四五六七八九十]+\n', '', raw)
    raw = re.sub(r'『', r'「', raw)   #replace all 『 with 「
    raw = re.sub(r'』', r'」', raw)   #replace all 』 with 」
    raw = re.sub(r'\n', '', raw)  #strips all newlines
    raw = re.sub(r'\r', '', raw)  #strips all returns
    #raw = re.sub(r'\s', '', raw)
    return raw

#might need to run this separately for already tokenized files
def bracket_cleaner(raw):
    raw = re.sub(r'［[^］]+］', '', raw)   #replace annotations in brackets ([#...])
    raw = re.sub(r'\s+', ' ', raw)                         #get rid of double spaces
    return raw

puncs = ['、','。','「','」','…','！','――','？','ゝ','『','』','（','）','／','＼','々','ーーー','］','・','ゞ','［','<','〔','〕',
         '＃','△','※','＊']

def cleaner(text):
    for punc in puncs:
        text = re.sub(punc, '', text)
    text = re.sub(r'\s+', ' ', text)                         #get rid of double spaces
    return text

def remove_stopwords(tokens, stopwords):
    new_list = [token for token in tokens if token not in stopwords]
    return new_list

#this function computes percentage of text that is dialogue
def percent_dialogue(text):
    no_quotes = re.sub(r'「[^」]*」', '', text)   #eliminate all dialogue passages for single bracket quotes
    per_dialogue = (len(text)-len(no_quotes))/len(text)
    return per_dialogue

def punct(text):
    punctuation = ['、','。','…','！','？']
    count = 0
    
    #strip dialogue
    no_dialogue = re.sub(r'「[^」]*」', '', text)
    
    #search and tabulate first and third person usage
    for word in punctuation:
        instances = re.findall(word, no_dialogue)
        count += len(instances)

    return count/len(no_dialogue)

def geo_words(text):
    path = r'C:\Users\Hoyt\Dropbox\SemanticsRace\Empire_Place_Names.txt'
    f = open(path, encoding='utf-8')
    words = f.read()
    geo_words = re.split(r'\n', words)
    
    count = 0
    
    #grab instances of "colonies" first
    instances = re.findall(r'植民　地', text)
    count += len(instances)
    
    #tokenize text and search for geographic names
    tokens = re.split(r'\s', text)
    
    for word in geo_words:
        if word in tokens:
            count += tokens.count(word)
    
    return count    
    
def race_words(text, race_terms):
    #race_words = ['西洋人','外国人']    #'朝鮮人','中国人','西洋人','黒人','部落民','土人','東洋人','外国人','アイヌ-Ainu']
    count = 0
    
    #search and tabulate
    for word in race_terms:
        instances = re.findall(word, text)
        count += len(instances)
    
    #tokens = re.split(r'\s', text)

    return count  #/len(tokens)

def get_stopwords(path):
    f = open(path, encoding='utf-8')
    words = f.read()
    return re.split(r'\n', words)

In [2]:
#LOAD in metadata for the kindai zasshi corpus

df = pd.read_excel(r'Kindai_Meta.xlsx', sheetname='Sheet1')
#df = df[df['YEAR'] < 1960]
#df = df[df['YEAR'] > 1875]
df = df[df['FILTER'] != 'YES']

#add features that you are measuring
#df['GEO_WORDS'] = Series('',index=df.index)
df['RACEWORDS'] = Series('',index=df.index)
df['JAPANESE'] = Series('',index=df.index)

df.shape

(8867, 9)

## Get Token and Type Count

In [8]:
df['TOKENS'] = Series('', index=df.index)

CORPUS_PATH = './KindaiLemmaMerge/'
all_tokens = []

for k in df.index:
    #get the tokenized text
    source_text = CORPUS_PATH + str(df.FILE_ID[k])
    raw_text = open(source_text, encoding="utf-8")       #grab text
    text = raw_text.read()
    
    #remove punctuation
    text = cleaner(text)
    
    #split the text into a list of individual tokens
    tokens = re.split(r' ', text)
    
    #keep track of tokens per document
    df.at[k, 'TOKENS'] = len(tokens)
    
    #add to global list
    all_tokens += tokens 

print(len(all_tokens))
types = set(all_tokens)
print(len(types))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


21964304
122951


In [12]:
import xlsxwriter
import openpyxl
writer = pd.ExcelWriter(r'C:\Users\Hoyt\Dropbox\SemanticsRace\KindaiMetaTemp.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1')
writer.save()

# Count selected keywords in the corpus

In [8]:
###########################################
#code to COUNT selected keywords in corpus
############################################

CORPUS_PATH = r"KindaiLemmaMerge\\"

#keywords_dict = {'中国人':[0,0], '中国 人':[0,0], '中国 民族':[0,0],'中国 民衆':[0,0],'中国 の 民衆':[0,0],'中国 の 人':[0,0],
#                 '中国 の 人々':[0,0],'シナ 人':[0,0],'支那 人':[0,0],'支那 の 人':[0,0],'支那 の 民族':[0,0],
#                 '支那 の 民衆':[0,0], '華僑':[0,0]}

#keywords_dict = {'朝朝鮮人':[0,0], '朝鮮 人':[0,0],'朝鮮 民族':[0,0],'朝鮮 の 人':[0,0],'朝鮮 の 人々':[0,0],'朝鮮 民族':[0,0],
#                 '鮮人':[0,0],'在日':[0,0], '韓国 人':[0,0], '韓人':[0,0]}

#keywords_dict = {'西洋 人':[0,0],'西洋 の 人':[0,0],'白人':[0,0],'ハイカラ':[0,0],'毛唐':[0,0],'外国 人':[0,0],'外人':[0,0],
#                 '欧州 人':[0,0],'オウシュウ-外国 人':[0,0],'外 人':[0,0]}

#keywords_dict = {'日本 人':[0,0],'日本 の 人々':[0,0],'日本 の 人':[0,0],'大和 民族':[0,0],'大和 （ やまと ） 民族':[0,0],
#                 '日本 民族':[0,0],'日本 の 民族':[0,0],'日本 民衆':[0,0],'日本 の 民衆':[0,0],'日本 国民':[0,0],
#                 '日本 の 国民':[0,0],'ジャップ':[0,0],'和人':[0,0],'内地 人':[0,0],'日系':[0,0], '邦人':[0,0]}

#keywords_dict = {'穢 多':[0,0],'非人':[0,0],'新 平民':[0,0],'部落 民':[0,0],'平民':[0,0],'アイヌ':[0,0],'偉人':[0,0],
#                 '蝦夷':[0,0],'気違い':[0,0],'リュウキュウ 人':[0,0],'オキナワ 人':[0,0],'オキナワ の 人':[0,0],'四足':[0,0]}

#keywords_dict = {'黒人':[0,0], '黒ん坊':[0,0], 'ニグロ':[0,0], 'アフリカ 人':[0,0], '土人':[0,0],'混血':[0,0]} 

#keywords_dict = {'アジア 人':[0,0],'東洋 人':[0,0],'東洋 の 人':[0,0],'黄 人':[0,0],'黄 人種':[0,0],'有色 人種':[0,0],
#                 '異 人種':[0,0]}

#keywords_dict = {'蕃人':[0,0], '蛮族':[0,0], '生蕃':[0,0], '熟蕃':[0,0], 'タカサゴ 族':[0,0], '本島 人':[0,0],
#                 '蛮人':[0,0], '野蛮 人':[0,0]}

#keywords_dict = {'朝鮮人':[0,0], '中国人':[0,0], '西洋人':[0,0], '日本人':[0,0], '黒人':[0,0], '部落民':[0,0],
#                 '土人':[0,0], '東洋人':[0,0], '外国人':[0,0]}

keywords_dict = {'漢人':[0,0], '夷狄':[0,0], '韓民':[0,0], '清国 人':[0,0],'清国 民':[0,0], '夷':[0,0], 'チュウゴク 人':[0,0]}
                                 
#iterate through all texts and count keywords; also keep track of number of texts in which keyword appears
for k in df.index:
    #get the tokenized text
    source_text = CORPUS_PATH + df.FILE_ID[k]
    raw_text = open(source_text, encoding="utf-8")       #grab text
    raw = raw_text.read()
    
    for key in keywords_dict:
        count = re.findall(key, raw)
        keywords_dict[key][0] += len(count)  #first element is total count
        #keep track of number of docs
        if len(count) > 0:
            keywords_dict[key][1] += 1      #second element is doc count
            #print(df.FILE_ID[k])
        
print(keywords_dict)

{'漢人': [64, 32], '夷狄': [57, 25], '韓民': [18, 9], '清国 人': [75, 47], '清国 民': [12, 9], '夷': [658, 284], 'チュウゴク 人': [43, 26]}


# Remove spaces between key race terms

In [9]:
# condense key race terms into single word units

CORPUS_PATH = r"kindai_lemma_raw\\"
OUTPUT_PATH = r"kindai_lemma_merge_race_terms\\"

#keep in mind that order is important in some of these cases (e.g., tuples 3 and 4)

race_terms = [('朝鮮 人', '朝鮮人'), ('朝鮮 民族', '朝鮮民族'), ('朝鮮 の 人 々','朝鮮の人々'), ('朝鮮 の 人','朝鮮の人'),
             ('朝鮮 の 人達','朝鮮の人達'),('半島 の 人達','半島の人達'),('鮮人 達','鮮人達'),('セン 女','鮮女'),('半島 人','半島人'),
             ('中国 人', '中国人'), ('中国 民族','中国民族'),('中国 民衆', '中国民衆'),('中国 の 民衆','中国の民衆'),
             ('中国 の 人 々','中国の人々'),('中国 の 人','中国の人'),('支那 人','支那人'),('支那 の 人','支那の人'),
             ('支那 の 民族','支那の民族'), ('支那 の 民衆','支那の民衆'), ('西洋 人','西洋人'), ('西洋 の 人','西洋の人'),
             ('西洋 の 人　々','西洋の人々'), ('オウシュウ-外国 人','欧州人'), ('外国 人','外国人'), ('日本 人','日本人'), 
             ('日本 の 人 々','日本の人々'), ('日本 の 人','日本の人'), ('ヤマト 民族','ヤマト民族'), ('日本 民族','日本民族'),
             ('日本 の 民族','日本の民族'), ('日本 民衆','日本民衆'), ('日本 の 民衆','日本の民衆'),('日本 国民','日本国民'),
             ('日本 の 国民','日本の国民'), ('ジャップ-Jap', 'ジャップ'), ('内地 人','内地人'),('新 平民','新平民'),('部落 民','部落民'),
             ('アフリカ-Africa 人', 'アフリカ-Africa人'), ('東洋 人','東洋人'),('東洋 の 人','東洋の人'),('黄 人','黄人'),
             ('黄 人種','黄人種'),('有色 人種','有色人種'), ('アジア-Asia 人','アジア-Asia人'),('野蛮 人','野蛮人'),
             ('本島 人','本島人'),('タカサゴ 族','タカサゴ族'),('リュウキュウ 人','リュウキュウ人'), ('オキナワ 人','オキナワ人')]

altered_docs = 0
change = 0

for k in df.index:
    #get the text
    source_text = CORPUS_PATH + df.FILE_ID[k]
    raw_text = open(source_text, encoding="utf-8")       #grab text
    raw = raw_text.read()
    
    for pair in race_terms:
        if re.search(pair[0], raw):
            change = 1
            raw = re.sub(pair[0], pair[1], raw)  #replace all instances of the race term with condensed version
    
    if change == 1:
        altered_docs += 1
    
    #now print back out
    with open(OUTPUT_PATH + df.FILE_ID[k], "w", encoding="utf-8") as f:
        f.write(raw)
        f.close()
        
    change = 0

print(altered_docs)

2890


# Unify Race Terms into Single Race Terms

In [22]:
# Merge the race terms into single terms in the corpus

CORPUS_PATH = r"kindai_lemma_merge_race_terms\\"

unified_terms = ['朝鮮人','中国人','西洋人','外国人','日本人','部落民','黒人','東洋人','土人']
#set group number based on element in above list
race_group = 8

#note that I have already eliminated spaces for these words in the above cell; 
#otherwise you would need to include spaces according to how unidic tokenizes these words 

all_merge_terms = [['朝鮮民族','朝鮮の人々','朝鮮の人',' 鮮人','半島人','朝鮮の人達','半島の人達','鮮人達','鮮女','在日','ヨボ',
                    '韓人'],
                   ['中国民族','中国民衆','中国の民衆','中国の人','中国の人々','シナ 人','支那人','支那の人','支那の民族',
                    '支那の民衆','華僑'],
                   ['西洋の人','白人','毛唐','欧州人','西洋の人々'],
                   ['外人','異人'],
                   ['日本の人々','日本の人','大和民族','日本民族','日本の民族','日本民衆','日本の民衆','日本国民',
                    '日本の国民','ジャップ','和人','内地人','邦人'],
                   ['穢多','非人','新平民'],
                   ['黒ん坊','アフリカ-Africa人','ニグロ'],
                   ['東洋の人','黄人','黄人種','有色人種','アジア-Asia人'],
                   ['蕃人','蛮族','蕃族','生蕃','熟蕃','タカサゴ族','本島人',' 蛮人','野蛮人']]

#select the set of merge terms based on race group
merge_terms = all_merge_terms[race_group]

#initialize some counters
altered_docs = 0
change = 0

for k in df.index:
    #get the text
    source_text = CORPUS_PATH + df.FILE_ID[k]
    raw_text = open(source_text, encoding="utf-8")       #grab text
    raw = raw_text.read()
    
    #insert exceptions for 鮮人 and 蛮人 since they require an aditional space in front
    for word in merge_terms:
        if re.search(word, raw):
            if word == ' 鮮人':
                change = 1
                raw = re.sub(word, ' ' + unified_terms[race_group], raw)
            elif word == ' 蛮人':
                change = 1
                raw = re.sub(word, ' ' + unified_terms[race_group], raw)
            else:
                change = 1
                raw = re.sub(word, unified_terms[race_group], raw)
    
    if change == 1:
        altered_docs += 1
    
    #now print back out
    with open(CORPUS_PATH + df.FILE_ID[k], "w", encoding="utf-8") as f:
        f.write(raw)
        f.close()
        
    change = 0

print(altered_docs)

183


In [8]:
#if you're working on unidic tokenized corpus
CORPUS_PATH = r"KindaiLemmaMerge\\"

race_terms = ['朝鮮人', '中国人', '西洋人','部落民','土人']
japanese = ['日本人']

for k in df.index:
    #get the tokenized text
    try:
        source_text = CORPUS_PATH + df.FILE_ID[k]
        raw_text = open(source_text, encoding="utf-8")       #grab text
        raw = raw_text.read()
        #df.GEO_WORDS.loc[k] = geo_words(raw)
        df.RACEWORDS.loc[k] = race_words(raw, race_terms)
        df.JAPANESE.loc[k] = race_words(raw, japanese)
    except:
        #df.GEO_WORDS.loc[k] = 0
        df.RACEWORDS.loc[k] = 0
        df.JAPANESE.loc[k] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [11]:
df[5000:5010]

Unnamed: 0,FILE_ID,MAGAZINE,YEAR,TITLE,LENGTH,FILTER,TOKENS,RACEWORDS,JAPANESE
19276,19276.txt,太陽,1917,10-018_聴診器の響_口語_b,1096,,1024.0,0,0
19277,19277.txt,太陽,1917,10-019_戦時の伯剌西爾と日本_口語_b,4804,,4494.0,0,2
19278,19278.txt,太陽,1917,10-020_新刊紹介_口語_b,369,,337.0,0,0
19279,19279.txt,太陽,1917,10-021_強国と成る可き根本大策（工業教育の振興）_口語_b,10518,,9998.0,0,3
19280,19280.txt,太陽,1917,10-022_新刊紹介_口語_b,187,,161.0,0,0
19281,19281.txt,太陽,1917,10-023_羅馬法皇の講和提議_口語_b,1370,,1268.0,0,0
19282,19282.txt,太陽,1917,10-024_日英の経済的関係改善論_口語_b,4413,,4066.0,0,2
19283,19283.txt,太陽,1917,10-025_噂の立ちぎき_口語_b,678,,639.0,0,0
19284,19284.txt,太陽,1917,10-026_新帝国技芸員評判記（下）_口語_b,1384,,1251.0,0,0
19285,19285.txt,太陽,1917,10-027_新刊紹介_口語_b,1270,,1156.0,0,2


In [23]:
import xlsxwriter
import openpyxl
writer = pd.ExcelWriter(r'C:\Users\Hoyt\Dropbox\SemanticsRace\Temp.xlsx', engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1')
writer.save()

In [103]:
all_terms = keywords_dict

In [72]:
all_terms = {**korea, **china, **west, **japan, **other, **black, **asia}  #merge all dictionaries
print(all_terms)

{'朝鮮 人': [147, 40], '朝鮮 民族': [4, 1], '朝鮮 の 人': [9, 7], '朝鮮 の 人々': [0, 0], '鮮人': [33, 10], '在日': [1, 1], '中国人': [221, 39], '中国 民族': [2, 2], '中国 民衆': [0, 0], '中国 の 民衆': [2, 2], '中国 の 人': [17, 7], '中国 の 人々': [5, 2], 'シナ 人': [4, 4], '支那 人': [612, 109], '支那 の 人': [8, 4], '支那 の 民族': [2, 2], '支那 の 民衆': [7, 3], '西洋 人': [342, 139], '西洋 の 人': [7, 6], '白人': [408, 53], 'ハイカラ': [288, 134], '毛唐': [257, 61], '外国 人': [394, 137], '外人': [283, 73], '欧州 人': [2, 2], '日本人': [1843, 310], '日本 の 人々': [5, 4], '日本 の 人': [50, 34], '大和 民族': [22, 14], '大和 （ やまと ） 民族': [0, 0], '日本 民族': [56, 21], '日本 の 民族': [2, 2], '日本 民衆': [1, 1], '日本 の 民衆': [2, 2], '日本 国民': [39, 26], '日本 の 国民': [5, 5], 'ジャップ': [14, 7], '和人': [11, 2], '内地 人': [31, 8], '日系': [98, 7], '穢 多': [152, 7], '非人': [261, 108], '新 平民': [52, 4], '部落 民': [13, 5], '平民': [289, 74], 'アイヌ': [162, 24], '偉人': [84, 50], '蝦夷': [222, 43], '気違い': [292, 119], '琉球 人': [8, 5], '沖縄 人': [2, 1], '沖縄 の 人': [0, 0], '四足': [52, 38], '黒人': [424, 56], '黒ん坊': [107, 31], 'アフリカ 人': [2, 

In [29]:
terms = DataFrame.from_dict(all_terms, orient='index')
terms.columns = ['total_count', 'num_of_docs']

import xlsxwriter
import openpyxl
writer = pd.ExcelWriter(r'C:\Users\Hoyt\Dropbox\SemanticsRace\Race_Terms_Unified.xlsx', engine='xlsxwriter')
terms.to_excel(writer, sheet_name='Sheet1')
writer.save()

In [11]:
CORPUS_PATH = r"C:\Users\Hoyt\Dropbox\SemanticsRace\UnidicCorpus\\"
source_text = CORPUS_PATH + str(df.WORK_ID[0]) + ".txt"
raw_text = open(source_text, encoding="utf-8")       #grab text
raw = raw_text.read()

if re.search(r'怪談', raw):
    print("true")

true


# Merge Race Terms into Unified Terms

In [11]:
# Merge the race terms into single terms in the corpus

CORPUS_PATH = r"C:\Users\Hoyt\Dropbox\SemanticsRace\UnidicCorpusLemma\\"

unified_term = '朝鮮人'

merge_terms = ['半島 人']

#merge_terms = ['朝鮮 人', '朝鮮 民族', '朝鮮 の 人', '朝鮮 の 人 々', '鮮人']

#merge_terms = ['中国 人','中国 民族','中国 民衆','中国 の 民衆','中国 の 人','中国 の 人々','シナ 人','支那 人','支那 の 人',
#               '支那 の 民族','支那 の 民衆']

#merge_terms = ['中国 人','中国 民族','中国 民衆','中国 の 民衆','中国 の 人','中国 の 人 々','支那 人','支那 の 人',
#               '支那 の 民族','支那 の 民衆']

#merge_terms = ['西洋 人','西洋 の 人','白人','毛唐','オウシュウ-外国 人']    #'欧州 人']
#merge_terms = ['外国 人','外人','異人']

#merge_terms = ['日本 人','日本 の 人々','日本 の 人','大和 民族','大和 （ やまと ） 民族','日本 民族','日本 の 民族','日本 民衆',
#               '日本 の 民衆','日本 国民','日本 の 国民','ジャップ','和人','内地 人']

#merge_terms = ['日本 人','日本 の 人 々','日本 の 人','ヤマト 民族','ヤマト （ ヤマト ） 民族','日本 民族','日本 の 民族','日本 民衆',
#               '日本 の 民衆','日本 国民','日本 の 国民','ジャップ-Jap','和人','内 地人']

#merge_terms = ['穢多','非人','新 平民','部落 民']
#merge_terms = ['黒人','黒ん坊','アフリカ-Africa 人']       #'アフリカ 人']     
#merge_terms = ['東洋 人','東洋 の 人','黄 人','黄 人種','有色 人種', 'アジア-Asia 人'] #'アジア 人']
#merge_terms = ['蕃人','蛮族','蕃族','生蕃','熟蕃','タカサゴ 族','高砂 族','本島 人','蛮人','野蛮 人']  #,'土人']

altered_docs = 0
change = 0

for k in df.index:
    #get the text
    source_text = CORPUS_PATH + str(df.WORK_ID[k]) + ".txt"
    raw_text = open(source_text, encoding="utf-8")       #grab text
    raw = raw_text.read()
    
    for word in merge_terms:
        if re.search(word, raw):
            change = 1
            raw = re.sub(word, unified_term, raw)
    
    if change == 1:
        altered_docs += 1
    
    #now print back out
    with open(CORPUS_PATH + str(df.WORK_ID[k]) + ".txt", "w", encoding="utf-8") as f:
        f.write(raw)
        f.close()
        
    change = 0

print(altered_docs)

2


# Condense key race terms into single word units

In [73]:
# condense key race terms into single word units

CORPUS_PATH = r"C:\Users\Hoyt\Dropbox\SemanticsRace\UnidicCorpusLemmaNoMerge\\"

race_terms = [('朝鮮 人', '朝鮮人'), ('朝鮮 民族', '朝鮮民族'), ('朝鮮 の 人 々','朝鮮の人々'), ('朝鮮 の 人','朝鮮の人'),
             ('中国 人', '中国人'), ('中国 民族','中国民族'),('中国 民衆', '中国民衆'),('中国 の 民衆','中国の民衆'),
             ('中国 の 人 々','中国の人々'),('中国 の 人','中国の人'),('支那 人','支那人'),('支那 の 人','支那の人'),
             ('支那 の 民族','支那の民族'), ('支那 の 民衆','支那の民衆'), ('西洋 人','西洋人'), ('西洋 の 人','西洋の人'),
             ('オウシュウ-外国 人','欧州人'), ('外国 人','外国人'), ('日本 人','日本人'), ('日本 の 人 々','日本の人々'),
             ('日本 の 人','日本の人'), ('ヤマト 民族','ヤマト民族'), ('日本 民族','日本民族'),('日本 の 民族','日本の民族'),
             ('日本 民衆','日本民衆'), ('日本 の 民衆','日本の民衆'),('日本 国民','日本国民'),('日本 の 国民','日本の国民'),
             ('内 地人','内地人'),('新 平民','新平民'),('部落 民','部落民'),('アフリカ-Africa 人', 'アフリカ-Africa人'),
             ('東洋 人','東洋人'),('東洋 の 人','東洋の人'),('黄 人','黄人'),('黄 人種','黄人種'),('有色 人種','有色人種'),
             ('アジア-Asia 人','アジア-Asia人'),('野蛮 人','野蛮人')]

altered_docs = 0
change = 0

for k in df.index:
    #get the text
    source_text = CORPUS_PATH + str(df.WORK_ID[k]) + ".txt"
    raw_text = open(source_text, encoding="utf-8")       #grab text
    raw = raw_text.read()
    
    for pair in race_terms:
        if re.search(pair[0], raw):
            change = 1
            raw = re.sub(pair[0], pair[1], raw)  #replace all instances of the race term with condensed version
    
    if change == 1:
        altered_docs += 1
    
    #now print back out
    with open(CORPUS_PATH + str(df.WORK_ID[k]) + ".txt", "w", encoding="utf-8") as f:
        f.write(raw)
        f.close()
        
    change = 0

print(altered_docs)

563


In [24]:
CORPUS_PATH = r"C:\Users\Hoyt\Dropbox\SemanticsRace\UnidicCorpusLemma\\"

all_kana = []

for k in df.index[0:5]:
    #get the tokenized text
    source_text = CORPUS_PATH + str(df.WORK_ID[k]) + ".txt"
    raw_text = open(source_text, encoding="utf-8")       #grab text
    text = raw_text.read()
    
    text = bracket_cleaner(text)  #clean brackets (need to do this first)

    kana = re.findall(r'\s([ア-ンーァィゥェォ]+)\s', text)
    
    all_kana += kana


#produce the frequency list
#fdist = nltk.FreqDist(all_kana)
#freq_pairs = fdist.items()
#sort_freq_pairs = sorted(freq_pairs, key=lambda x:x[1], reverse=True)  #sort by decreasing frequency

#create a dictionary to store word-frequency pairs
#word_freqs = {}

#fill dictionary with pairs
#for item in sort_freq_pairs:
#    word_freqs[item[0]] = item[1]
    
#kana_df = DataFrame.from_dict(word_freqs, orient='index')  #convert dict to dataframe
#kana_df = kana_df.rename(columns={0:'frequency'})    #rename column
#kana_df = kana_df.sort_values(by='frequency', ascending=False)   #sort by frequency

#import xlsxwriter
#import openpyxl
#writer = pd.ExcelWriter(r'C:\Users\Hoyt\Dropbox\SemanticsRace\CharNames.xlsx', engine='xlsxwriter')
#kana_df.to_excel(writer, sheet_name='Sheet1')
#writer.save()

all_kana

['ルイコウ',
 'ツキジ',
 'ハラ',
 'オ',
 'オコン',
 'スイ',
 'ルイコウ',
 'ルイコウ',
 'ルイコウ',
 'カムロ',
 'ツキジ',
 'ハラ',
 'ツキジ',
 'ボウ',
 'ツキジ',
 'ボウ',
 'ダイミョウ',
 'ハカタ',
 'トコロ',
 'ヌ',
 'ジョ',
 'ジョ',
 'オソレ',
 'ヒトシ',
 'ハカタ',
 'トウキョウ',
 'オオトモ',
 'オオトモ',
 'ヤ',
 'コク',
 'ア',
 'オオトモ',
 'コク',
 'ケン',
 'コク',
 'コク',
 'コク',
 'コク',
 'モク',
 'オオトモ',
 'コク',
 'コク',
 'コム',
 'ミノル',
 'オオトモ',
 'ヒツ',
 'コク',
 'コク',
 'ヤ',
 'コク',
 'コク',
 'コク',
 'ヒビノ',
 'ハラ',
 'ヒビヤ',
 'ハラ',
 'ハラ',
 'ハラ',
 'ツキジ',
 'コク',
 'コク',
 'コク',
 'ネムリ',
 'コク',
 'コク',
 'ドバ',
 'ドバ',
 'コク',
 'コク',
 'ヌ',
 'オオトモ',
 'ハイポセシス',
 'コク',
 'ヒ',
 'ユウ',
 'コク',
 'ツキジ',
 'ケン',
 'ハカタ',
 'コク',
 'オオトモ',
 'ブン',
 'タクミ',
 'ツキジ',
 'リ',
 'コク',
 'エ',
 'コク',
 'オオトモ',
 'エ',
 'コク',
 'オオトモ',
 'スイ',
 'コク',
 'エー',
 'エ',
 'セン',
 'チョン',
 'コク',
 'コク',
 'ー',
 'コク',
 'コク',
 'ツキジ',
 'ツキジ',
 'チョウ',
 'コク',
 'オコン',
 'オコン',
 'オコン',
 'ツキジ',
 'エ',
 'ケン',
 'クレ',
 'ケン',
 'コク',
 'キャ',
 'コク',
 'イヤサ',
 'コク',
 'リョウ',
 'ボ',
 'オギ',
 'ヌ',
 'オギ',
 'オオトモ',
 'オオトモ',
 'オモウ',
 'オギ',
 'ヤ',
 'ア',
 'ツイ',
 'オギ',
 'オギ',
 'オオトモ',

# Create a Frequency Table for All Words in Corpus

In [105]:
#if you're working on unidic tokenized corpus
CORPUS_PATH = r"C:\Users\Hoyt\Dropbox\SemanticsRace\UnidicCorpusLemma\\"

all_tokens = []

#set this flag before you run
rmv_stopwords = True

if rmv_stopwords == True:
    stopwords = get_stopwords(r'C:\Users\Hoyt\Dropbox\SemanticsRace\stopwords.txt')

for k in df.index:
    #get the tokenized text
    source_text = CORPUS_PATH + str(df.WORK_ID[k]) + ".txt"
    raw_text = open(source_text, encoding="utf-8")       #grab text
    text = raw_text.read()
    
    text = bracket_cleaner(text)  #clean brackets (need to do this first)
    #remove punctuation
    text = cleaner(text)
    
    #split the text into a list of individual tokens
    tokens = re.split(r' ', text)
    #while '' in tokens: tokens.remove('')  #remove blank spaces
    
    if rmv_stopwords == True:
        tokens = remove_stopwords(tokens, stopwords)

    #add to global list
    all_tokens += tokens 

#produce the frequency list
fdist = nltk.FreqDist(all_tokens)
freq_pairs = fdist.items()
sort_freq_pairs = sorted(freq_pairs, key=lambda x:x[1], reverse=True)  #sort by decreasing frequency

# store relative frequencies of words
# need to subtract the number of '' tokens from total, since we will eliminate these
# IF YOU RUN CODE AGAIN, WON'T NEED TO DO THIS -- I FIXED THE CLEANER FUNCTION
#token_count = len(all_tokens) - sort_freq_pairs[0][1]   #assuming that first element is '' token

token_count = len(all_tokens)

#create a dictionary to store word-frequency pairs
word_freqs = {}

#fill dictionary with pairs
for item in sort_freq_pairs:
    word_freqs[item[0]] = item[1]
    
freqs_df = DataFrame.from_dict(word_freqs, orient='index')  #convert dict to dataframe
freqs_df = freqs_df.rename(columns={0:'frequency'})    #rename column
freqs_df = freqs_df.sort_values(by='frequency', ascending=False)   #sort by frequency

#now drop the first element in dictionary: the '' token
#freqs_df = freqs_df.drop(freqs_df.index[0])

#compute relative frequencies
freqs_df['rel_freq'] = freqs_df.frequency / token_count
freqs_df = freqs_df.reset_index()

# print out the results to an excel file
#import xlsxwriter
#import openpyxl
#writer = pd.ExcelWriter(r'C:\Users\Hoyt\Dropbox\SemanticsRace\LemmaFreqTable.xlsx', engine='xlsxwriter')
#freqs_df.to_excel(writer, sheet_name='Sheet1')
#writer.save()

In [106]:
print(token_count)
print(freqs_df.shape)

19265996
(114021, 3)


# Produce a concordance for all the Race Terms

In [107]:
# Create concordances for our seed terms

#if you're working on unidic tokenized corpus
CORPUS_PATH = r"C:\Users\Hoyt\Dropbox\SemanticsRace\UnidicCorpusLemma\\"

seed_terms = ['朝鮮人','中国人','西洋人','日本人','黒人','部落民','土人','東洋人','外国人']

window_size = 15
rmv_stopwords = True

if rmv_stopwords == True:
    stopwords = get_stopwords(r'C:\Users\Hoyt\Dropbox\SemanticsRace\stopwords.txt')

#prepare file to print out results
#f = open(r"C:\Users\Hoyt\Dropbox\SemanticsRace\LemmaConcordance.txt", "w", encoding="utf-8")

#set up data structure to hold info
metadata = {'text_id': [], 'seed_term': [], 'index_pos': [], 'left_tokens': [], 'right_tokens': []}

hits = 0

#iterate through all texts and count keywords; also keep track of number of texts in which keyword appears
for k in df.index:
    current_position = 0
    
    #get the text
    text_id = str(df.WORK_ID[k]) + ".txt"
    source_text = CORPUS_PATH + str(df.WORK_ID[k]) + ".txt"
    raw_text = open(source_text, encoding="utf-8")       #grab text
    text = raw_text.read()
    
    text = bracket_cleaner(text)  #clean brackets for non-unicode kanji; need to do this first
    #remove punctuation
    text = cleaner(text)
    
    #tokenize the text
    tokens = re.split(r' ', text)
    
    #remove stopwords
    if rmv_stopwords == True:
        tokens = remove_stopwords(tokens, stopwords)
    
    #iterate through tokens in search of seed_terms
    for token in tokens:
        #check for seed term
        if token in seed_terms:
            hits += 1
            #f.write(text_id + ' ' + token + ' ' + str(current_position) + '\t')
            metadata['text_id'].append(text_id)
            metadata['seed_term'].append(token)
            metadata['index_pos'].append(current_position)
            #check to make sure we are not at start of text
            if current_position - window_size > 0:
                #f.write(' '.join(tokens[current_position-window_size:current_position]) + ' ')
                metadata['left_tokens'].append(' '.join(tokens[current_position-window_size:current_position]))
            else:
                #f.write(' '.join(tokens[0:current_position]) + ' ')
                metadata['left_tokens'].append(' '.join(tokens[0:current_position]))
            
            #f.write('SEED_TERM' + ' ')        
            #check to make sure we are not at end of text
            if current_position + window_size < len(tokens):
                #f.write(' '.join(tokens[current_position+1:current_position+window_size]) + '\n')
                metadata['right_tokens'].append(' '.join(tokens[current_position+1:current_position+window_size]))
            else:
                #f.write(' '.join(tokens[current_position+1:]) + '\n')
                metadata['right_tokens'].append(' '.join(tokens[current_position+1:]))
            
        current_position += 1

#f.close()
print(hits)

6570


In [108]:
concord_df = DataFrame(metadata, columns=['text_id', 'seed_term', 'index_pos', 'left_tokens', 'right_tokens'])
concord_df.shape   #this should be about 6594

(6570, 5)

In [109]:
concord_df = concord_df.sort_values(by = ["seed_term", "text_id", "index_pos"])
concord_df = concord_df.reset_index(drop=True)
concord_df[0:5]

Unnamed: 0,text_id,seed_term,index_pos,left_tokens,right_tokens
0,10000001.txt,中国人,10655,思う 行く 見る ふふん 嘲笑 婦 私-代名詞 顔 見る 言う まあ 良い 種 奴 買う,出る 言う 私-代名詞 固より 良い 気持ち 理由 承知 案外 平気 ナガタ ふふん 橇 うー
1,10000001.txt,中国人,10725,気 うう 負け けち 味 言う 論 うー 言う 冷笑 私-代名詞 却って ナガタ 宥める 良い,癩病 違う 君-代名詞 清浄 素性 分かる まあ 構え 苦笑 間切る 見る 見る 振り 一寸
2,10000015.txt,中国人,46,来る 車 屋 迷惑 そう-様態 言う セン ギン 青い 紙幣 広げる 私-代名詞 掌 戻す 門前,小売り 店 明日 差し入れる 為 白い 塵紙 帖 カ ウ 小さな 銀貨 枚 戾
3,10000015.txt,中国人,3246,見える 担架 死亡 室 行く 広い 庭 回る 私-代名詞 寝台 粗い 格子 担架 担ぐ 行く,弁 髮 尻 辺り ぴんぴん 歩く 度 跳ね返す 見る 中国人 踏む 行く 庭 地面
4,10000015.txt,中国人,3256,粗い 格子 担架 担ぐ 行く 中国人 弁 髮 尻 辺り ぴんぴん 歩く 度 跳ね返す 見る,踏む 行く 庭 地面 石 拉ぐ 蒲公英 金色 裂く 月 半ば 室 目 転ずる


In [67]:
# print out the results to an excel file
import xlsxwriter
import openpyxl
writer = pd.ExcelWriter(r'C:\Users\Hoyt\Dropbox\SemanticsRace\LemmaConcordance.xlsx', engine='xlsxwriter')
concord_df.to_excel(writer, sheet_name='Sheet1')
writer.save()

In [68]:
# grab the concordance data
concord_df = pd.read_excel(r'./LemmaConcordance.xlsx', sheet_name='Sheet1')
concord_df.shape

(6979, 5)

In [49]:
freq_table = freqs_df

# Get Collocate Frequencies for Each Seed Term

In [134]:
# Next step is to grab all left and right tokens in the data frame, separated out by seed_term
# Then produce counts of these context words

keyword = "土人"

#filter by race keyword
seed_df = concord_df[concord_df['seed_term'] == keyword]
seed_df = seed_df.reset_index(drop=True)

#filter by multiple race words
#seed_df = concord_df[(concord_df['seed_term'] == '中国人') | (concord_df['seed_term'] == '朝鮮人')]
#seed_df = seed_df.reset_index(drop=True)

collocates = []

for k in seed_df.index:
    context_words = ''
    if str(seed_df.left_tokens[k]) != 'nan':
        context_words += seed_df.left_tokens[k]
    context_words += seed_df.right_tokens[k]
    context_words = re.sub(r'\s+', ' ', context_words)
    
    collocates += re.split(r'\s', context_words)
    
#produce the frequency list
fdist = nltk.FreqDist(collocates)
freq_pairs = fdist.items()
sort_freq_pairs = sorted(freq_pairs, key=lambda x:x[1], reverse=True)  #sort by decreasing frequency

#create a dictionary to store word-frequency pairs
word_freqs = {}

#fill dictionary with pairs
for item in sort_freq_pairs:
    word_freqs[item[0]] = item[1]
    
freqs_df = DataFrame.from_dict(word_freqs, orient='index')  #convert dict to dataframe
freqs_df = freqs_df.rename(columns={0:'frequency'})    #rename column
freqs_df = freqs_df.sort_values(by='frequency', ascending=False)   #sort by frequency

#rough estimation of the total number of co-occurrences in co-occurency matrix given window size
total_collocates = token_count * (2 * window_size)   #total words * total window size

#compute relative frequencies
freqs_df['colloc_freq'] = freqs_df.frequency / total_collocates
freqs_df = freqs_df.reset_index()

freqs_df[0:10]

Unnamed: 0,index,frequency,colloc_freq
0,言う,380,6.574623e-07
1,土人,349,6.038272e-07
2,来る,236,4.083187e-07
3,人,235,4.065885e-07
4,彼,218,3.771758e-07
5,私-代名詞,184,3.183502e-07
6,行く,171,2.95858e-07
7,見る,156,2.699056e-07
8,―,138,2.387626e-07
9,等,137,2.370325e-07


In [76]:
# load your frequency table
freq_table = pd.read_excel(r'./LemmaFreqTable.xlsx', sheet_name='Sheet1')
freq_table[0:10]

# Calculate the PMI Scores

In [135]:
#create a new df to store results
new_df = freqs_df

#calclate the PMI scores for the selected seed term
def calc(row, keyword, alpha):
    a = .75  #set the alpha value
    term = row.values[0]
    w_c_prob = row.colloc_freq   #grab the probability of word given context; already stored in data frame
    if not freq_table[freq_table['index'] == term].empty:
        v1 = freq_table[freq_table['index'] == term].rel_freq.values[0]
    else:
        return 0
    if alpha == True:  #apply alpha parameter to give less value to rare words
        #take the raw count to the power of alpha
        temp1 = math.pow(freq_table[freq_table['index'] == keyword].frequency.values[0], a)
        #take the total word count to the power of alpha
        temp2 = token_count   #total number of tokens (calculated in advance)
        #divide the values
        v2 = temp1 / temp2
    else:
        v2 = freq_table[freq_table['index'] == keyword].rel_freq.values[0]
    
    return math.log2(w_c_prob / (v1 * v2)) #do the PMI calculation
    
new_df['score'] = new_df.apply(lambda x: calc(x, keyword, alpha=True), axis=1)

#now sort by score
new_df = new_df.sort_values(by = 'score', ascending=False)
new_df = new_df.reset_index(drop=True)

In [140]:
new_df[200:250]

Unnamed: 0,index,frequency,colloc_freq,score
200,サヴァイイ,2,3.460328e-09,9.666731
201,皮籠,2,3.460328e-09,9.666731
202,怒り罵る,1,1.730164e-09,9.666731
203,ロブ-lob,1,1.730164e-09,9.666731
204,伏波,1,1.730164e-09,9.666731
205,アハアハアハ,1,1.730164e-09,9.666731
206,カトレット,1,1.730164e-09,9.666731
207,博大,1,1.730164e-09,9.666731
208,タイサク,1,1.730164e-09,9.666731
209,戯謔,1,1.730164e-09,9.666731


In [142]:
df_list = [china_colloc, korea_colloc, black_colloc, buraku_colloc, western_colloc, native_colloc]
title_list = ['china','korea','black','buraku','western','native']

import xlsxwriter
import openpyxl

for i in range(len(df_list)):
    writer = pd.ExcelWriter(r'C:\Users\Hoyt\Dropbox\SemanticsRace\PMIResults\\' + title_list[i] + '.xlsx', 
                            engine='xlsxwriter')
    df_list[i].to_excel(writer, sheet_name='Sheet1')
    writer.save()

In [144]:
#test = native_colloc[native_colloc['frequency'] > 1]
test[50:100]

Unnamed: 0,index,frequency,colloc_freq,score
146,カツサン,2,3.460328e-09,10.403696
153,ショウドシマ,8,1.384131e-08,10.403696
154,又の名,2,3.460328e-09,10.403696
164,鼻音,2,3.460328e-09,10.403696
172,ジョン-John,60,1.038098e-07,10.202062
173,ヴァイリマ,2,3.460328e-09,10.181304
174,キシガミ,2,3.460328e-09,10.181304
175,トシヒロ,4,6.920656e-09,10.081768
179,流謫,2,3.460328e-09,9.988659
180,シル-sill,2,3.460328e-09,9.988659


In [11]:
#freq_table = pd.read_excel(r'C:\Users\Hoyt\Dropbox\SemanticsRace\FreqTable.xlsx', sheet_name='Sheet1')
v1 = freq_table[freq_table['index'] == freqs_df.ix[0][0]].rel_freq.values[0]
v2 = freq_table[freq_table['index'] == keyword].rel_freq.values[0]
v3 = freqs_df.ix[0].colloc_freq

print(v1)
print(v2)
print(v3)
print(math.log2(v3 / (v1 * v2)))

0.0578920130638
5.25996260335e-06
2.64634146341e-07
-0.20248954363594757


In [18]:
source_text = CORPUS_PATH + str(df.WORK_ID[2]) + ".txt"
raw_text = codecs.open(source_text, encoding="utf-8", errors="ignore")       #grab text
raw = raw_text.read()            
    
#do some preprocessing
raw = strip_chap_titles(raw)  #get rid of chapter titles and newline breaks; make dialogue markers consistent
raw = bracket_cleaner(raw)
#raw = cleaner(raw)

sents = re.findall(u'([^！？。(――)(——)\(\)]+(」を.*)*(」と[^。]*)*(」、と[^。]*)*(？」と[^。]*)*[！？。」(……)]*)', raw)    


#tokens = raw.split(' ')
#for token in reversed(tokens):  #remove blank spaces; faster way to do it
#        if token == '':
#            tokens.remove(token)
#token_counter = collections.Counter(tokens)
#sorted_counts = sorted(token_counter.items(), key=operator.itemgetter(1), reverse=True)