In [1]:
#### imports

import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from scipy import stats
import os
import codecs
import sys
import re
from shutil import copyfile
from __future__ import division
#from __future__ import print_function
#sys.stdout = codecs.getwriter('utf_8')(sys.stdout)
#sys.stdin = codecs.getreader('utf_8')(sys.stdin)
import MeCab  #CHECK "MECABRC" FILE TO SEE WHICH DICTIONARY YOU ARE USING
mecab = MeCab.Tagger("")  #using unidic
#mecab = MeCab.Tagger("-Ochasen")  #using MeCab's ipadic
import collections
import operator
import nltk
import math
from sklearn.metrics import jaccard_similarity_score

In [2]:
#some cleaning and pre-processing functions for Japanese corpus

#this first one should only be used for non-tokenized texts; basically cleans them for tokenization step
def strip_chap_titles(raw):
    #get rid of chapter titles that use Chinese numbers with or without surronding parantheses
    raw = re.sub(r'（*([一二三四五六七八九十])+(）)*\n', '', raw)
    #get rid of chapter titles that use utf-8 alpha-numeric numbers
    raw = re.sub(r'[１-９]+\n', '', raw)
    raw = re.sub(r'[第弐拾章参壱一二三四五六七八九十]+\n', '', raw)
    raw = re.sub(r'『', r'「', raw)   #replace all 『 with 「
    raw = re.sub(r'』', r'」', raw)   #replace all 』 with 」
    raw = re.sub(r'\n', '', raw)  #strips all newlines
    raw = re.sub(r'\r', '', raw)  #strips all returns
    #raw = re.sub(r'\s', '', raw)
    return raw

puncs = ['、','。','「','」','…','！','――','？','ゝ','『','』','（','）','／','＼','々','ーーー','］','・','ゞ','［','<','〔','〕',
         '＃','△','※','＊']

def cleaner(text):
    for punc in puncs:
        text = re.sub(punc, '', text)
    text = re.sub(r'\s+', ' ', text)                         #get rid of double spaces
    return text

def remove_stopwords(tokens, stopwords):
    new_list = [token for token in tokens if token not in stopwords]
    return new_list

def get_stopwords(path):
    f = open(path, encoding='utf-8')
    words = f.read()
    return re.split(r'\n', words)

In [3]:
#LOAD in metadata for the kindai zasshi corpus

df = pd.read_excel(r'Kindai_Meta.xlsx', sheetname='Sheet1')
#df = df[df['YEAR'] < 1960]
#df = df[df['YEAR'] > 1875]
df = df[df['FILTER'] != 'YES']

df.shape

(8867, 9)

In [4]:
df[0:10]

Unnamed: 0,FILE_ID,MAGAZINE,YEAR,TITLE,LENGTH,FILTER,GEO_WORDS
12373,12373.txt,国民之友,1887,01-001_皇帝陛下の西京行幸_文語_b,212,,0.0
12374,12374.txt,国民之友,1887,01-002_長崎事件_文語_b,313,,3.0
12375,12375.txt,国民之友,1887,01-003_地方制度の改良_文語_b,507,,0.0
12376,12376.txt,国民之友,1887,01-004_畝傍艦何の辺に漂ふ_文語_b,131,,1.0
12377,12377.txt,国民之友,1887,01-005_国会議事堂_文語_b,100,,0.0
12378,12378.txt,国民之友,1887,01-006_二十三年博覧会_文語_b,109,,0.0
12379,12379.txt,国民之友,1887,01-007_仏教の末路_文語_b,131,,0.0
12380,12380.txt,国民之友,1887,01-008_商業世界の波瀾_文語_b,435,,0.0
12381,12381.txt,国民之友,1887,01-009_鉄道事業_文語_b,245,,0.0
12382,12382.txt,国民之友,1887,01-010_斉東野人の語に非らざる乎_文語_b,251,,0.0


# Get Significant Clusters and Race Terms

In [16]:
#if you're working on unidic tokenized corpus
CORPUS_PATH = r"C:\Users\Hoyt\Dropbox\SemanticsRace\KindaiLemmaMerge\\"

race_term = '中国人'
atom_terms = ['言う','思う','考える'] #,'知る','分かる']

#race_term = '西洋人'
#atom_terms =['目','顔','笑う','様子','じっと','頬','見詰める','表情','唇','微笑','顔色','眉','瞳','見上げる','目付き','顔付き']
#atom_terms = ['歩く','遊ぶ','散歩']

#race_term = '日本人'
#atom_terms = ['精神','道徳','観念','倫理','道義'] 

#race_term = '土人'
#atom_terms = ['写真']
#atom_terms = ['寝る','眠る','床','起きる','布団','寝']
#atom_terms = ['雨','降る','雪']

window_size = 20
rmv_stopwords = False

if rmv_stopwords == True:
    stopwords = get_stopwords(r'C:\Users\Hoyt\Dropbox\SemanticsRace\stopwords.txt')

#prepare file to print out results

#f = open(r"C:\Users\Hoyt\Dropbox\SemanticsRace\Results\KindaiChinesePoem.txt", "w", encoding="utf-8")


#set up data structure to hold info
metadata = {'text_id': [], 'race_term': [], 'index_pos': [], 'left_tokens': [], 'right_tokens': []}

hits = 0

#iterate through all texts and find instances where any atom_terms appears within 20 characters of race term
for k in df.index:
    current_position = 0
    
    #get the text
    text_id = str(df.FILE_ID[k])
    title = df.TITLE[k]
    magazine = df.MAGAZINE[k]
    year = df.YEAR[k]
    source_text = CORPUS_PATH + str(df.FILE_ID[k])
    raw_text = open(source_text, encoding="utf-8")       #grab text
    text = raw_text.read()
    
    #remove punctuation
    text = cleaner(text)
    
    #tokenize the text
    tokens = re.split(r' ', text)
    
    #remove stopwords
    if rmv_stopwords == True:
        tokens = remove_stopwords(tokens, stopwords)
    
    #iterate through tokens in search of seed_terms
    for token in tokens:
        #check for seed term
        if token == race_term:
            #get words on either side of race term
            
            #check to make sure we are not at start of text
            if current_position - window_size > 0:
                #left_tokens = ' '.join(tokens[current_position-window_size:current_position])
                left_tokens = tokens[current_position-window_size:current_position]
            else:
                #left_tokens = ' '.join(tokens[0:current_position])
                left_tokens = tokens[0:current_position]
            
            #check to make sure we are not at end of text
            if current_position + window_size < len(tokens):
                #right_tokens = ' '.join(tokens[current_position+1:current_position+window_size])
                right_tokens = tokens[current_position+1:current_position+window_size]
            else:
                #right_tokens = ' '.join(tokens[current_position+1:])
                right_tokens = tokens[current_position+1:]
                
            #check to see if any atom words are in left and right tokens
            for word in atom_terms:
                if word in left_tokens or word in right_tokens:
                    metadata['text_id'].append(text_id)
                    metadata['race_term'].append(race_term)
                    metadata['index_pos'].append(current_position)
                    metadata['left_tokens'].append(left_tokens)
                    metadata['right_tokens'].append(right_tokens)
                    
#                     #print the results to a file
#                     f.write(text_id + ' ' + magazine + ' ' + str(year) + ' ' + str(title) + ' ' + word + '\n')
#                     f.write(' '.join(left_tokens) + ' ' + race_term + ' ' + ' '.join(right_tokens) + '\n\n')
                    
#                     #try to grab the corresponding passage in non-lemmatized version of text
#                     CORPUS_DIR = r"C:\Users\Hoyt\Documents\MyData\SemanticsRace\kindai_original_raw\\"
#                     #merge target passage into a single string
#                     target_passage = ' '.join(left_tokens) + ' ' + race_term + ' ' + ' '.join(right_tokens)
#                     target_passage = re.split(r' ', target_passage)
                    
#                     #open the corresponding tokenized text
#                     source = CORPUS_DIR + str(df.FILE_ID[k])
#                     raw = open(source, encoding="utf-8")       #grab text
#                     new_text = raw.read()
                    
#                     sim_scores = {}
                    
#                     #tokenize the text
#                     new_tokens = re.split(r' ', new_text)
                    
#                     sliding_window_size = 2 * window_size
                    
#                     #calcuate similarity across all passages of a given window size
#                     for i in range(len(new_tokens) - (sliding_window_size - 1)):
#                         source_passage = new_tokens[i:i+sliding_window_size]
#                         j_score = jaccard_similarity_score(target_passage, source_passage)
#                         sim_scores[i] = j_score
                        
#                     #now rank by similarity score
#                     sim_df = DataFrame.from_dict(sim_scores, orient='index')  #convert dict to dataframe
#                     sim_df = sim_df.rename(columns={0:'j_score'})    #rename column
#                     sim_df = sim_df.sort_values(by='j_score', ascending=False)   #sort by frequency
#                     #get top most similar passage
#                     top_sim_index = sim_df.index[0:1].tolist()[0]           
#                     #grab the corresponding window (plus 20 words before and after)
#                     matching_passage = new_tokens[top_sim_index - 20:top_sim_index + 60]
#                     f.write('---' + ''.join(matching_passage) + '\n\n')
                    
                    hits+=1
                    break
                    
        current_position += 1

f.close()
print(hits)

350


In [8]:
hits

7

In [10]:
target_passage

['まで',
 '見通す',
 '様',
 'だ',
 '笑う',
 '方',
 'を',
 '為る',
 'か',
 'だ',
 '無い',
 'ば',
 '例',
 'の',
 'にやにや',
 '笑い',
 'だ',
 '有る',
 '此れ',
 'も',
 '西洋人',
 'に',
 'は',
 '分かる',
 'ない',
 '態度',
 'らしい',
 'が',
 '私',
 'の',
 '所',
 'へ',
 '舞い込む',
 'て',
 '来る',
 '葉書',
 'など',
 'に',
 'も',
 '返事']

In [None]:
"""
                    #try to grab the corresponding passage in non-lemmatized version of text
                    CORPUS_DIR = r"C:\Users\Hoyt\Dropbox\JapanCorpusTokenized\\"
                    #merge target passage into a single string
                    target_passage = left_tokens + ' ' + race_term + ' ' + right_tokens
                    target_passage = re.split(r' ', target_passage)
                    
                    #open the corresponding tokenized text
                    source = CORPUS_DIR + str(df.WORK_ID[k]) + ".txt"
                    raw = open(source, encoding="utf-8")       #grab text
                    new_text = raw.read()
    
                    sim_scores = {}
                    
                    new_text = bracket_cleaner(new_text)  #clean brackets for non-unicode kanji; need to do this first
                    #remove punctuation
                    #new_text = cleaner(new_text)
    
                    #tokenize the text
                    new_tokens = re.split(r' ', new_text)
                    
                    sliding_window_size = 2 * window_size
                    
                    #calcuate similarity across all passages of a given window size
                    for i in range(len(new_tokens) - (sliding_window_size - 1)):
                        source_passage = new_tokens[i:i+sliding_window_size]
                        j_score = jaccard_similarity_score(target_passage, source_passage)
                        sim_scores[i] = j_score
    
                    #now rank by similarity score
                    sim_df = DataFrame.from_dict(sim_scores, orient='index')  #convert dict to dataframe
                    sim_df = sim_df.rename(columns={0:'j_score'})    #rename column
                    sim_df = sim_df.sort_values(by='j_score', ascending=False)   #sort by frequency
                    #get top most similar passage
                    top_sim_index = sim_df.index[0:1].tolist()[0]           
                    #grab the corresponding window (plus 20 words before and after)
                    matching_passage = new_tokens[top_sim_index - 20:top_sim_index + 60]
                    f.write('---' + ''.join(matching_passage) + '\n\n')
                    
                    hits += 1
                    break
                    """