In [1]:
import os
import re
import random
import gensim
import argparse
from utils import *



In [2]:
def punc_cleaner(raw):
    """Strip kanji numerals, selected symbols and full-width capitals from ``raw``.

    The removals run in a fixed order; whitespace is collapsed last so that
    any gaps the removals leave behind shrink to a single space.
    """
    removal_patterns = (
        r'[一二三四五六七八九十]+[百千万]*',   # kanji numerals, optionally followed by 百/千/万
        r'[×‥・○◎]+',                          # punctuation / symbol marks
        r'[Ａ-Ｚ]+',                            # full-width capital letters
    )
    cleaned = raw
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    # squeeze every run of whitespace down to one space
    return re.sub(r'\s+', ' ', cleaned)

def name_cleaner(text, proper_names):
    """Drop tokens of ``text`` that are katakana words listed in ``proper_names``.

    Parameters
    ----------
    text : str
        Text whose tokens are separated by single whitespace characters.
    proper_names : collection of str
        Names to remove. A ``set``/``frozenset`` is used as-is; any other
        iterable is converted to a set once per call.

    Returns
    -------
    str
        The surviving tokens joined with single spaces. Empty tokens produced
        by consecutive whitespace are kept, so runs of whitespace are
        normalised to spaces but not collapsed.
    """
    tokens = re.split(r'\s', text)

    # Scan the text that was actually passed in. The previous version read a
    # name `raw` that is neither a parameter nor a local, so it either raised
    # NameError or silently used whatever global happened to exist.
    # NOTE(review): the range ァ-ヺ does not include the long-vowel mark ー, so
    # a name containing it is split into separate runs here and is never
    # matched as a whole token -- confirm whether that is intended.
    katakana_runs = set(re.findall(r'[ァ-ヺ]+', text))

    if isinstance(proper_names, (set, frozenset)):
        known_names = proper_names
    else:
        known_names = set(proper_names)

    # Katakana words that are also known proper names: remove every instance
    # in a single pass instead of re-filtering the token list once per name.
    to_remove = katakana_runs & known_names
    if to_remove:
        tokens = [token for token in tokens if token not in to_remove]

    return ' '.join(tokens)

def periods(text):
    """Return the number of '。' characters divided by the number of tokens.

    Tokens are the pieces obtained by splitting ``text`` on each single
    whitespace character, so the divisor is always at least 1.
    """
    # full stops found anywhere in the text
    stop_count = len(re.findall(r'。', text))

    # pieces between individual whitespace characters
    token_count = len(re.split(r'\s', text))

    return stop_count / token_count

def get_proper_names(path="./WordLists/ProperNames.txt"):
    """Read a proper-name word list, one entry per line.

    Parameters
    ----------
    path : str, optional
        UTF-8 text file to read. Defaults to the word list location that was
        previously hard-coded, so calls without arguments behave as before.

    Returns
    -------
    list of str
        The file's lines in order, with newline characters removed. Blank
        lines are kept as empty strings.
    """
    with open(path, 'r', encoding="utf-8") as f:
        return [line.replace("\n", "") for line in f]

In [6]:
# collect the name of every .txt file in the corpus directory
corpus_path = './KindaiLemmaMerge'
files = [name for name in os.listdir(corpus_path) if name.endswith('.txt')]

# how many corpus files were found
len(files)

14464

In [7]:
# Load the corpus metadata sheet.
# `sheet_name` is the current pandas keyword; the old `sheetname` spelling
# was deprecated and later removed, so it fails on current pandas.
df = pd.read_excel(r'./Kindai_Meta.xlsx', sheet_name='Sheet1')

# keep only rows whose YEAR lies strictly between 1875 and 1960
df = df[(df['YEAR'] > 1875) & (df['YEAR'] < 1960)]
df.shape

(14464, 6)

In [11]:
#pre-process texts and stick them in new folder
input_path = './KindaiLemmaMerge'
corpus_path = './KindaiW2VPreProcessed'
input_files = [file for file in os.listdir(input_path) if file.endswith('.txt')]
proper_names = get_proper_names()

# create the destination up front so the first write cannot fail just
# because the folder has not been made yet
os.makedirs(corpus_path, exist_ok=True)

for file in input_files:
    # read the whole source text, then release the handle before writing
    with open(os.path.join(input_path, file), 'r', encoding='utf-8') as f:
        raw = f.read()

    raw = punc_cleaner(raw)
    raw = name_cleaner(raw, proper_names)  #strip proper names

    # the `with` block closes the output file; no explicit close() is needed
    with open(os.path.join(corpus_path, file), 'w', encoding='utf-8') as g:
        g.write(raw)

In [2]:
#clean the word2vec output to replace spaces with the word SPACE
model_dir = './results_fic_bootstrap/word2vec_bootstrap'
boot_files = [file for file in os.listdir(model_dir) if file.endswith('.txt')]

# A line that starts with a single space followed by '-' or '0' -- presumably
# the vector row whose token is a literal space (TODO confirm against the
# word2vec text output). The lookahead keeps the '-' / '0' in place, so the
# result is the same "\nSPACE -" / "\nSPACE 0" text as before. Unlike the
# previous if/elif, both forms are fixed when they occur in the same file.
space_token = re.compile(r'\n (?=[-0])')

for file in boot_files:
    model_path = os.path.join(model_dir, file)

    # read everything and close the handle before the file is reopened for writing
    with open(model_path, 'r', encoding='utf-8') as f:
        raw = f.read()

    raw, n_fixed = space_token.subn(r'\nSPACE ', raw)

    # only rewrite files that actually changed
    if n_fixed:
        with open(model_path, 'w', encoding='utf-8') as g:
            g.write(raw)