In [1]:
import pandas as pd
from jseg.jieba import Jieba
# Segmenter instance; a later cell calls j.seg(..., pos=True) on each frame.
j = Jieba()

# Load the diary corpus, replace missing values with empty strings, and keep
# only the rows whose `author` column equals 楊基振日記.
diaries = (
    pd.read_csv('data/diary_corpus.csv')
    .fillna('')
    .loc[lambda df: df['author'] == '楊基振日記']
)

DEBUG:jseg.jieba:loading default dictionary


# Remove Japanese

In [2]:
# Reload the corpus so this cell can be run on its own, keeping only the
# entries whose `author` column equals 楊基振日記.
diaries = pd.read_csv('data/diary_corpus.csv').fillna('')
diaries = diaries[diaries.author =='楊基振日記']

# Weekday markers; the text up to and including the first marker found
# (immediately followed by a newline) is dropped from each entry.
# NOTE(review): per the section heading this leading part is presumably the
# Japanese text -- confirm against the corpus.
days = ['星期一', '星期二', '星期三', '星期四', '星期五', '星期六', '星期日', '星期天']

ls = []    # len(content + title) of every entry, after header removal
out  = []  # one {'title', 'content'} dict per entry; 'content' is a list of sentences
for title, content in zip(diaries['title'], diaries['content']):
    for day in days:
        ind = content.find(day+'\n')
        # str.find returns -1 when the marker is absent and 0 when the entry
        # starts with it, so compare against -1 rather than testing `> 0`.
        if ind != -1:
            # Skip the marker itself; the newline that follows it is kept.
            content = content[ind+len(day):]
            break
    ls.append(len(content + title))
    # Split on the full stop; the separator itself is not kept.
    out.append({'title': title, 'content': content.split('。')})

# Named Entities by CJ

In [3]:
def _read_terms(path):
    """Return the set of stripped, non-empty lines of the text file at `path`.

    Each line is treated as one entry. Blank lines are skipped so that the
    empty string never ends up in a lookup set.
    """
    # Decode explicitly instead of relying on the platform's locale encoding;
    # 'utf-8-sig' also drops a leading byte-order mark if one is present.
    # NOTE(review): assumes the data files are UTF-8 encoded -- confirm.
    with open(path, encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        return {term for term in stripped_lines if term}


# Person names: union of the two name lists.
names = _read_terms('data/names.csv') | _read_terms('data/names_twrsj.txt')

# Kinship terms.
relatives = _read_terms('data/relatives.txt')

# Place names: union of the two place lists.
places = _read_terms('data/place.csv') | _read_terms('data/place_wiki.csv')

# Extract Time Frames

In [4]:
# Words that mark a sentence as carrying time information: a few fixed
# time-of-day / day words plus the twelve "<number>點" hour expressions.
hour_numerals = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '十一', '十二']
time_trigger = ['下午', '早晨', '晚上', '早上', '午夜', '今天', '明天'] + [numeral + '點' for numeral in hour_numerals]

# One frame per sentence that contains at least one trigger as a substring;
# the frame's 'time' is the title of the diary entry the sentence came from.
frames = [
    {'time': entry['title'], 'content': sentence}
    for entry in out
    for sentence in entry['content']
    if any(trigger in sentence for trigger in time_trigger)
]

# Raw Format
- person
    - `Nb`: 專有名詞（含人名）
    - `Nha`: 人稱代名詞    
    - `NN`: 意義不明但出現次數很多的標記（在下方範例輸出中，「，」、「肇嘉嫂」、「淑英」都被標為 `NN`）
- place
    - `Nc`: 位置
- time
    - `Nd`: 時間

In [11]:
# POS-tag prefixes that identify each entity type in the segmenter output.
person_tag = 'Nb'
place_tag = 'Nc'
time_tag = 'Nd'
# Tags under which a token counts as a person only if it is also in `names`.
person_tag_strict = ('NN', 'Nha')

# Frame field that collects the tokens of each entity label.
field_for_label = {'TIME': 'relative_time', 'PERSON': 'participant', 'PLACE': 'place'}

for frame in frames:
    result = j.seg(frame['content'], pos=True)
    # Start from empty lists so re-running the cell does not accumulate entries.
    for field in ('participant', 'place', 'relative_time', 'tokens'):
        frame[field] = []

    for idx, (token, pos) in enumerate(result.raw):
        # Precedence matters: TIME is tested first, then PERSON, then PLACE.
        if token in time_trigger or pos.startswith(time_tag):
            label = 'TIME'
        elif (
            pos.startswith(person_tag)
            or (token in names and pos.startswith(person_tag_strict))
            or (token in relatives and pos.startswith('N'))
        ):
            label = 'PERSON'
        elif token in places or pos.startswith(place_tag):
            label = 'PLACE'
        else:
            label = 'O'

        if label in field_for_label:
            frame[field_for_label[label]].append({'token': token, 'index': idx})

        # Every token is recorded, whether or not it received an entity label.
        frame['tokens'].append({'token': token, 'index': idx, 'pos': pos, 'label': label})

# Output Frames to json

In [12]:
import json
# Serialise all frames to frames.json. The context manager guarantees the file
# is flushed and closed. Because ensure_ascii=False writes the Chinese text
# as-is, the encoding is fixed to UTF-8 instead of depending on the locale.
with open('frames.json', 'w', encoding='utf-8') as f:
    json.dump(frames, f, ensure_ascii=False)
print(len(frames))

4688


In [13]:
# Display the second frame as a spot check of the labelling; the output shown
# was captured after adding 肇嘉嫂, 淑英 and 天壇 to CJ's entities.
frames[1]

{'content': '過了下午四點，帶母親、肇嘉嫂、淑英等孩子們到天壇遊覽',
 'participant': [{'index': 6, 'token': '母親'},
  {'index': 8, 'token': '肇嘉嫂'},
  {'index': 10, 'token': '淑英'}],
 'place': [{'index': 14, 'token': '天壇'}],
 'relative_time': [{'index': 2, 'token': '下午'}, {'index': 3, 'token': '四點'}],
 'time': ' 1944年（民33年，34歲）   10月1日\u3000日 ',
 'tokens': [{'index': 0, 'label': 'O', 'pos': 'VC1', 'token': '過'},
  {'index': 1, 'label': 'O', 'pos': 'Di', 'token': '了'},
  {'index': 2, 'label': 'TIME', 'pos': 'Ndabe', 'token': '下午'},
  {'index': 3, 'label': 'TIME', 'pos': 'Ndabe', 'token': '四點'},
  {'index': 4, 'label': 'O', 'pos': 'NN', 'token': '，'},
  {'index': 5, 'label': 'O', 'pos': 'VC32', 'token': '帶'},
  {'index': 6, 'label': 'PERSON', 'pos': 'Nab', 'token': '母親'},
  {'index': 7, 'label': 'O', 'pos': 'Caa', 'token': '、'},
  {'index': 8, 'label': 'PERSON', 'pos': 'NN', 'token': '肇嘉嫂'},
  {'index': 9, 'label': 'O', 'pos': 'Caa', 'token': '、'},
  {'index': 10, 'label': 'PERSON', 'pos': 'NN', 'token': '淑英'},
  {'i