In [1]:
import pandas as pd

# Words whose presence marks a sentence (or token) as carrying time information:
# seven fixed day-part / day words followed by the clock hours "一點" .. "十點".
time_trigger = ['下午', '早晨', '晚上', '早上', '午夜', '今天', '明天'] + [
    numeral + '點'
    for numeral in ('一', '二', '三', '四', '五', '六', '七', '八', '九', '十')
]

# Remove Japanese

```python
from collections import namedtuple
import re

# One diary sentence together with the title of the entry it came from.
# NOTE(review): the second field name 'contnet' looks like a typo for 'content'.
# It is kept unchanged because renaming it would break any attribute access
# elsewhere; the code visible here only unpacks entries positionally.
Entry = namedtuple('Entry', ['title', 'contnet'])

# Keep only the rows whose author column equals '楊基振日記'; NaN cells become ''.
diaries = pd.read_csv('data/diary_corpus.csv').fillna('')
diaries = diaries[diaries.author == '楊基振日記']

days = ['星期一', '星期二', '星期三', '星期四', '星期五', '星期六', '星期日', '星期天']
pat = re.compile('。')  # sentences are delimited by the full stop '。'

out = []
for title, raw_content in zip(diaries['title'], diaries['content']):
    content = raw_content.strip()
    for day in days:
        ind = content.find(day + '\n')
        # str.find returns -1 when the weekday line is absent; 0 is a valid hit
        # (content starts with the weekday), so test against -1, not `> 0`.
        if ind != -1:
            # Drop everything up to and including the weekday name. The newline
            # that follows is removed by the per-sentence strip() below.
            content = content[ind + len(day):]
            break
    # Split into sentences and keep the non-empty ones, paired with the title.
    out.extend(
        Entry(title, sentence.strip())
        for sentence in pat.split(content)
        if sentence.strip()
    )
len(out)
```

# Extract Time Frames

```python
# Do not re-run this cell unless necessary: it takes a long time.
# The string literal below holds a disabled alternative segmenter (jseg);
# as a bare string expression it is evaluated and discarded.
'''
from jseg.jieba import Jieba
segmenter = Jieba()
'''
from PyCCS import ckip
# Segmenter actually in use; entry2frame calls segmenter.seg(sentence).
segmenter = ckip
import json
from multiprocessing import Pool

def entry2frame(entry):
    """Turn one (title, sentence) entry into a segmented frame dict.

    Returns None when the sentence contains none of the `time_trigger`
    substrings. Otherwise the sentence is passed to `segmenter.seg` and the
    `.raw` attribute of the result is transposed into two tuples
    (presumably tokens and their POS tags; confirm against the segmenter).

    The returned dict has the keys 'time' (the entry title), 'content'
    (the sentence), 'segment' and 'postag', in that order.
    """
    title, sentence = entry

    # Guard clause: sentences without any time trigger are not segmented.
    if not any(trigger in sentence for trigger in time_trigger):
        return None

    segments, postags = zip(*segmenter.seg(sentence).raw)
    frame = {}
    frame['time'] = title
    frame['content'] = sentence
    frame['segment'] = segments
    frame['postag'] = postags
    return frame

# Segment every entry in parallel and write the resulting frames as JSON Lines
# (one JSON object per line). Entries for which entry2frame returns None are
# skipped but still counted in the progress display.
filename = 'data/frame_seg.jsonl'

# Mode 'w' truncates the file on open, replacing the former
# open('a') + seek(0) + truncate() sequence. The Pool context manager
# terminates the workers on exit, so all results are consumed inside the block.
with Pool() as pool, open(filename, 'w') as f:
    frames = pool.imap_unordered(entry2frame, out, chunksize=4)
    for i, frame in enumerate(frames, 1):
        if frame is not None:
            json.dump(frame, f, ensure_ascii=False)
            f.write('\n')
        # Progress counter, overwritten in place via the carriage return.
        print('{:>5d}/{:>5d}'.format(i, len(out)), end='\r')
```

# Named Entities

- `names_twrsj.txt`: names from 台灣人士鑑

- `place_wiki.csv`: place from wiki

- `relatives.txt`: 到處收集的各種對人的稱呼

不使用 `names.csv` 和 `place.csv` 因為等一下用dictionary直接分就好了啊 OAO

In [2]:
def _load_lines(path):
    """Return the set of whitespace-stripped lines of the text file at `path`."""
    with open(path) as handle:
        return {line.strip() for line in handle}


# Lookup sets used when labelling tokens; each file is read line by line.
names = _load_lines('data/names_twrsj.txt')
relatives = _load_lines('data/relatives.txt')
places = _load_lines('data/place_wiki.csv')

# Raw Format

- `Na`: 普通名詞

- `Nb`: 專有名詞（含人名）

- `Nha`: 人稱代名詞    

- `Nc`: 位置

- `Nd`: 時間

In [3]:
# Set of the first whitespace-separated field of every line in the dictionary file.
with open('data/dict.txt.big', 'r') as f:
    # A blank or whitespace-only line splits to [], and indexing [0] on it would
    # raise IndexError, so such lines are skipped.
    common_words = {fields[0] for fields in map(str.split, f) if fields}

In [4]:
import json
from collections import defaultdict

# Read the JSON Lines file back: one frame per line, wrapped in a
# defaultdict(list) so new list-valued keys can be appended to without setup.
with open('data/frame_seg.jsonl', 'r') as f:
    frames = [defaultdict(list, record) for record in map(json.loads, f)]

# POS-tag prefixes that decide a token's label on their own.
time_tag = 'Nd'
place_tag = 'Nc'
person_tag = 'Nb'

# POS-tag prefixes that count as a person only when the token is also in `names`.
person_tag_soft = ('Na', 'Nha')

# Label every token of every frame as TIME, PERSON, PLACE or 'O' (none).
# The first matching rule wins, checked in the order TIME, PERSON, PLACE.
# The raw 'segment' and 'postag' lists are removed from the frame and replaced
# by a per-token 'tokens' list plus one list per matched category.
for frame in frames:
    segments = frame.pop('segment')
    postags = frame.pop('postag')

    for idx, (token, pos) in enumerate(zip(segments, postags)):
        if token in time_trigger or pos.startswith(time_tag):
            label, slot = 'TIME', 'relative_time'
        elif (
            pos.startswith(person_tag)
            # listed name carrying one of the "soft" noun / pronoun tags
            or (token in names and pos.startswith(person_tag_soft))
            # kinship or address term carrying any tag that starts with 'N'
            or (token in relatives and pos.startswith('N'))
            # 'Na'-tagged token that is missing from the common-word set
            or (pos.startswith('Na') and token not in common_words)
        ):
            label, slot = 'PERSON', 'participant'
        elif token in places or pos.startswith(place_tag):
            label, slot = 'PLACE', 'place'
        else:
            label, slot = 'O', None

        # Record the token in its category list first, then in the full list.
        if slot is not None:
            frame[slot].append({'token': token, 'index': idx})
        frame['tokens'].append({'token': token, 'index': idx, 'pos': pos, 'label': label})

# Output Frames to json

In [5]:
import json

# Write all labelled frames to a single JSON file. The context manager ensures
# the handle is flushed and closed; the former bare open(...) was never closed.
with open('frames.json', 'w') as f:
    json.dump(frames, f, ensure_ascii=False)
print(len(frames))

4688


In [6]:
# Spot-check one labelled frame.
frames[37]

defaultdict(list,
            {'content': '過了下午四點，帶母親、肇嘉嫂、淑英等孩子們到天壇遊覽',
             'participant': [{'index': 6, 'token': '母親'},
              {'index': 8, 'token': '肇嘉嫂'},
              {'index': 10, 'token': '淑英'},
              {'index': 12, 'token': '孩子們'}],
             'place': [{'index': 14, 'token': '天壇'}],
             'relative_time': [{'index': 2, 'token': '下午'},
              {'index': 3, 'token': '四點'}],
             'time': ' 1944年（民33年，34歲）   10月1日\u3000日 ',
             'tokens': [{'index': 0, 'label': 'O', 'pos': 'VCL', 'token': '過'},
              {'index': 1, 'label': 'O', 'pos': 'Di', 'token': '了'},
              {'index': 2, 'label': 'TIME', 'pos': 'Nd', 'token': '下午'},
              {'index': 3, 'label': 'TIME', 'pos': 'Neu', 'token': '四點'},
              {'index': 4, 'label': 'O', 'pos': 'COMMACATEGORY', 'token': '，'},
              {'index': 5, 'label': 'O', 'pos': 'VC', 'token': '帶'},
              {'index': 6, 'label': 'PERSON', 'pos': 'Na', 'token': '母親'},
 

In [7]:
# Spot-check another labelled frame.
frames[2]

defaultdict(list,
            {'content': '晚上姊姊、肇嘉嫂一行與賴太太、毛昭江君等人來訪，熱熱鬧鬧地度過西黃域根的最後一夜',
             'participant': [{'index': 1, 'token': '姊姊'},
              {'index': 3, 'token': '肇嘉嫂'},
              {'index': 7, 'token': '賴'},
              {'index': 10, 'token': '毛昭江'},
              {'index': 11, 'token': '君'},
              {'index': 19, 'token': '西黃域根'}],
             'relative_time': [{'index': 0, 'token': '晚上'},
              {'index': 21, 'token': '最後'},
              {'index': 23, 'token': '夜'}],
             'time': ' 1944年（民33年，34歲）   10月8日\u3000日 ',
             'tokens': [{'index': 0,
               'label': 'TIME',
               'pos': 'Nd',
               'token': '晚上'},
              {'index': 1, 'label': 'PERSON', 'pos': 'Na', 'token': '姊姊'},
              {'index': 2, 'label': 'O', 'pos': 'PAUSECATEGORY', 'token': '、'},
              {'index': 3, 'label': 'PERSON', 'pos': 'Na', 'token': '肇嘉嫂'},
              {'index': 4, 'label': 'O', 'pos': 'Neu', 'token': '一'},
 