In [1]:
import pandas as pd

# Named Entities

- `names_twrsj.txt`: names from 台灣人士鑑

- `place_wiki.csv`: place names from wiki

- `relatives.txt`: 到處收集的各種對人的稱呼

*__不使用 `names.csv` 和 `place.csv`__* 因為等一下用dictionary直接分就好了啊 OAO

In [2]:
names, relatives, places, time_trigger = set(), set(), set(), set()


def _read_entries(path):
    """Return the set of non-empty, whitespace-stripped lines found in ``path``."""
    # Decode explicitly as UTF-8: the lists hold Chinese text and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(path, encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        # Blank lines would otherwise put '' into the lookup sets.
        return {entry for entry in stripped if entry}


names.update(_read_entries('data/names_twrsj.txt'))
relatives.update(_read_entries('data/relatives.txt'))
# NOTE(review): each line of the CSV is taken whole as one place name (no
# column parsing) -- confirm the file is single-column and has no header row.
places.update(_read_entries('data/place_wiki.csv'))


# Clock expressions: every numeral followed by '點' becomes a time trigger,
# next to a fixed list of time-of-day / day words.
nums = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '十一', '十二', '兩']
time_trigger.update(['下午', '早晨', '晚上', '早上', '午夜', '今天', '明天'])
time_trigger.update(num + '點' for num in nums)

# Remove Japanese

```python
import re

# Keep only the rows whose author is '楊基振日記' and cut every entry into
# (title, sentence) pairs, which is the shape entry2frame unpacks.
diaries = pd.read_csv('data/diary_corpus.csv').fillna('')
diaries = diaries[diaries.author == '楊基振日記']
days = ['星期一', '星期二', '星期三', '星期四', '星期五', '星期六', '星期日', '星期天']
out = []
pat = re.compile('。')
for i in diaries.index:
    content = diaries.content[i].strip()
    title = diaries.title[i]
    for day in days:
        ind = content.find(day + '\n')
        # str.find returns -1 when the marker is absent; 0 is a valid hit at
        # the very start of the text, so test with >= 0 rather than > 0.
        if ind >= 0:
            # Drop everything up to and including the weekday word; the
            # newline that follows is removed later by sentence.strip().
            content = content[ind + len(day):]
            break
    # The (title, sentence) tuple must be parenthesised inside a list
    # comprehension; the bare form is a syntax error.
    out += [(title, sentence.strip())
            for sentence in pat.split(content)
            if sentence.strip()]
len(out)
```

# Extract Time Frames

```python
# Do not re-run this cell unless necessary: it takes a long time.
# The bare string literal below holds an alternative jseg/Jieba segmenter
# setup; being a string, it is never executed.
'''
from jseg.jieba import Jieba
segmenter = Jieba()
'''
# Active segmenter: the `ckip` object imported from PyCCS.
from PyCCS import ckip
segmenter = ckip
import json
from multiprocessing import Pool

def entry2frame(entry):
    """Segment one (title, sentence) entry if it mentions a time trigger.

    Returns a dict with the keys 'time', 'content', 'segment' and 'postag',
    or None when no string from ``time_trigger`` occurs in the sentence.
    """
    title, sentence = entry
    # Guard clause: sentences without any time trigger are never segmented.
    if not any(trigger in sentence for trigger in time_trigger):
        return None
    # `.raw` is unzipped into two parallel tuples: the tokens and their tags.
    tokens, tags = zip(*segmenter.seg(sentence).raw)
    return dict(time=title, content=sentence, segment=tokens, postag=tags)

filename = 'data/frame_seg.jsonl'
pool = Pool()
# imap_unordered yields results in completion order, not in input order.
frames = pool.imap_unordered(entry2frame, out, chunksize=4)
# No further tasks will be submitted; the ones already queued still run.
pool.close()
# 'w' truncates the file on open (replaces the former 'a' + seek(0) + truncate()).
# UTF-8 is requested explicitly because ensure_ascii=False writes raw
# non-ASCII characters and the platform default encoding may not handle them.
with open(filename, 'w', encoding='utf-8') as f:
    for i, frame in enumerate(frames, 1):
        if frame is not None:
            json.dump(frame, f, ensure_ascii=False)
            print(file=f)  # newline terminator: one JSON object per line
        # Progress counter rewritten in place on a single console line.
        print('{:>5d}/{:>5d}'.format(i, len(out)), end='\r')
# Wait for the worker processes to exit now that every result was consumed.
pool.join()
```

# Raw Format

- `Na`: 普通名詞

- `Nb`: 專有名詞（含人名）

- `Nha`: 人稱代名詞    

- `Nc`: 位置

- `Nd`: 時間

In [3]:
import json
# Vocabulary of ordinary words: the first whitespace-separated field of every
# non-blank line. Blank lines are skipped (indexing [0] on them would raise).
with open('data/dict.txt.big', 'r', encoding='utf-8') as f:
    common_words = set(fields[0] for fields in map(str.split, f) if fields)
# One JSON object per line; blank lines are ignored instead of failing to parse.
with open('data/frame_seg.jsonl', 'r', encoding='utf-8') as f:
    frames = [json.loads(line) for line in f if line.strip()]

In [4]:
# Part-of-speech prefixes that identify an entity type on their own.
person_tag = 'Nb'
place_tag = 'Nc'
time_tag = 'Nd'

# Tags that only count as a person when the token is also a known name.
person_tag_soft = ('Na', 'Nha')

entities = 'relative_time place participant tokens'.split()


def _entity_label(token, pos):
    """Return (label, frame key) for one token; the key is None for label 'O'.

    The checks run in priority order: time, then place, then person.
    """
    if pos.startswith(time_tag) or token in time_trigger:
        return 'TIME', 'relative_time'
    if pos.startswith(place_tag) or token in places:
        return 'PLACE', 'place'
    # Person rules, from the strongest evidence to the weakest.
    if pos.startswith(person_tag):
        return 'PERSON', 'participant'
    if pos.startswith(person_tag_soft) and token in names:
        return 'PERSON', 'participant'
    if pos.startswith('N') and token in relatives:
        return 'PERSON', 'participant'
    if pos.startswith('Na') and token not in common_words:
        return 'PERSON', 'participant'
    return 'O', None


for frame in frames:
    # Start every frame with empty entity lists.
    for entity in entities:
        frame[entity] = []
    # The raw segmentation is consumed here and removed from the frame.
    segments = frame.pop('segment')
    postags = frame.pop('postag')
    sentence = zip(segments, postags)

    for idx, (token, pos) in enumerate(sentence):
        label, slot = _entity_label(token, pos)
        if slot is not None:
            frame[slot].append({'token': token, 'index': idx})
        frame['tokens'].append({'token': token, 'index': idx, 'pos': pos, 'label': label})

# Output Frames to json

In [5]:
import json
# JSON Lines output: one frame per line.
# UTF-8 is requested explicitly because ensure_ascii=False emits raw
# non-ASCII characters and the platform default encoding may not handle them.
with open('frames.jsonl', 'w', encoding='utf-8') as f:
    for frame in frames:
        print(json.dumps(frame, ensure_ascii=False), file=f)
# A single plain JSON array of all frames is still needed as well.
with open('frames.json', 'w', encoding='utf-8') as f:
    print(json.dumps(frames, ensure_ascii=False, indent=2), file=f)
print(len(frames))

4688


In [6]:
# Spot-check a single annotated frame (index 37).
frames[37]

{'content': '過了下午四點，帶母親、肇嘉嫂、淑英等孩子們到天壇遊覽',
 'participant': [{'index': 6, 'token': '母親'},
  {'index': 8, 'token': '肇嘉嫂'},
  {'index': 10, 'token': '淑英'},
  {'index': 12, 'token': '孩子們'}],
 'place': [{'index': 14, 'token': '天壇'}],
 'relative_time': [{'index': 2, 'token': '下午'}, {'index': 3, 'token': '四點'}],
 'time': ' 1944年（民33年，34歲）   10月1日\u3000日 ',
 'tokens': [{'index': 0, 'label': 'O', 'pos': 'VCL', 'token': '過'},
  {'index': 1, 'label': 'O', 'pos': 'Di', 'token': '了'},
  {'index': 2, 'label': 'TIME', 'pos': 'Nd', 'token': '下午'},
  {'index': 3, 'label': 'TIME', 'pos': 'Neu', 'token': '四點'},
  {'index': 4, 'label': 'O', 'pos': 'COMMACATEGORY', 'token': '，'},
  {'index': 5, 'label': 'O', 'pos': 'VC', 'token': '帶'},
  {'index': 6, 'label': 'PERSON', 'pos': 'Na', 'token': '母親'},
  {'index': 7, 'label': 'O', 'pos': 'PAUSECATEGORY', 'token': '、'},
  {'index': 8, 'label': 'PERSON', 'pos': 'Na', 'token': '肇嘉嫂'},
  {'index': 9, 'label': 'O', 'pos': 'PAUSECATEGORY', 'token': '、'},
  {'index': 1