In [8]:
import sys
import re
import os
from docx import Document
import pandas as pd
from IPython.display import display, HTML

class CorpusProcessor:
    def __init__(self, path='.'):
        self.result = []
        self.path = path
        self._process_path(self.path)
        
    def regex_search(self, query_word):
        """
        query_word: <Str>
            要搜索的字串，此搜索字串會先丟入re.compile()
            因此可以接收regex string
            例如此參數可以輸入"(LOC|INS)"
            則可以給出所有標有LOC或INS的語料。
        """

        pattern = re.compile(query_word)
        r = self._query_keyword(pattern)
        self._display_as_dataframe(r)
    
                    
    def easy_search(self):
        result = {}
        
        print('請輸入想要搜尋的字串:')
        print('如果想搜尋多個字串，請在字串之間以逗號分隔，例如: "主格,i,媽媽"')
        print('如果想找的i為單獨出現的i，而非出現在某單詞中的i，請在i的兩邊加上空格')
        
        x = str(input("請輸入想要查找字詞："))
        
        query_words = [word for word in x.split(',')]
        
        if x=="e" or x=="E":
            return
        else:
            for file in self.result:
                result[file["file_name"]] = [sentence for sentence in file["data"] if all([re.search(re.compile(word), str(sentence)) for word in query_words])]
            
            self._display_as_dataframe(result)

    def search_note(self):
        result = {}
        
        print('請輸入想要搜尋#note當中的字串:')
        
        x = str(input("請輸入想要查找字詞："))
        
#         query_words = [word for word in x.split(',')]
        
        if x=="e" or x=="E":
            return
        else:
            for file in self.result:
                result[file["file_name"]] = []
                for sentence in file["data"]:
                    l = sentence['content'].split('#n')
                    if len(l) == 2 and re.search(re.compile(x), l[1]):
                        result[file["file_name"]].append(sentence)
            
            self._display_as_dataframe(result)

    def all_data(self):
        return self.result
            
    def _display_as_dataframe(self, r):
        for k, v in r.items():
            print(f"<< {k} >>")
            if len(v) == 0:
                print("本週無相關資料")
            else:
                for item in v:
                    n = item['num']
                    c = item['content']
                    cc = c.split('\n')
                    cc = list(filter(lambda x: x!='', cc))
                    display(pd.DataFrame([cc[0].split(), cc[1].split(), cc[2].split()]).rename({0: "泰雅：", 1: "英文：", 2: "中文："}, axis='index')) 
                    try:
                        print("[英文翻譯]")
                        print([ccc for ccc in cc if ccc.startswith("#e")][0])
                        print("[中文翻譯]")
                        print([ccc for ccc in cc if ccc.startswith("#c")][0])
                        print("[註釋]")
                        print([ccc for ccc in cc if ccc.startswith("#n")][0])
                    except:
                        pass
    
    def _process_path(self, p):
        for filename in os.listdir(p):
            if not filename.startswith("~") and (filename.endswith(".docx") or filename.endswith(".doc")):
                document = self._open_file(os.path.join(p, filename))
                self.result.append(self._process(document, filename))

    def _open_file(self, name):
        d = Document(name)
        return d
    
    
    def _process(self, doc, filename):
        result = {
            "file_name": filename,
            "data": []
        }
        all_p = doc.paragraphs
        num_re = re.compile("(\d{1,2})\.")

        num = 0
        current_index = -1
        # rrr = map(lambda x: re.match(num_re, x.text), all_p)
        start = False
        num_re = re.compile("(\d{1,2})\.")
        for p in all_p:
            match = re.match(num_re, p.text)

            if match:
                start = True
                num = match.group(1)
                result["data"].append({"num": num, "content": ""})
                current_index = len(result["data"]) - 1
            else:
                if not start:
                    pass
                else:
                    result["data"][current_index]["content"] += "\n" + p.text

        return result
    
    def _query_keyword(self, q):
        result = {}

        for file in self.result:
            result[file["file_name"]] = [sentence for sentence in file["data"] if re.search(q, sentence["content"])]

        return result


In [9]:
p = CorpusProcessor(path='./corp')
#c.search_note()
p.easy_search()

請輸入想要搜尋的字串:
如果想搜尋多個字串，請在字串之間以逗號分隔，例如: "主格,i,媽媽"
如果想找的i為單獨出現的i，而非出現在某單詞中的i，請在i的兩邊加上空格


請輸入想要查找字詞： aaaaaaaaa


<< 20200325.docx >>
本週無相關資料
<< 20200408.docx >>
本週無相關資料
<< 20200325_n.docx >>
本週無相關資料
<< 20200318.docx >>
本週無相關資料


In [67]:
#result[file["file_name"]] = [sentence for sentence in file["data"] if all([re.search(re.compile(word), str(sentence)) for word in query_words])]
lines = p.result[0]['data'][0]['content'].strip().split('\n')

gloss_lines = lines[:-3]
free_lines = lines[-3:]

# Concat gloss lines to three lines

In [10]:
for doc_id, doc in enumerate(p.result):
    for gloss_id in range(len(doc)):
        lines = p.result[doc_id]['data'][gloss_id]['content'].strip().split('\n')
        gloss_lines = lines[:-3]
        free_lines = lines[-3:]

        # 3*n + n
        gloss_lines = lines[:-3]
        num_of_lines = len(gloss_lines) 

        if num_of_lines % 4 != 0:
            print(p.result[doc_id]['file_name'], doc_id, gloss_id)
            raise Exception("len(gloss_lines) should be multiples of 4.")

        # Concat multiple lines to three
        rk_gloss = ''
        en_gloss = ''
        zh_gloss = ''
        for i in range(int(num_of_lines / 4)):
            rk_gloss += gloss_lines[0 + i * 4] + '\t'
            en_gloss += gloss_lines[1 + i * 4] + '\t'
            zh_gloss += gloss_lines[2 + i * 4] + '\t'

        rk_gloss = rk_gloss.strip().split()
        en_gloss = en_gloss.strip().split()
        zh_gloss = zh_gloss.strip().split()

        # Tokenize
        gloss = []
        en_len = len(en_gloss)
        zh_len = len(zh_gloss)
        for i, rk in enumerate(rk_gloss):

            if not i < en_len:
                en = '_'
            else:
                en = en_gloss[i]
            if not i < zh_len:
                zh = '_'
            else:
                zh = zh_gloss[i]

            gloss.append( (rk, en, zh) )

        # Save data
        p.result[doc_id]['data'][gloss_id]['content_parsed'] = {
            'gloss': gloss,
            'free': free_lines
        }

20200408.docx 1 0


Exception: len(gloss_lines) should be multiples of 4.

In [69]:
#p.result[2]
d = Document("corp/20200325_n.docx")
d2 = Document("corp/20200325.docx")

process_doc("corp/20200408.docx")

[(1,
  ['kay\t\tElrenge\t\ttakane',
   'this\t\tElrenge\t\tolder_sister',
   '這\t\tElrenge\t\t兄姐',
   '',
   '#e Is Elrenge my older sister?',
   '#c Elrenge 是我的姊姊嗎？',
   '#n Not sure about last word.']),
 (2,
  ['ngia-~ka~kane-aku\t\t\tki\t\taga',
   'oneself-eat~ka~eat-1SG\t\tOBL\tmeal',
   '自己-吃~ka~吃-我 \t\t\t斜格\t飯.餐',
   '',
   '#e I eat (a meal) myself.',
   '#c 我自己用餐',
   '#n']),
 (3,
  ['kay\tElrenge\tngia-u~dulri~dulri',
   'this\tElrenge\toneself-跳舞~正在~跳舞',
   '這\tElrenge\t自己-跳舞~PROG~跳舞',
   ' ',
   '#e Elrenge is dancing by herself',
   '#c Elrenge 自己在跳舞']),
 (4,
  ['kay\tElrenge\tngia-udulri',
   'this\tElrenge\tonself-dance',
   '這\tElrenge\t自己-跳舞',
   ' ',
   '#e Elrenge dance alone (when alone).',
   '#c Elrenge (會)自己跳舞',
   '#n Contrast with 3 in tense']),
 (5,
  ['kay \tElrenge\tngia-drakale \t\tki\t\tpitu\t\tki\t\ttuki\t',
   'this\tElrenge\toneself-get_up\t\tOBL\tseven\tOBL\tclock',
   '這\tElrenge\t自己-起來 \t\t斜格\t七\t\t斜格\t時鐘',
   '',
   'luka\t\tmialrealre',
   'at\t\tm

In [68]:
def process_doc(fp="corp/20200325.docx"):
    
    # Load document
    d = Document(fp)
    a_doc = '\n'.join(p.text for p in d.paragraphs)  # normalize paragraphs to newlines
    
    # Find the places of glosses
    pat_start = re.compile("^(\d{1,2})\.")
    a_doc_split = a_doc.split('\n')
    glosses_on = []
    gloss_num_old = None
    for i, line in enumerate(a_doc_split):

        if pat_start.match(line):
            gloss_num_new = i

            # Save gloss range
            if gloss_num_old is not None:
                glosses_on.append( (gloss_num_old, gloss_num_new - 1) )
            gloss_num_old = gloss_num_new

    # Get all glosses
    glosses = []
    for s, e in glosses_on:
        gloss_num = int(a_doc_split[s][:-1])
        gloss_lines = a_doc_split[(s+1):e]
        glosses.append( (gloss_num, gloss_lines) )
    
    return glosses

In [67]:
glosses

[(1,
  ['yakay\tku\ttatulru\t(ku\tababay/sauvalay)\tku',
   'have\t\t\tthree\t\tfemale/male',
   '有\t\t\t3\t\t女性/男性\t\t\t\t',
   '\t\t',
   'agi-li',
   'yonger_brother/sister-1SG.POSS',
   '弟妹-我的.第一人稱單數.所有格',
   '',
   '#e I have 3 younger brother/sister',
   '#c 我有 3 個弟弟/妹妹',
   '#n  yakay\tku 可省略']),
 (2,
  ['ka\t\tLaucu\ttakalra\tku\t\tla-taka-ini',
   'NOM\tLaucu\tmany\tOBL\tPL-older_brother/sister-3SG.POSS',
   '主格\tLaucu\t很多\t斜格\t複數-兄姐-他的.第三人稱單數.所有格',
   '',
   '#e Laucu has many older brothers/sisters',
   '#c Laucu 有很多兄姐',
   '#n ta-kalra, ta: 比「多」更多的意思, kalra: 「多」。人名放在前面不可用 ki 標記。']),
 (3,
  ['ka\t\tLaucu\tkadrua\tku\t\tababay\tku\t',
   'NOM\tLaucu\tNEG\tOBL\tfemale\tOBL',
   '主格\tLaucu\t沒有\t斜格\t女性\t斜格',
   '',
   'taka-ini',
   'older_brother/sister-3SG.POSS',
   '兄姐-他的.第三人稱單數.所有格',
   '',
   '#e Laucu has no older sister.',
   '#c Laucu\t沒有姊姊。',
   '#n']),
 (4,
  ['kadrua\tku\t\tababay\tku\t\tagi',
   'NEG\tOBL\tfemale\tOBL\tyonger_brother/sister',
   '沒有\t斜格\tfemale\t斜格\t

['1.', 'yakay\tku\ttatulru\t(ku\tababay/sauvalay)\tku', 'have\t\t\tthree\t\tfemale/male', '有\t\t\t3\t\t女性/男性\t\t\t\t', '\t\t', 'agi-li', 'yonger_brother/sister-1SG.POSS', '弟妹-我的.第一人稱單數.所有格', '', '#e I have 3 younger brother/sister', '#c 我有 3 個弟弟/妹妹', '#n  yakay\tku 可省略']
['2.', 'ka\t\tLaucu\ttakalra\tku\t\tla-taka-ini', 'NOM\tLaucu\tmany\tOBL\tPL-older_brother/sister-3SG.POSS', '主格\tLaucu\t很多\t斜格\t複數-兄姐-他的.第三人稱單數.所有格', '', '#e Laucu has many older brothers/sisters', '#c Laucu 有很多兄姐', '#n ta-kalra, ta: 比「多」更多的意思, kalra: 「多」。人名放在前面不可用 ki 標記。']
['3.', 'ka\t\tLaucu\tkadrua\tku\t\tababay\tku\t', 'NOM\tLaucu\tNEG\tOBL\tfemale\tOBL', '主格\tLaucu\t沒有\t斜格\t女性\t斜格', '', 'taka-ini', 'older_brother/sister-3SG.POSS', '兄姐-他的.第三人稱單數.所有格', '', '#e Laucu has no older sister.', '#c Laucu\t沒有姊姊。', '#n']
['4.', 'kadrua\tku\t\tababay\tku\t\tagi', 'NEG\tOBL\tfemale\tOBL\tyonger_brother/sister', '沒有\t斜格\tfemale\t斜格\tyonger_brother/sister', '', '#e Don’t you have a younger sister', '#c (你)沒有妹妹嗎？', '#n 不禮貌']
['

In [31]:
pat_start = re.compile("^(\d{1,2})\.")
x = pat_start.match("1.")

In [33]:
x[1]

'1'

In [65]:
gloss

[('yakay', 'have', '有'),
 ('ku', 'three', '3'),
 ('tatulru', 'female/male', '女性/男性'),
 ('(ku', 'yonger_brother/sister-1SG.POSS', '弟妹-我的.第一人稱單數.所有格'),
 ('ababay/sauvalay)', '_', '_'),
 ('ku', '_', '_'),
 ('agi-li', '_', '_')]

In [76]:
len(p.result[0]['data'])

28

## 使用說明

### STEP 1: 先實例化`CorpusProcessor`物件

```
p = CorpusProcessor()
```

也可以指名`.docx`檔所在路徑:

```
p = CorpusProcesesor(path="<檔案所在路徑>")
```

若不指定path，則預設為與此`.ipynb`檔相同路徑

### STEP 2-1: 簡單搜尋

直接呼叫`easy_search()`方法，會跳出輸入框

```
p.easy_search()
```

### STEP 2-2: 或者你也可以使用regular expression進行搜尋

```
p.regex_search(regex)
```

此方法接收一個regex的字串參數，此字串會先丟入`re.compile()`當中
