本版本考慮詞頻之影響，取消詞性的篩選，並透過自然指數的方式來調整權重

In [1]:
import json
from ckip_transformers.nlp import CkipWordSegmenter
import re


In [2]:
# config

# JSON file holding the questions to answer.
question = './questions_example.json'
# Character class of whitespace and (ASCII + full-width) punctuation that is
# replaced by a space before word segmentation.  Written as a raw string so
# that regex escapes such as \s and \- reach the `re` module untouched instead
# of being parsed as (invalid) string escapes, which triggers a
# DeprecationWarning / SyntaxWarning on recent Python versions.
reg = r"[\s\-，.。\:：！!；;\?？（）\(\)\"\'《》〈〉．～—─\=「」『』、”“·／\#\[\]\$,－]"
# SQLite database queried through the `mapping` table (word -> aids).
dbInvertedIndex = './convert-to-db/inverted-index.db'
# SQLite database queried through the `frequency` table (word -> num).
dbWordFrequency = './word-frequency/word-frequency.db'

In [3]:
# Shared CKIP word segmenter used by wsAndPos for every question.
# NOTE(review): device=0 presumably selects the first GPU — confirm before
# running on a CPU-only machine.
ws_driver = CkipWordSegmenter(level=1, device=0)


In [4]:
# Read the whole question file (UTF-8) and parse it as JSON into `qs`.
with open(question, encoding='utf8') as questionFile:
    qs = json.loads(questionFile.read())


In [5]:
def wsAndPos(text): # text is string list
    """Segment every string in `text` into words.

    Each string first has every character matched by `reg` replaced with a
    single space, then the cleaned strings are passed to `ws_driver` in one
    batch call.

    The caller's list is left untouched: the cleaned strings are collected
    in a new list instead of being written back into `text`.

    Args:
        text: list of strings to segment.

    Returns:
        Whatever `ws_driver` returns for the cleaned strings (the caller
        iterates it as one word list per input string).
    """
    cleaned = [re.sub(reg, " ", sentence) for sentence in text]
    return ws_driver(cleaned, show_progress=False)


In [6]:
import sqlite3

# Open the inverted-index SQLite database and keep a module-level cursor;
# findAidList runs its lookups through this cursor.
conn = sqlite3.connect(dbInvertedIndex)
cursor = conn.cursor()

def findAidList(word):
    """Return the entries stored for `word` in the `mapping` table.

    The `aids` column of the first matching row is split on whitespace and
    returned as a list of strings (presumably article ids — verify against
    the database builder).  An empty list is returned when the word has no
    row.
    """
    row = cursor.execute(
        "SELECT aids from mapping WHERE word = ?", (word,)
    ).fetchone()
    return row[0].split() if row is not None else []

# Open the word-frequency SQLite database and keep a module-level cursor;
# getFrequency runs its lookups through this cursor.
conn2 = sqlite3.connect(dbWordFrequency)
cursor2 = conn2.cursor()
def getFrequency(word):
    """Return the `num` value stored for `word` in the `frequency` table.

    The value of the first matching row is converted with int(); a word
    without a row yields 0.
    """
    row = cursor2.execute(
        "SELECT num from frequency WHERE word = ?", (word,)
    ).fetchone()
    if row is None:
        return 0
    return int(row[0])


In [7]:
def findAidListByWordList(wordList):
    """Look up every word of `wordList` with findAidList.

    Returns a list with one findAidList result per word, in the same order
    as the input words.
    """
    return [findAidList(word) for word in wordList]


In [8]:
import math

# Largest word frequency that evaluate() still passes to math.exp; words with
# a higher frequency are skipped there.
# Original note: the natural exponential can be computed up to roughly the
# 710th power.
threshold = 700

def evaluate(questionAids, questionWords, answerAids, frequencyOf=None, maxFrequency=None):
    """Score one answer option against the question.

    For every question word whose frequency does not exceed the cap, the size
    of the intersection between that word's id list and each answer word's
    id list is divided by exp(frequency) and added to the score, so words
    with a higher frequency contribute exponentially less.

    Args:
        questionAids: one id list per question word.
        questionWords: the question words, parallel to `questionAids`.
        answerAids: one id list per answer word.
        frequencyOf: optional callable mapping a word to its frequency;
            defaults to the module-level `getFrequency`.
        maxFrequency: optional cap above which a question word is skipped;
            defaults to the module-level `threshold`.

    Returns:
        The accumulated score: 0 when nothing was added, otherwise a float.
    """
    if frequencyOf is None:
        frequencyOf = getFrequency
    if maxFrequency is None:
        maxFrequency = threshold

    # The answer sets do not depend on the question word, so build them once
    # instead of once per question word.
    answerSets = [set(aids) for aids in answerAids]

    score = 0
    for aids, word in zip(questionAids, questionWords):
        # When the frequency is too large the exponential cannot be computed,
        # so such words are cut off here; this also reduces the number of
        # intersections that have to be computed.
        f = frequencyOf(word)
        if f > maxFrequency:
            continue
        questionSet = set(aids)
        weight = math.exp(f)  # same for every answer word, computed once
        for answerSet in answerSets:
            score += len(questionSet.intersection(answerSet)) / weight
    return score


In [9]:
# Answer every question: segment the texts, score each option against the
# question, remember the best option and print the normalized scores.
myAnswer = []
for q in qs:
    # Segment the question and all options in a single batch call.
    wordList2D = wsAndPos(list(q.values()))
    wordsByKey = dict(zip(q.keys(), wordList2D))

    # Fetch the question by key instead of relying on 'Question' being the
    # first entry of the dict; a missing key fails loudly with KeyError
    # rather than silently reusing the previous question's data.
    questionWords = wordsByKey.pop('Question')
    questionAid = findAidListByWordList(questionWords)

    scoreList = {}
    scoreSum = 0.000001  # small offset keeps the normalization from dividing by zero
    maxK = 'C'  # default guess is C when no option scores above 0
    maxV = 0
    for k, words in wordsByKey.items():
        answerAid = findAidListByWordList(words)
        score = evaluate(questionAid, questionWords, answerAid)
        scoreList[k] = score
        scoreSum += score
        if score > maxV:
            maxK = k
            maxV = score
    myAnswer.append(maxK)

    # Normalize: express every option's score as a percentage of the total.
    scoreList = {k: '%g%%' % (v/scoreSum*100,) for k, v in scoreList.items()}

    print(scoreList)


{'A': '3.38335e-74%', 'B': '99.9255%', 'C': '1.35334e-74%'}
{'A': '36.3636%', 'B': '18.1818%', 'C': '45.4545%'}
{'A': '0%', 'B': '99.9926%', 'C': '4.30695e-152%'}
{'A': '0%', 'B': '3.48887e-06%', 'C': '2.31952e-08%'}
{'A': '98.0432%', 'B': '0%', 'C': '0%'}
{'A': '2.93817e-87%', 'B': '1.68773e-195%', 'C': '1.04935e-88%'}
{'A': '15.1485%', 'B': '7.57426%', 'C': '68.1684%'}
{'A': '0.0113241%', 'B': '2.50628e-38%', 'C': '0%'}
{'A': '99.98%', 'B': '5.09714e-09%', 'C': '4.5371e-217%'}
{'A': '2.38027e-18%', 'B': '1.42816e-17%', 'C': '9.52107e-18%'}
{'A': '7.78113e-12%', 'B': '1.08936e-10%', 'C': '1.55623e-11%'}
{'A': '3.28001e-106%', 'B': '6.108e-45%', 'C': '4.92002e-106%'}
{'A': '6.54284e-61%', 'B': '1.5496e-12%', 'C': '2.6852e-266%'}
{'A': '0%', 'B': '0%', 'C': '1.03567e-84%'}
{'A': '99.9913%', 'B': '0%', 'C': '0%'}
{'A': '1.61237%', 'B': '0.201546%', 'C': '0.403092%'}
{'A': '2.83283e-43%', 'B': '3.96597e-43%', 'C': '1.6997e-43%'}
{'A': '0%', 'B': '0%', 'C': '0%'}
{'A': '6.76931e-88%', 'B':

In [10]:
# Display the predicted answers as a JSON array string (cell output).
json.dumps(myAnswer)


'["B", "C", "B", "B", "A", "A", "C", "A", "A", "B", "B", "B", "B", "C", "A", "A", "B", "C", "C", "B", "C", "C", "C", "A", "A"]'

In [11]:
# Compare the predictions with the reference answers of the example
# questions and print "<matches> / <number of predictions>".
exampleAnswer = ["B", "C", "B", "B", "A", "A", "C", "A", "A", "B", "B",
                 "B", "B", "C", "A", "A", "B", "B", "C", "B", "C", "C", "C", "A", "A"]
same = sum(1 for mine, expected in zip(myAnswer, exampleAnswer) if mine == expected)
print(same, "/", len(myAnswer))


24 / 25
