本版本考慮詞頻之影響，取消詞性的篩選，並透過自然指數的方式來調整權重

In [1]:
import json
from ckip_transformers.nlp import CkipWordSegmenter
import re


In [2]:
# Configuration: input file, punctuation pattern, and inverted-index database.

# JSON file holding the list of questions to answer.
question = './questions_example.json'
# Character class of whitespace and (ASCII / full-width) punctuation that is
# replaced by a space before word segmentation. It is a raw string so the
# regex escapes (\s, \-, \( ...) reach `re` untouched instead of being parsed
# as invalid string escape sequences, which Python warns about.
reg = r"[\s\-，.。\:：！!；;\?？（）\(\)\"\'《》〈〉．～—─\=「」『』、”“·／\#\[\]\$,－]"
# SQLite database containing the word -> ids inverted index.
db = './convert-to-db/inverted-index.db'

In [3]:
# Create the CKIP word segmenter once; wsAndPos() reuses this instance for
# every batch of text.
# NOTE(review): per the ckip-transformers docs, `level` selects the model size
# and `device=0` targets the first GPU (-1 would mean CPU) — confirm against the
# installed version before running on a machine without a GPU.
ws_driver = CkipWordSegmenter(level=1, device=0)


In [4]:
# Parse the UTF-8 encoded question file into `qs`; the handle is closed as soon
# as the text has been read.
with open(question, encoding='utf8') as questionFile:
    qs = json.loads(questionFile.read())


In [5]:
def wsAndPos(text):
    """Replace punctuation with spaces and word-segment each string.

    Args:
        text: iterable of strings (e.g. a question followed by its options).
            It is only read; the cleaned strings are stored in a new list so
            the caller's data is never modified.

    Returns:
        The result of ``ws_driver`` for the cleaned strings. Callers treat it
        as one word list per input string, in input order.
    """
    # Every character matched by `reg` (whitespace / punctuation) becomes a
    # single space so it cannot end up inside a segmented word.
    cleaned = [re.sub(reg, " ", sentence) for sentence in text]
    return ws_driver(cleaned, show_progress=False)


In [6]:
import sqlite3

# Open the inverted-index SQLite database and keep one module-level cursor;
# findAidList() and getFrequency() run all of their queries through it.
conn = sqlite3.connect(db)
cursor = conn.cursor()

def findAidList(word):
    """Look up `word` in the `mapping` table and return its ids.

    The `aids` column of the first matching row is split on whitespace.
    Returns an empty list when the word has no row.
    (Presumably the ids identify the documents containing the word — verify
    against the code that builds the database.)
    """
    row = cursor.execute(
        "SELECT aids from mapping WHERE word = ?", (word,)
    ).fetchone()
    return row[0].split() if row is not None else []

def getFrequency(word):
    """Return the `num` value stored for `word` in the `mapping` table as an int.

    Words without a row have frequency 0.
    """
    row = cursor.execute(
        "SELECT num from mapping WHERE word = ?", (word,)
    ).fetchone()
    if row is None:
        return 0
    return int(row[0])


In [7]:
def findAidListByWordList(wordList):
    """Return one id list per word, in the same order as `wordList`.

    Each entry is whatever findAidList() yields for that word (an empty list
    for words missing from the index).
    """
    return [findAidList(word) for word in wordList]


In [8]:
import math

# The natural exponential can only be evaluated up to roughly the 710th power,
# so evaluate() skips any word whose frequency exceeds this cap.
threshold = 700

def evaluate(questionAids, questionWords, answerAids):
    """Score one answer option against the question.

    For every question word whose frequency is at most ``threshold``, and for
    every answer word, the number of ids the two words share is divided by
    ``e ** frequency`` and added to the score. Frequent question words
    therefore contribute exponentially less.

    Args:
        questionAids: one id list per question word.
        questionWords: the question words, aligned with ``questionAids``.
        answerAids: iterable of id lists, one per answer word.

    Returns:
        The accumulated score (0 when every question word is skipped).
    """
    # The answer-side sets do not depend on the question word, so build them
    # once instead of once per question word. Materialising them also means a
    # one-shot iterable of answer ids is not exhausted after the first word.
    answerSets = [set(a2) for a2 in answerAids]

    score = 0
    for a1, w1 in zip(questionAids, questionWords):
        # math.exp cannot be evaluated for very large frequencies, so such
        # words are skipped; this also avoids their intersection work.
        f = getFrequency(w1)
        if f > threshold:
            continue
        set1 = set(a1)
        for set2 in answerSets:
            intersection = len(set1.intersection(set2))
            score += intersection / math.exp(f)
    return score


In [9]:
myAnswer = []
for q in qs:
    # Segment the question and every option in one batch. The segmenter output
    # is paired with the keys positionally (assumed to come back in input order).
    wordsByKey = dict(zip(q.keys(), wsAndPos(list(q.values()))))

    # Resolve the question explicitly instead of relying on 'Question' being
    # the first key: a record without it now fails with KeyError rather than
    # silently reusing the previous question's words.
    questionWords = wordsByKey['Question']
    questionAid = findAidListByWordList(questionWords)

    scoreList = {}
    scoreSum = 0.000001  # small offset so the normalisation never divides by zero
    maxK = 'C'  # default guess when no option scores above zero
    maxV = 0
    for k, words in wordsByKey.items():
        if k == 'Question':
            continue
        answerAid = findAidListByWordList(words)
        score = evaluate(questionAid, questionWords, answerAid)
        scoreList[k] = score
        scoreSum += score
        if score > maxV:
            maxK = k
            maxV = score
    myAnswer.append(maxK)

    # Normalise the raw scores to percentages for display only.
    # NOTE(review): because scoreSum starts at 1e-6, the percentages fall short
    # of 100% in total when all raw scores are tiny.
    scoreList = {k: '%g%%' % (v/scoreSum*100,) for k, v in scoreList.items()}

    print(scoreList)


{'A': '5.49435e-34%', 'B': '99.9726%', 'C': '1.22097e-34%'}
{'A': '33.467%', 'B': '16.9343%', 'C': '49.5982%'}
{'A': '0%', 'B': '99.9973%', 'C': '2.51306e-137%'}
{'A': '5.748e-207%', 'B': '0.00517282%', 'C': '0.0037748%'}
{'A': '99.2711%', 'B': '0%', 'C': '8.17193e-274%'}
{'A': '7.20219e-47%', 'B': '1.06112e-131%', 'C': '2.57221e-48%'}
{'A': '15.1944%', 'B': '7.59719%', 'C': '75.9719%'}
{'A': '98.0432%', 'B': '7.58045e-15%', 'C': '0%'}
{'A': '99.9973%', 'B': '2.78343e-07%', 'C': '1.98535e-168%'}
{'A': '6.91435e-05%', 'B': '0.000414861%', 'C': '0.000276574%'}
{'A': '0.000510865%', 'B': '0.0071521%', 'C': '0.00102173%'}
{'A': '1.34284e-70%', 'B': '4.82305e-31%', 'C': '2.01426e-70%'}
{'A': '2.22736e-31%', 'B': '0.000276575%', 'C': '4.5047e-74%'}
{'A': '0%', 'B': '0%', 'C': '1.52002e-39%'}
{'A': '99.9988%', 'B': '0%', 'C': '0%'}
{'A': '72.6737%', 'B': '9.08422%', 'C': '18.1684%'}
{'A': '1.14285e-40%', 'B': '1.59999e-40%', 'C': '6.85708e-41%'}
{'A': '0%', 'B': '0%', 'C': '0%'}
{'A': '7.5334

In [10]:
# Display the predicted answers as a JSON array string (the cell's output).
json.dumps(myAnswer)


'["B", "C", "B", "B", "A", "A", "C", "A", "A", "B", "B", "B", "B", "C", "A", "A", "B", "C", "C", "B", "C", "C", "C", "A", "A"]'

In [11]:
# Reference answers for the example question set, in question order.
exampleAnswer = ["B", "C", "B", "B", "A", "A", "C", "A", "A", "B", "B",
                 "B", "B", "C", "A", "A", "B", "B", "C", "B", "C", "C", "C", "A", "A"]
# Count the positions where the prediction equals the reference answer and
# report it as "<matches> / <number of predictions>".
same = sum(1 for mine, expected in zip(myAnswer, exampleAnswer) if mine == expected)
print(same, "/", len(myAnswer))


24 / 25
