本版本考慮詞頻之影響，取消詞性的篩選，並透過自然指數的方式來調整權重

In [1]:
import json
from ckip_transformers.nlp import CkipWordSegmenter
import re


In [2]:
# config

# Path to the JSON file holding the questions and their answer options.
question = './questions_example.json'
# Character class of whitespace plus ASCII and full-width punctuation; every
# match is replaced by a space before word segmentation.
# Written as a raw string so the regex escapes (\s, \-, \[ ...) reach the
# `re` module untouched instead of being parsed as (invalid) string escapes.
reg = r"[\s\-，.。\:：！!；;\?？（）\(\)\"\'《》〈〉．～—─\=「」『』、”“·／\#\[\]\$,－]"
# Path to the SQLite database containing the inverted index.
db = './convert-to-db/inverted-index.db'

In [3]:
# Create the CKIP word segmenter once; wsAndPos() reuses this single instance
# for every batch of text.
# NOTE(review): `level=1` and `device=0` are passed straight through to
# CkipWordSegmenter — presumably the model level and the GPU index; confirm
# against the ckip_transformers documentation before running on a CPU-only host.
ws_driver = CkipWordSegmenter(level=1, device=0)


In [4]:
# Read the question file (UTF-8 JSON) into `qs`; the answering loop iterates
# over `qs` and treats every element as a mapping of label -> text.
with open(question, encoding='utf8') as questionFile:
    qs = json.load(questionFile)


In [5]:
def wsAndPos(text):  # text is a list of strings
    """Segment every string in *text* into words.

    Each string first has all characters matched by the module-level ``reg``
    pattern (whitespace and punctuation) replaced by a single space, then the
    cleaned strings are handed to ``ws_driver`` in one batch.

    Despite the name, no part-of-speech tagging is performed here; only the
    word segmenter is invoked.

    The caller's list is left untouched: the cleaned strings are collected in
    a new list instead of being written back into *text*.

    Args:
        text: Iterable of strings to segment.

    Returns:
        Whatever ``ws_driver`` returns for the cleaned strings — presumably
        one list of words per input string, in input order (the caller zips
        the result against the input keys).
    """
    cleaned = [re.sub(reg, " ", sentence) for sentence in text]
    return ws_driver(cleaned, show_progress=False)


In [6]:
import sqlite3

# Open the inverted-index database at the path configured in `db` and keep a
# single module-level cursor; findAidList() and getFrequency() both run their
# queries through this cursor.
# NOTE(review): nothing in the visible cells closes `conn` — confirm whether
# that is intentional for this notebook.
conn = sqlite3.connect(db)
cursor = conn.cursor()

def findAidList(word):
    """Return the ids stored in the ``aids`` column for *word*.

    Looks up *word* in the ``mapping`` table through the module-level
    ``cursor``. The ``aids`` value of the first matching row is split on
    whitespace and returned as a list of strings; an empty list is returned
    when no row matches.
    """
    row = cursor.execute(
        "SELECT aids from mapping WHERE word = ?", (word,)
    ).fetchone()
    return row[0].split() if row is not None else []

def getFrequency(word):
    """Return the ``num`` value stored for *word* as an int.

    Queries the ``mapping`` table through the module-level ``cursor``.
    Words without a matching row yield 0.
    """
    row = cursor.execute(
        "SELECT num from mapping WHERE word = ?", (word,)
    ).fetchone()
    if row is None:
        return 0
    return int(row[0])


In [7]:
def findAidListByWordList(wordList):
    """Look up every word of *wordList* with findAidList().

    Returns a list with one entry per word, in the same order as *wordList*;
    each entry is the (possibly empty) id list findAidList() returned for
    that word.
    """
    return [findAidList(token) for token in wordList]


In [8]:
import math

# The natural exponential can only be evaluated up to roughly the 710th
# power, so the frequency cut-off used by evaluate() is kept below that.
threshold = 700

def evaluate(questionAids, questionWords, answerAids):
    """Score one answer option against the question.

    For every question word whose frequency does not exceed ``threshold``,
    the size of the overlap between that word's id list and each answer
    word's id list is added to the score, divided by ``exp(frequency)`` so
    that frequent words contribute exponentially less.

    Args:
        questionAids: One id list per question word.
        questionWords: The question words, aligned with *questionAids*.
        answerAids: One id list per word of the answer option.

    Returns:
        The accumulated score as a float (0.0 when nothing contributes).
    """
    # The answer-side sets do not depend on the question word, so build them
    # once instead of once per question word.
    answerSets = [set(a2) for a2 in answerAids]

    score = 0.0
    for a1, w1 in zip(questionAids, questionWords):
        f = getFrequency(w1)
        # exp() cannot be computed for very large frequencies, so such words
        # are skipped; this also saves the intersection work for them.
        if f > threshold:
            continue
        set1 = set(a1)
        weight = math.exp(f)
        for set2 in answerSets:
            score += len(set1 & set2) / weight
    return score


In [10]:
# Answer every question: score each option with evaluate(), remember the
# best-scoring option label, and print the per-option share of the total score.
myAnswer = []
for q in qs:
    # Segment the question and all options in a single batch, then pair each
    # segmented text with its key again.
    wordList2D = wsAndPos(list(q.values()))
    segmented = dict(zip(q.keys(), wordList2D))

    # Fetch the question explicitly so the result does not depend on
    # 'Question' being the first key (a missing key raises KeyError instead
    # of silently reusing the previous question's data).
    questionWords = segmented['Question']
    questionAid = findAidListByWordList(questionWords)

    scoreList = {}
    maxK = 'C'  # default guess when no option scores above zero
    maxV = 0
    for k, words in segmented.items():
        if k == 'Question':
            continue
        answerAid = findAidListByWordList(words)
        score = evaluate(questionAid, questionWords, answerAid)
        scoreList[k] = score
        # Strict comparison: on a tie the earlier option is kept.
        if score > maxV:
            maxK = k
            maxV = score
    myAnswer.append(maxK)

    # Normalize to percentages. The total is checked explicitly rather than
    # seeded with a tiny epsilon, which would distort the shares whenever all
    # scores are themselves extremely small.
    scoreSum = sum(scoreList.values())
    for k, v in scoreList.items():
        share = v / scoreSum * 100 if scoreSum > 0 else 0.0
        scoreList[k] = '%g%%' % (share,)

    print(scoreList)


{'A': '5.49585e-34%', 'B': '100%', 'C': '1.2213e-34%'}
{'A': '33.4672%', 'B': '16.9344%', 'C': '49.5984%'}
{'A': '0%', 'B': '100%', 'C': '2.51313e-137%'}
{'A': '6.42406e-203%', 'B': '57.8123%', 'C': '42.1877%'}
{'A': '100%', 'B': '0%', 'C': '8.23193e-274%'}
{'A': '96.5517%', 'B': '1.42253e-83%', 'C': '3.44828%'}
{'A': '15.3846%', 'B': '7.69231%', 'C': '76.9231%'}
{'A': '100%', 'B': '7.73174e-15%', 'C': '0%'}
{'A': '100%', 'B': '2.7835e-07%', 'C': '1.9854e-168%'}
{'A': '9.09091%', 'B': '54.5455%', 'C': '36.3636%'}
{'A': '5.88235%', 'B': '82.3529%', 'C': '11.7647%'}
{'A': '2.7842e-38%', 'B': '100%', 'C': '4.17631e-38%'}
{'A': '8.05335e-26%', 'B': '100%', 'C': '1.62874e-68%'}
{'A': '0%', 'B': '0%', 'C': '100%'}
{'A': '100%', 'B': '0%', 'C': '0%'}
{'A': '72.7273%', 'B': '9.09091%', 'C': '18.1818%'}
{'A': '33.3333%', 'B': '46.6667%', 'C': '20%'}
{'A': '0%', 'B': '0%', 'C': '0%'}
{'A': '7.53347e-52%', 'B': '0.664835%', 'C': '99.3352%'}
{'A': '0%', 'B': '100%', 'C': '0%'}
{'A': '11.7647%', 'B

In [11]:
# Serialize the chosen answers to a JSON string; as the last expression of
# the cell, the string is shown as the cell's output.
json.dumps(myAnswer)


'["B", "C", "B", "B", "A", "A", "C", "A", "A", "B", "B", "B", "B", "C", "A", "A", "B", "C", "C", "B", "C", "C", "C", "A", "A"]'

In [12]:
# Compare the predicted answers with the reference answers position by
# position and report how many agree out of the number of predictions.
exampleAnswer = ["B", "C", "B", "B", "A", "A", "C", "A", "A", "B", "B",
                 "B", "B", "C", "A", "A", "B", "B", "C", "B", "C", "C", "C", "A", "A"]
same = sum(1 for mine, expected in zip(myAnswer, exampleAnswer) if mine == expected)
print(same, "/", len(myAnswer))


24 / 25
