本版本考慮詞頻

In [1]:
import json
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger
import re


In [2]:
# config

# JSON file with the questions to answer; each entry is read as a dict with a
# 'Question' key plus one key per answer option.
question = './questions_example.json'
# Character class of whitespace and ASCII/full-width punctuation that is
# replaced by a space before word segmentation. Written as a raw string so the
# backslash escapes reach the regex engine unchanged instead of relying on
# Python passing unknown string escapes through (a SyntaxWarning on 3.12+).
reg = r"[\s\-，.。\:：！!；;\?？（）\(\)\"\'《》〈〉．～—─\=「」『』、”“·／\#\[\]\$,－]"
# SQLite database queried through its `mapping` table (columns `word`, `aids`).
dbInvertedIndex = './convert-to-db/inverted-index.db'
# SQLite database queried through its `frequency` table (columns `word`, `num`).
dbWordFrequency = './word-frequency/word-frequency.db'

In [3]:
# CKIP Transformers drivers shared by the whole notebook: ws_driver splits text
# into words and pos_driver tags the segmented words (both are called from
# wsAndPos).
# NOTE(review): device=0 presumably selects the first GPU (-1 would be CPU) and
# level=1 presumably picks a model size — confirm against the installed
# ckip_transformers version.
ws_driver = CkipWordSegmenter(level=1, device=0)
pos_driver = CkipPosTagger(level=1, device=0)


In [4]:
# Read the question set from the JSON file configured in `question`; the
# answering loop iterates `qs` and treats every entry as a dict.
with open(question, encoding='utf8') as questionFile:
    qs = json.load(questionFile)


In [5]:
def wsAndPos(text): # text is string list
    """Segment each string and keep only the tokens tagged 'N*' or 'F*'.

    Every character matched by ``reg`` is replaced by a space, the cleaned
    strings are segmented with ``ws_driver`` and tagged with ``pos_driver``,
    and only tokens whose part-of-speech tag starts with 'N' or 'F' are kept
    (in the CKIP tag set these are presumably nouns and foreign words —
    confirm).

    Args:
        text: list of strings. The list is not modified.

    Returns:
        A list with one entry per input string, each a list of the kept
        tokens in their original order.
    """
    # Clean into a new list so the caller's list is left untouched.
    cleaned = [re.sub(reg, " ", sentence) for sentence in text]

    ws = ws_driver(cleaned, show_progress=False)
    pos = pos_driver(ws, show_progress=False)

    # startswith() also tolerates an empty tag, which indexing [0] would not.
    return [
        [token for token, tag in zip(tokens, tags) if tag.startswith(('N', 'F'))]
        for tokens, tags in zip(ws, pos)
    ]


In [6]:
import sqlite3

# Connection and cursor for the inverted-index database; findAidList runs its
# lookups on this module-level cursor. The connection is left open for the
# rest of the notebook session.
conn = sqlite3.connect(dbInvertedIndex)
cursor = conn.cursor()

def findAidList(word):
    """Return the ids stored for ``word`` in the inverted index.

    Reads the ``aids`` column of the ``mapping`` table through the
    module-level ``cursor`` and splits it on whitespace (the ids are
    presumably article ids — confirm against the index builder).

    Args:
        word: the word to look up.

    Returns:
        A list of id strings, or an empty list when the word has no row.
    """
    row = cursor.execute(
        "SELECT aids from mapping WHERE word = ?", (word,)
    ).fetchone()
    # fetchone() yields None when the word is not in the index.
    return row[0].split() if row is not None else []

# Connection and cursor for the word-frequency database; getFrequency runs its
# lookups on this module-level cursor.
conn2 = sqlite3.connect(dbWordFrequency)
cursor2 = conn2.cursor()
def getFrequency(word):
    """Return the stored frequency of ``word`` as an int.

    Reads the ``num`` column of the ``frequency`` table through the
    module-level ``cursor2``.

    Args:
        word: the word to look up.

    Returns:
        The frequency converted with ``int``, or 0 when the word has no row.
    """
    query = "SELECT num from frequency WHERE word = ?"
    row = cursor2.execute(query, (word,)).fetchone()
    if row is None:
        # Unknown words count as frequency zero.
        return 0
    return int(row[0])


In [7]:
def findAidListByWordList(wordList):
    """Look up every word of ``wordList`` in the inverted index.

    Args:
        wordList: iterable of words.

    Returns:
        A list holding the ``findAidList`` result for each word, in the same
        order as the input.
    """
    return [findAidList(word) for word in wordList]


In [8]:

def evaluate(questionAids, questionWords, answerAids):
    """Score one answer option against the question.

    For every (question word, answer word) pair, the number of ids the two
    words share is divided by the question word's frequency (plus 0.0001 so
    the division never hits zero), and the quotients are summed.

    Args:
        questionAids: one id list per question word.
        questionWords: the question words, aligned with ``questionAids``.
        answerAids: one id list per word of the answer option.

    Returns:
        The accumulated score; 0 when either side has no words.
    """
    # Build the answer sets once instead of once per question word.
    answerSets = [set(aids) for aids in answerAids]
    score = 0
    if not answerSets:
        # Nothing to compare against, so skip the frequency lookups entirely.
        return score

    for aids, word in zip(questionAids, questionWords):
        questionSet = set(aids)
        # The weight depends only on the question word: look it up once here
        # rather than once per answer word.
        weight = getFrequency(word) + 0.0001
        for answerSet in answerSets:
            score += len(questionSet & answerSet) / weight
    return score


In [9]:
# Answer every question: each option is scored against the question with
# evaluate() and the highest-scoring option key is appended to myAnswer.
myAnswer = []
for q in qs:
    # Segment the question and every option first, in a single batch.
    wordsByKey = dict(zip(q.keys(), wsAndPos(list(q.values()))))

    # Take the question out before scoring so the result does not depend on
    # 'Question' being the first key, and an entry without a question can
    # never be scored against the previous entry's words.
    questionWords = wordsByKey.pop('Question', [])
    questionAid = findAidListByWordList(questionWords)

    scoreList = {}
    scoreSum = 0.000001  # small offset so the normalisation never divides by zero
    maxK = 'C'  # default guess is C when no option scores above zero
    maxV = 0
    for k, words in wordsByKey.items():
        answerAid = findAidListByWordList(words)
        score = evaluate(questionAid, questionWords, answerAid)
        scoreList[k] = score
        scoreSum += score
        # Strict comparison: on a tie the earlier option wins.
        if score > maxV:
            maxK = k
            maxV = score
    myAnswer.append(maxK)

    # normalize: show each option's share of the total score as a percentage,
    # leaving the numeric scores in scoreList untouched.
    print({k: str(v / scoreSum * 100) + '%' for k, v in scoreList.items()})


{'A': '25.71453999284393%', 'B': '71.14825653770286%', 'C': '3.137175357403831%'}
{'A': '29.924171230718517%', 'B': '26.078606071099447%', 'C': '43.99721960948568%'}
{'A': '1.4757167429667821%', 'B': '47.03409861281021%', 'C': '51.4901441282786%'}
{'A': '2.5648828985902408%', 'B': '56.20596626124744%', 'C': '41.228583541615436%'}
{'A': '99.29437630188657%', 'B': '0.0013530705187736137%', 'C': '0.7041965133211014%'}
{'A': '76.66798849321414%', 'B': '6.754797078038781%', 'C': '16.577164959168034%'}
{'A': '40.93791546057415%', 'B': '9.9462061191511%', 'C': '49.11586384074981%'}
{'A': '98.31867234529544%', 'B': '0.26135930626442166%', 'C': '1.4193732533893126%'}
{'A': '77.47389160724985%', 'B': '18.8654902557756%', 'C': '3.6605705856958632%'}
{'A': '11.206370936090527%', 'B': '28.150057231559618%', 'C': '60.643530837834035%'}
{'A': '30.28276927323722%', 'B': '38.263703761053854%', 'C': '31.453470766283736%'}
{'A': '2.6984363980708395%', 'B': '92.86810540473664%', 'C': '4.433373611614417%'}

In [10]:
# Serialise the predicted answers to a JSON string; as the last expression of
# the cell it is shown as the cell's output.
json.dumps(myAnswer)


'["B", "C", "C", "B", "A", "A", "C", "A", "A", "C", "B", "B", "B", "C", "A", "A", "B", "A", "C", "B", "C", "C", "C", "A", "A"]'

In [11]:
# Compare the predictions with the reference answers of the example set and
# print "<matches> / <number of predictions>".
exampleAnswer = ["B", "C", "B", "B", "A", "A", "C", "A", "A", "B", "B",
                 "B", "B", "C", "A", "A", "B", "B", "C", "B", "C", "C", "C", "A", "A"]
same = sum(
    1
    for predicted, expected in zip(myAnswer, exampleAnswer)
    if predicted == expected
)
print(same, "/", len(myAnswer))


22 / 25
