In [1]:
import json
import re
import sqlite3

from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger


In [2]:
# config

# JSON file containing the questions (and their options) to answer.
question = './questions_example.json'
# Character class of whitespace and punctuation that wsAndPos replaces with a
# space before segmentation. Written as a raw string so the regex escapes
# (\s, \-, \?, ...) reach `re` untouched instead of being parsed as (invalid)
# Python string escapes.
reg = r"[\s\-，.。\:：！!；;\?？（）\(\)\"\'《》〈〉．～—─\=「」『』、”“·／\#\[\]\$,－]"
# SQLite file queried through the `mapping` table (word -> aids).
db = './convert-to-db/inverted-index.db'

In [3]:
# Build the word segmenter and the POS tagger with the same settings.
# NOTE(review): device=0 presumably selects the first GPU - confirm against the
# ckip_transformers documentation before running on a CPU-only machine.
ws_driver, pos_driver = (
    CkipWordSegmenter(level=1, device=0),
    CkipPosTagger(level=1, device=0),
)


In [4]:
# Parse the whole question file into `qs`; the handle is closed when the
# `with` block exits.
with open(question, encoding='utf8') as question_file:
    qs = json.load(question_file)


In [5]:
def wsAndPos(text): # text is string list
    """Segment each string and keep only tokens whose POS tag starts with N or F.

    Args:
        text: iterable of strings. It is not modified; a cleaned copy is
            passed to the segmenter.

    Returns:
        A list with one entry per input string, each entry being the list of
        kept tokens in their original order.
    """
    # Replace every character matched by `reg` with a space. A new list is
    # built so the caller's data is left untouched.
    cleaned = [re.sub(reg, " ", sentence) for sentence in text]

    ws = ws_driver(cleaned, show_progress=False)
    pos = pos_driver(ws, show_progress=False)

    # Keep tokens whose tag begins with 'N' or 'F' (presumably nouns and
    # foreign words in the CKIP tag set - confirm). An earlier variant instead
    # dropped tags beginning with C/T/I/P/S/W/D.
    # str.startswith also tolerates an empty tag, which indexing [0] did not.
    keptPrefixes = ('N', 'F')
    return [
        [token for token, tag in zip(tokens, tags) if tag.startswith(keptPrefixes)]
        for tokens, tags in zip(ws, pos)
    ]


In [6]:
# Open the inverted-index database once; the single cursor below is shared by
# every findAidList lookup. (`sqlite3` is imported with the other modules at
# the top of the notebook.)
conn = sqlite3.connect(db)
cursor = conn.cursor()

def findAidList(word):
    """Look up the ids stored for `word` in the `mapping` table.

    Args:
        word: the exact value to match against the `word` column.

    Returns:
        The `aids` column of the first matching row split on whitespace, or an
        empty list when there is no matching row or the column is NULL.
    """
    sql = "SELECT aids from mapping WHERE word = ?"
    row = cursor.execute(sql, (word,)).fetchone()
    # A missing row and a NULL `aids` value both mean "no ids for this word";
    # without the second check a NULL would crash on None.split().
    if row is None or row[0] is None:
        return []
    return row[0].split()


In [7]:
def findAidListByWordList(wordList):
    """Return one id list per word, in the same order as `wordList`.

    Each entry is whatever findAidList returns for the corresponding word.
    """
    return [findAidList(token) for token in wordList]


In [8]:

# list1: question aid list 2d
# list2: candidate aid list 2d
def evaluate(list1, list2):
    """Score how strongly the candidate id lists overlap the question id lists.

    For every (question list, candidate list) pair, the number of distinct ids
    they share is divided by the number of distinct ids in the question list
    (plus 0.0001 so an empty question list cannot divide by zero), and all
    pair scores are added up.

    Alternatives noted as performing worse:
      (1) Jaccard: intersection size / (union size + 0.0001)
      (2) raw intersection size only

    Args:
        list1: iterable of id collections, one per question word.
        list2: iterable of id collections, one per candidate word.

    Returns:
        The accumulated score; 0 when either argument is empty.
    """
    # Build each set once instead of once per pair. This also means one-shot
    # iterables are fully consumed up front rather than silently running dry
    # after the first outer iteration.
    questionSets = [set(l1) for l1 in list1]
    candidateSets = [set(l2) for l2 in list2]

    score = 0
    for set1 in questionSets:
        denominator = len(set1) + 0.0001
        for set2 in candidateSets:
            score += len(set1.intersection(set2)) / denominator
    return score


In [9]:
myAnswer = []
for q in qs:
    # Segment the question and all of its options in a single batch.
    wordList2D = wsAndPos(list(q.values()))
    wordsByKey = dict(zip(q.keys(), wordList2D))

    # Resolve the question up front so scoring does not depend on 'Question'
    # being the first key, and can never reuse the previous question's ids.
    # A record without a 'Question' key fails here with a KeyError.
    questionAid = findAidListByWordList(wordsByKey.pop('Question'))

    scoreList = {}
    scoreSum = 0.000001  # small offset so the normalisation never divides by zero
    maxK = 'C' # default guess is C when no option scores above zero
    maxV = 0
    for k, words in wordsByKey.items():
        answerAid = findAidListByWordList(words)
        score = evaluate(questionAid, answerAid)
        scoreList[k] = score
        scoreSum += score
        # Strict comparison: on a tie the earlier option wins.
        if score > maxV:
            maxK = k
            maxV = score
    myAnswer.append(maxK)

    # normalize: express each option's score as a percentage of the total
    scoreList = {k: str(v/scoreSum*100) + '%' for k, v in scoreList.items()}

    print(scoreList)


{'A': '25.523869966777962%', 'B': '71.3412681077528%', 'C': '3.1348470161153252%'}
{'A': '29.508317302482865%', 'B': '26.669432082061345%', 'C': '43.822249091716934%'}
{'A': '1.5224073113202674%', 'B': '47.40579389643314%', 'C': '51.07177197621411%'}
{'A': '3.0072101857141553%', 'B': '51.002859115224894%', 'C': '45.98954782907557%'}
{'A': '99.16458713232801%', 'B': '0.0019363990524992741%', 'C': '0.8334276326087422%'}
{'A': '76.55857174858066%', 'B': '6.890561771656453%', 'C': '16.55083510077561%'}
{'A': '42.7242529040899%', 'B': '9.930161430618712%', 'C': '47.34557711708026%'}
{'A': '98.5491387631167%', 'B': '0.22292581224648347%', 'C': '1.2276369675694268%'}
{'A': '73.29794831595078%', 'B': '22.124892610345917%', 'C': '4.577128643760835%'}
{'A': '11.71963070049473%', 'B': '30.71820265238065%', 'C': '57.562146059482785%'}
{'A': '29.976113995883917%', 'B': '38.94334358501288%', 'C': '31.080504902359806%'}
{'A': '2.4578659100612654%', 'B': '93.60308847874327%', 'C': '3.938998344415369%'

In [10]:
# Serialize the predicted answers to a JSON array string; as the last
# expression in the cell it is displayed as the cell's output.
json.dumps(myAnswer)


'["B", "C", "C", "B", "A", "A", "C", "A", "A", "C", "B", "B", "B", "C", "A", "A", "B", "A", "C", "B", "C", "C", "C", "A", "A"]'

In [11]:
# Reference answers for the example question set, in question order.
exampleAnswer = ["B", "C", "B", "B", "A", "A", "C", "A", "A", "B", "B",
                 "B", "B", "C", "A", "A", "B", "B", "C", "B", "C", "C", "C", "A", "A"]
# Count the positions where the prediction matches the reference. zip stops at
# the shorter of the two lists.
same = sum(1 for mine, expected in zip(myAnswer, exampleAnswer) if mine == expected)
print(same, "/", len(myAnswer))


22 / 25
