# Get Result from Inverted-Index Table

1. 將關鍵字先做文字分割
2. 於資料庫的 inverted_table 表做交集查找
3. 由於交集出來的關鍵字可能位置不相鄰，因此再透過資料庫的 data 表查找一次

In [1]:
# Ground-truth pairs: (query keyword, index_id values expected for it).
keyword_answer = [
    (
        "人心惶惶",
        [13958, 21027, 27575, 34876, 37680, 38208, 38754, 45590, 47095, 49915],
    ),
    ("縱火犯", [13958, 37537]),
    ("印尼羽毛球", [13960, 13974, 17725, 18516, 21413, 26119]),
    ("生長激素", [13961, 27962]),
    ("胰高血糖素", [13961, 19650, 21408]),
]


In [2]:
# Split the ground-truth pairs into two parallel lists: the query keywords
# and, at the same position, the expected answers for each keyword.
keywords = [entry[0] for entry in keyword_answer]
answers = [entry[1] for entry in keyword_answer]
print(keywords)

['人心惶惶', '縱火犯', '印尼羽毛球', '生長激素', '胰高血糖素']


In [3]:
from ckiptagger import data_utils, WS

# Download models (uncomment to run the download)
#data_utils.download_data_gdown("./")

# Directory the word-segmentation model is loaded from.
model_dir = "./data"
ws = WS(model_dir)

# Replace every keyword string by its list of segmented tokens.
keywords = ws(
    keywords,
    sentence_segmentation=False,
)

print(keywords)




[['人心惶惶'], ['縱火犯'], ['印尼', '羽毛球'], ['生長', '激素'], ['胰', '高血糖素']]


In [22]:
import sqlite3
database = './table.db'
conn = sqlite3.connect(database)
cursor = conn.cursor()

# The lookup has to deal with contiguity: after word segmentation, every
# token of a keyword can occur in an article without the tokens being
# adjacent. The INTERSECT over inverted_table therefore only yields
# candidates, which are then confirmed by a LIKE match of the whole keyword
# against the data table.
# The reference answers are index_id values, not article ids, so index_id is
# the column that gets selected.

# Sub-select returning the aid values stored under a single term.
term_select = 'SELECT aid FROM inverted_table WHERE term=? '

my_answers = []
for key in keywords:
    phrase = ''.join(key)
    like = '%' + phrase + '%'

    # One sub-select per token, chained with INTERSECT.
    candidates = term_select + ('INTERSECT ' + term_select) * (len(key) - 1)
    sql = """
        SELECT d.index_id FROM (""" + candidates + """) as a, data as d
        WHERE (d.articles like ? or d.title like ?) and a.aid = d.aid 
    """

    # Bind order: one value per token, then the pattern for articles and title.
    hits = [row[0] for row in cursor.execute(sql, (*key, like, like))]

    print(phrase, hits)
    my_answers.append(hits)


人心惶惶 [13958, 21027, 27575, 34876, 37680, 38208, 38754, 45590, 47095, 49915]
縱火犯 [13958, 37537]
印尼羽毛球 [13960, 13974, 17725, 18516, 21413, 26119]
生長激素 [13961, 27962]
胰高血糖素 [13961]


# Check Answer

In [21]:
def jacacard(list1, list2):
    """Return the Jaccard similarity of two collections of hashable items.

    The inputs are treated as sets, so duplicates and ordering are ignored.
    The result is ``len(A & B) / len(A | B)``.

    Args:
        list1: First iterable of hashable items (e.g. expected ids).
        list2: Second iterable of hashable items (e.g. retrieved ids).

    Returns:
        A float in ``[0.0, 1.0]``. Two empty inputs are considered identical
        and yield ``1.0`` instead of dividing by zero.
    """
    set1, set2 = set(list1), set(list2)
    union = set1 | set2
    if not union:
        # Both inputs are empty: nothing differs between them.
        return 1.0
    return len(set1 & set2) / len(union)

# Score each keyword: compare its expected answers with the retrieved ones
# at the same position in my_answers and print the similarity.
for position, (keyword, expected) in enumerate(keyword_answer):
    retrieved = my_answers[position]
    print(keyword, jacacard(expected, retrieved))


人心惶惶 1.0
縱火犯 1.0
印尼羽毛球 1.0
生長激素 1.0
胰高血糖素 0.3333333333333333
