# Initialization

In [1]:
import json
import logging
import math
import requests
import jieba
import time

In [2]:
# Mount Google Drive at /content/drive; the index and question files read by
# later cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# Normal run: the root logger only shows WARNING and above.
logging.getLogger().setLevel(logging.WARNING)
# Debug run: enable the line below instead to also show INFO messages
# (e.g. the per-pair scores logged by two_term_interaction_freq).
# logging.getLogger().setLevel(logging.INFO)

In [4]:
class Wiki(dict):
    """Dictionary of reverse indexes that tolerates unknown keywords.

    Reading a keyword that is not stored returns an empty ``ReverseIndex``
    built for that keyword instead of raising ``KeyError``. The placeholder
    is only returned, it is not inserted into the dictionary.
    """

    def __init__(self):
        super().__init__()

    def __getitem__(self, key):
        try:
            return super().__getitem__(key)
        except KeyError:
            # Unknown keyword: answer with an empty placeholder index.
            return ReverseIndex(key, [])
class ReverseIndex(object):
    """Posting list of a single keyword.

    ``key`` holds the keyword and ``indexes`` the entries stored for it.
    ``len()``, subscripting and iteration are forwarded to ``indexes``;
    ``a & b`` yields the set of entries present in both indexes.
    """

    def __init__(self, key, indexes):
        super().__init__()
        self.indexes = indexes
        self.key = key

    def __iter__(self):
        return iter(self.indexes)

    def __len__(self):
        return len(self.indexes)

    def __getitem__(self, key):
        return self.indexes[key]

    def __and__(self, rev_ind):
        mine = set(self.indexes)
        theirs = set(rev_ind.indexes)
        return mine & theirs
def tf(term, doc):
    """Return the raw term frequency of ``term`` inside ``doc``.

    Arguments:
        term {str} -- keyword to evaluate
        doc {list} -- words split from the source document

    Returns:
        int -- number of times ``term`` occurs in ``doc``
    """
    occurrences = doc.count(term)
    return occurrences
def df(term, docs):
    """Return the document frequency of ``term`` from an inverted index.

    This variant reads the frequency straight from the wiki inverted index
    instead of scanning every document. The lookup is read-only: a term that
    is missing from ``docs`` counts as zero and is NOT inserted, so
    ``len(docs)`` is never inflated just because a term was queried.

    Arguments:
        term {str} -- keyword to evaluate
        docs {dict} -- inverted index mapping a term to the collection of
            entries recorded for it

    Returns:
        int -- number of entries recorded for ``term``; 0 if it is unknown
    """
    return len(docs.get(term, ()))
def idf(df, docs_len):
    """Turn a document frequency into an inverse document frequency.

    Arguments:
        df {int} -- document frequency; must be non-zero because it is used
            as a divisor
        docs_len {int} -- total number of docs

    Returns:
        float -- base-10 logarithm of ``docs_len / df``
    """
    ratio = docs_len / df
    return math.log10(ratio)
def tf_idf(term, doc, docs):
    """Rate how important ``term`` is within ``doc``.

    The higher the value, the more important the term is to the doc.
    An important term:
        * appears in few docs
        * appears many times in doc

    The weight is ``(1 + ln(tf)) * log10(len(docs) / df)``. A term that does
    not occur in ``doc`` (tf == 0) or is unknown to ``docs`` (df == 0) has no
    weight and yields 0 instead of raising a math domain / division error.

    Arguments:
        term {str} -- keyword to evaluate
        doc {list} -- document the keyword is rated against, in this case:
            the question
        docs {dict} -- index used as the idf reference, in this case: WIKI

    Returns:
        float -- tf-idf value, or 0 when the term carries no weight
    """
    term_tf = tf(term, doc)
    if term_tf <= 0:
        # math.log(0) is undefined; an absent term simply has no weight.
        return 0

    term_df = df(term, docs)
    if term_df <= 0:
        # Unknown to the index: idf would divide by zero.
        return 0

    term_idf = idf(term_df, len(docs))
    return (1 + math.log(term_tf)) * term_idf

# Build Inverted Index

In [None]:
# '''WIKI = Wiki()'''
# WIKI = {}
# with open("/content/drive/MyDrive/wiki.plain.text.line.spark.rdd.format") as file:
#   lines = file.readlines()
#   for line in lines:
#     indexes = line.split()
#     key = indexes.pop(0)
#     for index in indexes:
#       WIKI.setdefault(index, [])
#       WIKI[index].append(key)
#     '''indexes = list(map(int, indexes))
#     WIKI[key] = ReverseIndex(key, indexes)'''
#   for item in WIKI:
#     WIKI[item] = list(set(WIKI[item]))
# print("{} Keywords loaded".format(len(WIKI)))

In [None]:
# print(WIKI["美國"])

In [None]:
# import json
# output_file = open("/content/drive/MyDrive/inverted_index.json", "w", encoding='UTF-8')
# json.dump(WIKI, output_file, ensure_ascii=False)
# output_file.close()

# Load Inverted Index

In [5]:
import json
# Load the prebuilt inverted index; the context manager closes the file even
# if json.load raises.
with open("/content/drive/MyDrive/inverted_index.json", "r", encoding='UTF-8') as file:
    WIKI = json.load(file)
print(type(WIKI))

<class 'dict'>


# Question Solving

## Method 1: Pure Term Frequency

* Return occurrence percentage of each answer

In [6]:
def solve_method_one(splited_question_dict):
    """Score every option by the number of WIKI entries listed for its term.

    Arguments:
        splited_question_dict {dict} -- question record with a "Question"
            key plus one key per option; the input itself is not modified

    Returns:
        dict -- option -> its share of the summed entry counts (every share
            is 0 when no option has any entry)
    """
    options = splited_question_dict.copy()
    options.pop("Question")

    # setdefault also registers an unseen option term in WIKI with an empty
    # list, exactly like a separate setdefault + lookup would.
    scores = {
        option: len(WIKI.setdefault(option_term, []))
        for option, option_term in options.items()
    }

    total = sum(scores.values()) or 1
    return {option: scores[option] / total for option in options}

## Method 2: TF-IDF

1. Fetch keywords from question
2. use (keyword, option) pair to calculate relation

In [7]:
QUESTION_KEYWORD_THRESHOLD = 2.5


def find_question_keywords(splited_question):
    """Pick the terms of a question that are significant enough to be keywords.

    Each term is rated with ``tf_idf`` against the question itself and WIKI;
    only terms scoring strictly above ``QUESTION_KEYWORD_THRESHOLD`` are kept.

    Arguments:
        splited_question {list} -- terms the question was split into

    Returns:
        dict -- keyword -> its tf-idf score
    """
    rated_terms = (
        (term, tf_idf(term, splited_question, WIKI)) for term in splited_question
    )
    return {
        term: weight
        for term, weight in rated_terms
        if weight > QUESTION_KEYWORD_THRESHOLD
    }
def two_term_interaction_freq(option_term, qestion_keyword, keyword_weight):
    """Score how strongly an option term and a question keyword co-occur.

    The score is the number of WIKI entries listed for both terms multiplied
    by the keyword's weight. A term that is missing from WIKI is treated as
    having no entries (score 0) instead of raising ``KeyError``, so the
    function does not depend on some earlier call having registered it.

    Arguments:
        option_term {str} -- term of the answer option
        qestion_keyword {str} -- keyword taken from the question
        keyword_weight {float} -- weight of that keyword

    Returns:
        number -- shared entry count * keyword_weight
    """
    option_entries = WIKI.get(option_term, ())
    keyword_entries = WIKI.get(qestion_keyword, ())
    interaction_freq = len(set(option_entries) & set(keyword_entries))
    score = interaction_freq * keyword_weight
    # Columns are padded with chr(12288), the full-width (ideographic) space.
    logging.info("{1:{0}<7} & {2:{0}<7} : {3:{0}<7} * {4:{0}<7} => {5:{0}<7}".format(
        chr(12288),
        option_term,
        qestion_keyword,
        interaction_freq,
        keyword_weight,
        score
        )
    )
    return score
def solve_method_two(splited_question_dict):
    """Score every option by its weighted co-occurrence with question keywords.

    Keywords are extracted from the "Question" terms; each option's score is
    the sum of ``two_term_interaction_freq`` over all keywords.

    Arguments:
        splited_question_dict {dict} -- question record with a "Question"
            key (list of terms) plus one key per option; the input itself is
            not modified

    Returns:
        dict -- option -> its share of the summed scores (every share is 0
            when all scores are 0)
    """
    options = splited_question_dict.copy()
    keywords = find_question_keywords(options.pop("Question"))

    scores = {}
    for option, option_term in options.items():
        subtotal = 0
        for keyword, keyword_weight in keywords.items():
            subtotal += two_term_interaction_freq(option_term, keyword, keyword_weight)
        scores[option] = subtotal

    total = sum(scores.values()) or 1
    return {option: scores[option] / total for option in options}

## Method 3: Human Take Control

* If the best answer's share is below `CONFIDENCE` in both Method 1 and Method 2 => hand the question over to a human
* If Method 1 and Method 2 give different answers => use Method 2's answer

In [8]:
CONFIDENCE = 0.5


def double_check(result):
    """Compare the answers of both methods and flag doubtful ones.

    Arguments:
        result {dict} -- {"1": scores of method 1, "2": scores of method 2},
            each mapping option -> share

    Returns:
        tuple -- (answer of method 2, warning or None). The warning is, in
            order of precedence: "No Data" when the better of the two top
            shares is 0, "Low Confidence" when it is below ``CONFIDENCE``,
            "Different Answer" when the two methods disagree.
    """
    method_1_scores = result["1"]
    method_1_ans = max(method_1_scores, key=method_1_scores.get)
    method_2_scores = result["2"]
    method_2_ans = max(method_2_scores, key=method_2_scores.get)

    best_share = max(method_1_scores[method_1_ans], method_2_scores[method_2_ans])
    if best_share == 0:
        reason = "No Data"
    elif best_share < CONFIDENCE:
        reason = "Low Confidence"
    elif method_1_ans != method_2_ans:
        reason = "Different Answer"
    else:
        reason = None

    error = None if reason is None else Warning(reason)
    return method_2_ans, error
def print_error(error, result, question_num=None):
    """Log a warning raised for a question together with both score tables.

    Nothing is logged unless ``error`` is a ``Warning`` instance.

    Arguments:
        error {Warning} -- warning to report (anything else is ignored)
        result {dict} -- {"1": scores of method 1, "2": scores of method 2}
        question_num {int} -- number of the question being reported. When
            omitted, the module-level ``question_num`` (e.g. the variable of
            the calling loop) is used if it exists, otherwise "?" is shown
            instead of failing with ``NameError``.
    """
    if not isinstance(error, Warning):
        return
    if question_num is None:
        # Backward compatibility: older callers rely on the global loop
        # variable instead of passing the number explicitly.
        question_num = globals().get("question_num", "?")
    logging.warning("Question {}: {}".format(question_num, str(error)))
    logging.warning("Method 1: {}".format(result["1"]))
    logging.warning("Method 2: {}".format(result["2"]))
def print_question(question):
    """Print a question followed by its options A, B and C, one per line.

    Arguments:
        question {dict} -- record with the keys "Question", "A", "B", "C"
    """
    print(question["Question"])
    # print("https://www.google.com/search?q={}".format(question["Question"]))
    for label in ("A", "B", "C"):
        print("{}. {}".format(label, question[label]))
def validate_final_results(final_results, length):
    """Check the answer list and print it as JSON when it is valid.

    Arguments:
        final_results {list} -- one answer per question
        length {int} -- expected number of answers

    Returns:
        bool -- True when every check passed (after printing an empty line
            and the JSON dump of ``final_results``)

    Raises:
        AssertionError -- "Wrong Length" when the list has the wrong size,
            "Invalid Answer" when any answer is not "A", "B" or "C" (each
            offender is logged as a warning first)
    """
    if len(final_results) != length:
        raise AssertionError("Wrong Length")

    allowed = ("A", "B", "C")
    offenders = [
        (question_num, ans)
        for question_num, ans in enumerate(final_results)
        if ans not in allowed
    ]
    for question_num, ans in offenders:
        logging.warning("Question {}: invalid answer => {}".format(question_num, ans))
    if offenders:
        raise AssertionError("Invalid Answer")

    print()
    print(json.dumps(final_results))
    return True

## Integrated Solve Function

In [9]:
def run_solve(question):
    """Run both solving methods on one question and cross-check them.

    Arguments:
        question {dict} -- question record whose "Question" value is the raw
            question text; the record itself is left untouched

    Returns:
        tuple -- (answer, error, result): the chosen answer and the warning
            (or None) from ``double_check``, plus the score tables of both
            methods under the keys "1" and "2"
    """
    # Work on a copy whose question text is replaced by its jieba terms.
    tokenized = question.copy()
    tokenized["Question"] = list(jieba.cut(tokenized["Question"], cut_all=False))

    # Method 1 is evaluated before method 2.
    result = {
        "1": solve_method_one(tokenized),
        "2": solve_method_two(tokenized),
    }

    answer, error = double_check(result)
    return answer, error, result

# Game On

In [10]:
import json
# Load the question set; the with-block closes the file on exit, so no
# explicit close() is needed.
with open('/content/drive/MyDrive/Stupid_QA_4_Question.json', 'r', encoding='UTF-8') as f:
    QUESTIONS = json.load(f)

In [None]:
# import json
# import requests as req


# question_url = "https://www.dropbox.com/s/kq678nfffzta5zg/20_Question.json?dl=1"  # enter the URL provided by the teacher
# QUESTIONS = req.get(question_url).json()

In [47]:
# Per-run bookkeeping:
#   RESULTS -- score tables of both methods, one entry per question
#   ERRORS  -- question number -> warning returned for it
#   HANDLE  -- question numbers that need a human decision
#   FINAL   -- chosen answer per question, in question order
RESULTS = [{} for _ in QUESTIONS]
ERRORS = {}
HANDLE = []
FINAL = []
for question_num, question in enumerate(QUESTIONS):
    answer, error, result = run_solve(question)
    RESULTS[question_num] = result
    FINAL.append(answer)
    if not error:
        continue
    ERRORS[question_num] = error
    logging.warning("Question {}: {}".format(question_num, str(error)))
    # Only these two warnings are queued for manual review.
    if str(error) in ("Low Confidence", "No Data"):
        HANDLE.append(question_num)



In [48]:
# Number of questions for which run_solve returned any warning.
len(ERRORS)

62

In [49]:
# Number of questions queued for manual review ("Low Confidence" / "No Data").
len(HANDLE)

18

## Final Check

In [50]:
# Validate the automatic answers (one per question, each A/B/C) and print them as JSON.
validate_final_results(FINAL, len(QUESTIONS))


["A", "C", "A", "B", "B", "C", "C", "A", "A", "B", "A", "B", "A", "C", "B", "A", "B", "A", "A", "A", "A", "C", "A", "B", "A", "A", "A", "C", "A", "B", "B", "A", "C", "B", "B", "C", "B", "B", "A", "C", "A", "C", "C", "B", "A", "A", "A", "C", "C", "C", "C", "B", "C", "A", "C", "A", "A", "A", "A", "A", "C", "B", "C", "B", "C", "C", "A", "B", "A", "C", "C", "A", "C", "A", "A", "B", "C", "C", "A", "A", "C", "C", "A", "B", "C", "C", "A", "C", "C", "B", "B", "B", "B", "A", "B", "B", "C", "B", "B", "B", "A", "B", "C", "B", "B", "A", "B", "C", "C", "C", "A", "A", "A", "B", "B", "A", "A", "A", "B", "B", "A", "B", "A", "A", "A", "B", "A", "A", "B", "C", "B", "B", "A", "C", "A", "B", "A", "A", "A", "C", "A", "C", "C", "B", "A", "B", "A", "C", "A", "A", "A", "B", "C", "C", "A", "B", "C", "B", "A", "B", "C", "B", "C", "A", "C", "A", "C", "C", "B", "A", "A", "C", "B", "C", "B", "A", "A", "A", "A", "C", "A", "B", "B", "B", "C", "B", "A", "B", "C", "B", "A", "A", "A", "C", "A", "C", "B", "B", "A", "C"

True

In [45]:
# Keep a JSON string snapshot of the current answers.
backup = json.dumps(FINAL)

In [46]:
# Show the snapshot, followed by an empty line.
print(backup)
print()

["A", "C", "A", "B", "B", "C", "C", "A", "A", "B", "A", "B", "A", "C", "B", "A", "B", "A", "A", "A", "A", "C", "A", "B", "A", "A", "A", "C", "A", "B", "B", "A", "C", "B", "B", "C", "B", "B", "A", "C", "A", "C", "C", "B", "A", "A", "A", "C", "C", "C", "C", "B", "C", "A", "C", "A", "A", "A", "A", "A", "C", "B", "C", "B", "C", "C", "A", "B", "A", "C", "C", "A", "C", "A", "A", "B", "C", "C", "A", "A", "C", "C", "A", "B", "C", "C", "A", "C", "C", "B", "B", "B", "B", "A", "B", "B", "C", "B", "B", "B", "A", "B", "C", "B", "B", "A", "B", "C", "C", "C", "A", "A", "A", "B", "B", "A", "A", "A", "B", "B", "A", "B", "A", "A", "A", "B", "A", "A", "B", "C", "B", "B", "A", "C", "A", "B", "A", "A", "A", "C", "A", "C", "C", "B", "A", "B", "A", "C", "A", "A", "A", "B", "C", "C", "A", "B", "C", "B", "A", "B", "C", "B", "C", "A", "C", "A", "C", "C", "B", "A", "A", "C", "B", "C", "B", "A", "A", "A", "A", "C", "A", "B", "B", "B", "C", "B", "A", "B", "C", "B", "A", "A", "A", "C", "A", "C", "B", "B", "A", "C"]

# Manual Fix

In [37]:
# Let a human review every question queued in HANDLE and overwrite its answer.
for question_num in HANDLE:
    print_error(ERRORS[question_num], RESULTS[question_num])
    print_question(QUESTIONS[question_num])
    print()
    print("Original Answer: {}".format(FINAL[question_num]))
    print()
    # Keep asking until the reply is one of the allowed options, so that a
    # typo or an empty reply cannot overwrite the stored answer with a value
    # that only fails later in validate_final_results.
    while True:
        manual_answer = input("Answer of {}: ".format(question_num)).strip().upper()
        if manual_answer in ("A", "B", "C"):
            break
        print("Please answer A, B or C.")
    FINAL[question_num] = manual_answer



內河碼頭站位於美國的哪一個城市?
A. 芝加哥
B. 舊金山
C. 西雅圖

Original Answer: B

Answer of 14: B




西非國家，位於非洲的幾內亞灣西岸頂點，鄰國包括西邊的貝南，北邊的尼日，東北方與查德接壤一小段國界，正東則是喀麥隆。是全非洲人口最多的國家，首都原本為西南沿海的海港城市拉哥斯，1991年12月遷都至地理位置位居全國國土正中央的阿布札。
A. 瑞士
B. 奈及利亞
C. 奧地利

Original Answer: A

Answer of 26: B




下列哪一個是中華民國國花?
A. 牡丹
B. 梅花
C. 蘭花

Original Answer: B

Answer of 30: A




胡盈禎的父親是誰?
A. 張菲
B. 胡瓜
C. 張小燕

Original Answer: B

Answer of 37: B




韓原之戰是由誰討伐晉國?
A. 秦穆公
B. 周襄王
C. 楚莊王

Original Answer: A

Answer of 40: A




明朝鄭和下西洋，最遠到哪裡?
A. 美洲
B. 澳洲
C. 非洲

Original Answer: C

Answer of 65: C




在2005年的中環盃中，崔哲瀚擊敗了哪一個人，奪得冠軍?
A. 李世乭
B. 金志錫
C. 羋昱廷

Original Answer: A

Answer of 82: C




下列哪個是瑞士的官方語言?
A. 英語
B. 中文
C. 德語

Original Answer: A

Answer of 126: C




是美國加利福尼亞州北部的一座都市，以其涼爽的夏季、多霧、綿延的丘陵地形、混合的建築風格，和金門大橋、纜車、惡魔島監獄及中國城等景點聞名。此外也是五大主要銀行和許多大型公司、機構的總部所在，包括GAP、太平洋瓦電公司、Yelp、Pinterest、Twitter、優步、Mozilla和Craigslist等。
A. 馬尼拉
B. 首爾
C. 舊金山

Original Answer: C

Answer of 133: C




《一代宗師》所講的是詠春高手葉問及一眾武術家的故事，請問片中宮二為何人所演?
A. 舒淇
B. 范冰冰
C. 章子怡

Original Answer: C

Answer of 195: C


# Final Check

In [38]:
# Re-validate the answers after the manual fixes and print them as JSON.
validate_final_results(FINAL, len(QUESTIONS))


["A", "C", "A", "B", "B", "C", "C", "A", "A", "B", "A", "B", "A", "C", "B", "A", "B", "A", "A", "A", "A", "C", "A", "B", "A", "A", "B", "C", "A", "B", "A", "A", "C", "B", "B", "C", "B", "B", "A", "C", "A", "C", "C", "B", "A", "A", "A", "C", "C", "C", "C", "B", "C", "A", "C", "A", "A", "A", "A", "A", "C", "B", "C", "B", "C", "C", "A", "B", "A", "C", "C", "A", "C", "A", "A", "B", "C", "C", "A", "A", "C", "C", "C", "B", "C", "C", "A", "C", "C", "B", "B", "B", "B", "A", "B", "B", "C", "B", "B", "B", "A", "B", "C", "B", "B", "A", "B", "C", "C", "C", "A", "A", "A", "B", "B", "A", "A", "A", "B", "B", "A", "B", "A", "A", "A", "B", "C", "A", "B", "C", "B", "B", "A", "C", "A", "B", "A", "A", "A", "C", "A", "C", "C", "B", "A", "B", "A", "C", "A", "A", "A", "B", "C", "C", "A", "B", "C", "B", "A", "B", "C", "B", "C", "A", "C", "A", "C", "C", "B", "A", "A", "C", "B", "C", "B", "A", "A", "A", "A", "C", "A", "B", "B", "B", "C", "B", "A", "B", "C", "B", "A", "A", "A", "C", "A", "C", "B", "B", "A", "C"

True

In [39]:
# Keep a JSON string snapshot of the manually corrected answers.
backup = json.dumps(FINAL)

In [40]:
# Show the snapshot, followed by an empty line.
print(backup)
print()

["A", "C", "A", "B", "B", "C", "C", "A", "A", "B", "A", "B", "A", "C", "B", "A", "B", "A", "A", "A", "A", "C", "A", "B", "A", "A", "B", "C", "A", "B", "A", "A", "C", "B", "B", "C", "B", "B", "A", "C", "A", "C", "C", "B", "A", "A", "A", "C", "C", "C", "C", "B", "C", "A", "C", "A", "A", "A", "A", "A", "C", "B", "C", "B", "C", "C", "A", "B", "A", "C", "C", "A", "C", "A", "A", "B", "C", "C", "A", "A", "C", "C", "C", "B", "C", "C", "A", "C", "C", "B", "B", "B", "B", "A", "B", "B", "C", "B", "B", "B", "A", "B", "C", "B", "B", "A", "B", "C", "C", "C", "A", "A", "A", "B", "B", "A", "A", "A", "B", "B", "A", "B", "A", "A", "A", "B", "C", "A", "B", "C", "B", "B", "A", "C", "A", "B", "A", "A", "A", "C", "A", "C", "C", "B", "A", "B", "A", "C", "A", "A", "A", "B", "C", "C", "A", "B", "C", "B", "A", "B", "C", "B", "C", "A", "C", "A", "C", "C", "B", "A", "A", "C", "B", "C", "B", "A", "A", "A", "A", "C", "A", "B", "B", "B", "C", "B", "A", "B", "C", "B", "A", "A", "A", "C", "A", "C", "B", "B", "A", "C"]