In [256]:
import fasttext
import fasttext.util
import json
import nltk
from nltk.tokenize import RegexpTokenizer
from snowballstemmer import TurkishStemmer
import numpy as np
import random
from numpy.linalg import norm
# fasttext.util.download_model('tr', if_exists='ignore')

In [257]:
def cos_sim(q_vector, vector):
    """Return the cosine similarity of two vectors.

    Args:
        q_vector: First vector (anything ``np.dot`` and ``norm`` accept).
        vector: Second vector, same length as ``q_vector``.

    Returns:
        ``dot(q_vector, vector) / (norm(q_vector) * norm(vector))``.
        When either vector has zero length the ratio is undefined; ``0.0`` is
        returned instead of letting the division produce ``nan`` together
        with a NumPy runtime warning.
    """
    denominator = norm(q_vector) * norm(vector)
    if denominator == 0:
        return 0.0
    return np.dot(q_vector, vector) / denominator

In [258]:
# Turkish fastText model, loaded from a .bin file in the current working directory.
ft = fasttext.load_model(r'cc.tr.300.bin')
# Words dropped by preProcessor before stemming.
STOPWORD_LIST = nltk.corpus.stopwords.words('turkish')
# Words preProcessor keeps verbatim instead of passing them to the stemmer.
CUSTOMWORD_LIST = ["wifi", "section", "metu", "office"] # custom word list

# Each JSON file is read inside a `with` block so its handle is closed right
# after parsing; the encoding is explicit so the Turkish text does not depend
# on the platform's default locale.
# ANSWERS is indexed by category name in getAnswer.
with open("./answers.json", encoding="utf-8") as f:
    ANSWERS = json.load(f)

# QUESTIONS maps a category name to the list of its example questions.
with open("./question_categories.json", encoding="utf-8") as f:
    QUESTIONS = json.load(f)

# Filled lazily by getAnswer on its first call.
QUESTION_VECTORS= {}

# NOTE(review): machine-specific absolute path; this only runs on the original
# author's machine. Consider a path relative to the notebook or a configurable
# data directory.
with open("/Users/ilbey/Documents/ceng/ceng49x/metubot/Elasticsearch/qa_pairs.json", encoding="utf-8") as f:
    QA = json.load(f)["qa-pairs"]



In [259]:
# Preprocessor for question strings: transforms a sentence into a list of
# normalized word tokens.
# TODO: a better alternative than snowball stemmer should be used.
def preProcessor(sentence):
    """Lower-case, tokenize, filter and stem a sentence.

    Args:
        sentence: Raw question text.

    Returns:
        A list of tokens in their original order. Tokens found in
        ``STOPWORD_LIST`` are dropped, tokens found in ``CUSTOMWORD_LIST`` are
        kept unchanged, and every other token is replaced by its
        ``TurkishStemmer`` stem.
    """
    # str.lower() maps "İ" (U+0130) to "i" followed by U+0307 (combining dot
    # above). The combining mark is not matched by \w, so the tokenizer would
    # split the word there ("İptal" -> "i", "ptal"). Collapse the pair to "i".
    # Plain "I" still lower-cases to "i" (not "ı") so that upper-case custom
    # words such as "WIFI" keep matching CUSTOMWORD_LIST.
    sentence = sentence.lower().replace("i\u0307", "i")
    tokenizer = RegexpTokenizer(r'\w+') # removes all punctuation
    words = tokenizer.tokenize(sentence)

    ts = TurkishStemmer()
    result = []
    for word in words:
        if word in STOPWORD_LIST:
            continue
        if word in CUSTOMWORD_LIST:
            result.append(word)
        else:
            result.append(ts.stemWord(word))
    return result


In [260]:
# Takes the questions dictionary and returns a dictionary of dictionaries:
# one sub-dictionary per category, mapping each term to the number of
# times it occurs.
# All questions of a category are treated together as a single document.
def TF(questions):
    """Count term occurrences per category.

    Args:
        questions: Mapping of category name to a list of question strings.

    Returns:
        ``{category: {term: count}}`` where the terms come from running
        ``preProcessor`` over every question of that category.
    """
    counts_by_category = {}
    for category, category_questions in questions.items():
        term_counts = {}
        for question in category_questions:
            for term in preProcessor(question):
                term_counts[term] = term_counts.get(term, 0) + 1
        counts_by_category[category] = term_counts
    return counts_by_category


def normalizedTF(questions):
    """Scale the per-category term counts produced by ``TF``.

    Args:
        questions: Mapping of category name to a list of question strings.

    Returns:
        ``{category: {term: count / n}}`` where ``n`` is the number of
        distinct terms recorded for that category.
    """
    # NOTE(review): the divisor is the number of *distinct* terms in the
    # category, not the total number of tokens - confirm this is intended.
    raw_counts = TF(questions)
    return {
        category: {
            term: count / len(term_counts)
            for term, count in term_counts.items()
        }
        for category, term_counts in raw_counts.items()
    }

# Calculates the inverse document frequency of a term; every category
# counts as one document.
def IDF(term, questions):
    """Return the inverse document frequency of ``term``.

    Args:
        term: An already preprocessed term.
        questions: Mapping of category name to a list of question strings.

    Returns:
        ``0`` when no category contains the term, otherwise
        ``1 + log(number of categories / number of categories containing term)``.
    """
    total_categories = len(questions)
    counts_by_category = TF(questions)
    matching_categories = sum(
        1 for category in counts_by_category
        if term in counts_by_category[category]
    )
    if matching_categories == 0:
        return 0
    return 1 + np.log(total_categories / matching_categories)


def TFxIDF(terms_string, questions):
    """Score every query term against every category with TF x IDF.

    Args:
        terms_string: Raw query text; it is run through ``preProcessor``.
        questions: Mapping of category name to a list of question strings.

    Returns:
        ``{category: {term: score}}`` with one entry per distinct query term.
        The score is the category's normalized term frequency multiplied by
        the term's IDF, or ``0`` when the category does not contain the term.
    """
    normal_tf = normalizedTF(questions)
    terms = preProcessor(terms_string)

    # IDF depends only on the term, never on the category, and each IDF call
    # rebuilds the term counts for the whole corpus. Evaluate it once per
    # distinct term instead of once per (category, term) pair.
    idf_by_term = {}
    for term in terms:
        if term not in idf_by_term:
            idf_by_term[term] = IDF(term, questions)

    result = {}
    for category, category_tf in normal_tf.items():
        scores = {}
        for term in terms:
            if term in category_tf:
                scores[term] = category_tf[term] * idf_by_term[term]
            else:
                scores[term] = 0
        result[category] = scores
    return result

# Returns the most likely category; used as a heuristic next to the
# fastText classifier.
def categoryHeuristic(query, questions):
    """Pick the category holding the single highest TF x IDF term score.

    Args:
        query: Raw query text.
        questions: Mapping of category name to a list of question strings.

    Returns:
        The name of the category whose best-scoring query term is the highest
        overall (the first such category on ties), or ``""`` when ``TFxIDF``
        produced no term scores at all.
    """
    best_category, best_score = "", -1
    for category, term_scores in TFxIDF(query, questions).items():
        for score in term_scores.values():
            # strict comparison: the earliest maximum wins
            if score > best_score:
                best_category, best_score = category, score
    return best_category


In [261]:
def getQuestionVectors(ft, raw_questions):
    """Compute a sentence vector for every question of every category.

    Args:
        ft: Object exposing ``get_sentence_vector(text)``.
        raw_questions: Mapping of category name to a list of question strings.

    Returns:
        ``{category: [vector, ...]}`` with the vectors in the same order as
        the questions of that category.
    """
    return {
        category: [ft.get_sentence_vector(text) for text in texts]
        for category, texts in raw_questions.items()
    }

# adds a new field 'q_vectors' 
# that stores sentence vectors of each question.
# IMPORTANT: assign the result to QA constant array  
def NEWgetQuestionVectors(ft, questions_answers):
    for qa_pair in questions_answers:
        if "q_vectors" not in qa_pair:
            qa_pair["q_vectors"] = []
        for q in range(len(qa_pair["question"])):
            qa_pair["q_vectors"].append(ft.get_sentence_vector(qa_pair["question"][q]))
    return questions_answers  

In [262]:
def _closestQuestion(q_vector, categories, question_vectors, raw_questions):
    """Return (category, question) of the stored question closest to q_vector within the given categories."""
    best_category = ""
    best_question = ""
    best_similarity = 0
    for category in categories:
        for vector, question in zip(question_vectors[category], raw_questions[category]):
            sim = cos_sim(q_vector, vector)
            if sim > best_similarity:
                best_similarity = sim
                best_question = question
                best_category = category
    return best_category, best_question


def questionClassifier(ft, user_question, question_vectors, raw_questions):
    """Find the stored question (and its category) closest to a user question.

    The closest question by cosine similarity over all categories is compared
    with the category proposed by ``categoryHeuristic``. When the two disagree,
    the heuristic's category wins and the closest question is searched again
    inside that category only.

    Args:
        ft: Object exposing ``get_sentence_vector(text)``.
        user_question: Raw question text typed by the user.
        question_vectors: Mapping of category name to a list of vectors.
        raw_questions: Mapping of category name to a list of question
            strings, aligned with ``question_vectors``.

    Returns:
        ``(category, question)``. Either element is ``""`` when nothing
        scored a similarity above 0.
    """
    q_vector = ft.get_sentence_vector(user_question)
    tfidf_category = categoryHeuristic(user_question, raw_questions)

    most_similar_category, most_similar_question = _closestQuestion(
        q_vector, question_vectors, question_vectors, raw_questions
    )

    # categoryHeuristic returns "" when the query yields no scorable terms
    # (e.g. only stop words). Indexing question_vectors with such a value
    # raised KeyError before; in that case keep the similarity-based result.
    heuristic_usable = tfidf_category in question_vectors

    if heuristic_usable and most_similar_category != tfidf_category:
        # Console messages: "possible misunderstanding", followed by the
        # TF.IDF guess and the FastText guess.
        print("--MUHTEMEL YANLIS ANLAMALAR SOZ KONUSU--")
        print("TF.IDF kategori tahmini: %s" % tfidf_category)
        print("FastText kategori tahmini: %s" % most_similar_category)
        # search for the question under the tfidf heuristic category
        most_similar_category = tfidf_category
        _, most_similar_question = _closestQuestion(
            q_vector, [tfidf_category], question_vectors, raw_questions
        )
    else:
        print("Soru kategorisi: %s" % most_similar_category)
        print("En yakin soru: %s" % most_similar_question)

    return (most_similar_category, most_similar_question)

def NEWquestionClassifier(ft, user_question, questions_answers):
    """Find the QA pair whose stored question is closest to a user question.

    Args:
        ft: Object exposing ``get_sentence_vector(text)``.
        user_question: Raw question text typed by the user.
        questions_answers: List of dicts with ``"question"``, ``"category"``
            and ``"q_vectors"`` entries; ``"q_vectors"`` is aligned with
            ``"question"``.

    Returns:
        ``(category, question, index)`` of the best match, where ``index`` is
        the position of the pair in ``questions_answers``. When no question
        scores a similarity above 0 the result is ``("", "", 0)``.
    """
    q_vector = ft.get_sentence_vector(user_question)

    most_similar_question = ""
    most_similar_category = ""
    most_similar_indice = 0

    max_similarity = 0

    # enumerate() supplies the pair's position directly. The previous
    # questions_answers.index(qa) did a linear deep-equality scan on every
    # improvement and returned the first *equal* pair, not the current one.
    for pair_index, qa in enumerate(questions_answers):
        # zip() stops at the shorter list, so surplus vectors with no
        # matching question text are never scored.
        for question, vector in zip(qa["question"], qa["q_vectors"]):
            sim = cos_sim(q_vector, vector)
            if sim > max_similarity:
                max_similarity = sim
                most_similar_question = question
                most_similar_category = qa["category"]
                most_similar_indice = pair_index

    print("Benzerlik skoru: " + str(max_similarity))
    print("Soru kategorisi: %s" % most_similar_category)
    print("En yakin soru: %s" % most_similar_question)

    return most_similar_category, most_similar_question, most_similar_indice

In [263]:
def getAnswer(ft, user_question):
    """Answer a user question from the category-based data.

    Args:
        ft: Object exposing ``get_sentence_vector(text)``.
        user_question: Raw question text typed by the user.

    Returns:
        The ``ANSWERS`` entry of the category chosen by ``questionClassifier``.
    """
    global QUESTION_VECTORS

    # Build the question vectors on first use and cache them module-wide.
    if not QUESTION_VECTORS:
        QUESTION_VECTORS = getQuestionVectors(ft, QUESTIONS)

    category, _ = questionClassifier(
        ft,
        user_question,
        question_vectors=QUESTION_VECTORS,
        raw_questions=QUESTIONS,
    )
    return ANSWERS[category]

def NEWgetAnswer(ft, user_question):
    """Answer a user question from the QA-pair data.

    Args:
        ft: Object exposing ``get_sentence_vector(text)``.
        user_question: Raw question text typed by the user.

    Returns:
        One randomly chosen entry of the ``"answer"`` list of the QA pair
        selected by ``NEWquestionClassifier``.
    """
    global QA

    # Make sure every pair carries its question vectors before classifying.
    QA = NEWgetQuestionVectors(ft, QA)
    _, _, best_index = NEWquestionClassifier(ft, user_question, questions_answers=QA)
    return random.choice(QA[best_index]["answer"])


In [264]:

# Demo: run one sample query through the QA-pair pipeline and print the
# reply prefixed with "cevap: " ("answer: ").
print("cevap: " + NEWgetAnswer(ft, user_question="kartımı bulamıyorum"))


Benzerlik skoru: 0.5836551
Soru kategorisi: kimlik kart
En yakin soru: Akıllı kimlik kartımı kaybettim/çalındı, ne yapmalıyım?
cevap: Akıllı kimlik kartınızı kaybettiğinizi/çalındığını fark ettiğinizde, cardinfo.metu.edu.tr adresinden kartınızı iptal etmelisiniz. Bu durumda akıllı kimlik kartınız sistem tarafından “iptal” edilir.
