In [1]:
import fasttext
import fasttext.util
import json
# fasttext.util.download_model('tr', if_exists='ignore')

In [23]:
import numpy as np
from numpy.linalg import norm
from scipy import spatial

In [3]:
# Similarity cut-off constant. No cell visible in this notebook reads it;
# presumably it is meant as the minimum cosine similarity for accepting a
# matched question — TODO confirm and wire it into the lookup, or remove it.
SIM_THRESHOLD = 0.7

In [36]:
def cos_sim(q_vector, vector):
    """Return the cosine similarity between two vectors.

    Args:
        q_vector: First vector (the query side at the call site).
        vector: Second vector, of the same length as ``q_vector``.

    Returns:
        ``dot(q_vector, vector) / (norm(q_vector) * norm(vector))``, or ``0.0``
        when either vector has zero norm.
    """
    denominator = norm(q_vector) * norm(vector)
    # A zero-norm vector has no direction; dividing would produce NaN together
    # with a RuntimeWarning, and NaN never compares greater than any score.
    if denominator == 0:
        return 0.0
    return np.dot(q_vector, vector) / denominator

In [31]:
# tf idf utility functions

# Preprocessor for question strings: turns a string into a list of words.
# Can be replaced with a better one (zemberek).
def preProcessor(sentence):
    """Split ``sentence`` on runs of whitespace and return the list of words."""
    tokens = sentence.split()
    return tokens

# Takes the questions dictionary and returns a dictionary of dictionaries:
# each sub-dictionary maps a term to its number of occurrences.
# This version treats all questions in a category as one whole document.
def TF(questions):
    """Count term occurrences per category.

    Args:
        questions: Mapping from category to an iterable of question strings.

    Returns:
        ``{category: {term: count}}`` where the counts are accumulated over
        every question of the category, tokenized with ``preProcessor``.
    """
    counts_by_category = {}
    for category, category_questions in questions.items():
        counts = {}
        for text in category_questions:
            for token in preProcessor(text):
                counts[token] = counts.get(token, 0) + 1
        counts_by_category[category] = counts
    return counts_by_category

def normalizedTF(questions):
    """Scale each category's term counts by that category's vocabulary size.

    The divisor is the number of *distinct* terms in the category (the size of
    the count dictionary produced by ``TF``), not the total number of tokens.
    NOTE(review): conventional normalized TF divides by the total token count;
    confirm that the distinct-term divisor is intended.

    Args:
        questions: Mapping from category to an iterable of question strings.

    Returns:
        ``{category: {term: count / number_of_distinct_terms}}``.
    """
    normalized = {}
    for category, counts in TF(questions).items():
        vocabulary_size = len(counts)
        normalized[category] = {
            term: count / vocabulary_size for term, count in counts.items()
        }
    return normalized

# Inverse document frequency of a term; every category counts as one document.
def IDF(term, questions):
    """Return ``1 + ln(N / df)`` for ``term``.

    ``N`` is the number of categories in ``questions`` and ``df`` is the number
    of categories whose questions contain ``term``.

    Returns:
        The IDF weight, or ``0`` when no category contains the term.
    """
    term_counts = TF(questions)
    document_frequency = sum(
        1 for counts in term_counts.values() if term in counts
    )
    if document_frequency == 0:
        return 0
    total_documents = len(questions)
    return 1 + np.log(total_documents / document_frequency)


def TFxIDF(terms_string, questions):
    """Compute the TF-IDF weight of each query term for every category.

    Args:
        terms_string: Query text; it is tokenized with ``preProcessor``.
        questions: Mapping from category to an iterable of question strings.

    Returns:
        ``{category: {term: weight}}`` where ``weight`` is the category's
        normalized term frequency times the term's IDF, or ``0`` when the
        term does not occur in that category.
    """
    normal_tf = normalizedTF(questions)
    terms = preProcessor(terms_string)

    # IDF depends only on the term and the corpus, never on the category, so
    # compute it once per distinct term instead of once per (category, term).
    idf_by_term = {term: IDF(term, questions) for term in dict.fromkeys(terms)}

    result = {}
    for category, category_tf in normal_tf.items():
        scores = {}
        for term in terms:
            if term in category_tf:
                scores[term] = category_tf[term] * idf_by_term[term]
            else:
                scores[term] = 0
        result[category] = scores
    return result

In [12]:
# Load the fastText binary model from the working directory. The file name
# presumably corresponds to the Turkish model that the commented-out
# download_model('tr') call in the first cell would fetch — TODO confirm.
ft = fasttext.load_model(r'cc.tr.300.bin')

In [35]:
# Load the category -> questions mapping. The context manager closes the file
# handle as soon as parsing finishes, and the explicit UTF-8 encoding keeps the
# Turkish characters intact regardless of the platform's default encoding.
with open("./question_categories.json", encoding="utf-8") as f:
    questions = json.load(f)

# Sanity check: TF-IDF weights of a sample query against every category.
print(TFxIDF("wifi şifremi unuttum", questions))

{'sifre': {'wifi': 0, 'şifremi': 0.3147918433002165, 'unuttum': 0.10493061443340551}, 'lisanslı yazılımlar': {'wifi': 0, 'şifremi': 0, 'unuttum': 0}, 'wifi': {'wifi': 0.32286342902586307, 'şifremi': 0, 'unuttum': 0}}


In [6]:
# Embed every known question with the fastText model, grouped by category and
# kept in the same order as the questions themselves.
question_vectors = {
    category: [ft.get_sentence_vector(text) for text in category_questions]
    for category, category_questions in questions.items()
}

In [7]:
# Read the user's question and embed it with the fastText model.
user_question = input("Nasıl yardımcı olabilirim?")
q_vector = ft.get_sentence_vector(user_question)

# Best match found so far. A candidate is selected only when its similarity is
# strictly greater than the current maximum, so the first of equal scores wins
# and nothing is selected unless some similarity exceeds 0.
max_similarity = 0
category_max_similarity = 0  # assigned here but not read anywhere in this cell
most_similar_category = ""
most_similar_question = ""

for category, vectors in question_vectors.items():
    for candidate, candidate_vector in zip(questions[category], vectors):
        similarity = cos_sim(q_vector, candidate_vector)
        if similarity > max_similarity:
            max_similarity, most_similar_question, most_similar_category = (
                similarity,
                candidate,
                category,
            )

print("Soru kategorisi: %s" % most_similar_category)
print("En yakın soru: %s" % most_similar_question)

Soru kategorisi: wifi
En yakın soru: eduroam şifresi neydi
