# Implementing IR Model

### Set up commonly used functions and constants

In [1]:
import pickle
import numpy as np
import os

def save_object(obj, filename):
    """Serialize ``obj`` to ``filename`` with pickle's highest protocol.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(filename, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_object(filename):
    """Return the object unpickled from ``filename``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    this program wrote itself.
    """
    with open(filename, 'rb') as source:
        return pickle.load(source)

# Text input read by prepare_model: each line is a sentence followed by its
# 300 space-separated vector components.
sentenceVectorFile = 'senVec.txt'
# Pickle cache of the parsed {sentence: vector} dictionary; prepare_model
# loads it when present and creates it otherwise.
sentenceDictFile = 'sen2vec.pkl'

### Define prepare_model function
This function checks whether the file named by the `sentenceDictFile` variable exists. If it does not, the function builds the dictionary from `sentenceVectorFile` and saves it to that file; otherwise it loads the dictionary from the file. The function returns three values. `sentenceDatabase` is a NumPy array of shape (number of sentences in the database, 300), in which each row is a sentence vector from precompiled fastText. `sen2vec` holds the same vectors as `sentenceDatabase`, but it is keyed by the sentence itself instead of by a numeric index. Lastly, `idx2sen` maps each row index to its sentence, bridging the gap between the two previously mentioned variables.

In [2]:
_VECTOR_DIM = 300  # number of components in each sentence vector


def _read_sentence_vectors(path):
    """Parse the text vector file at ``path`` into ``{sentence: [float, ...]}``.

    Each non-blank line must end with ``_VECTOR_DIM`` space-separated numbers;
    the tokens before them are concatenated (no separator) to form the key.

    Raises:
        ValueError: if a line has fewer than ``_VECTOR_DIM`` tokens or a
            vector component is not a number.
    """
    sen2vec = {}
    # Explicit UTF-8: the sentences are non-ASCII text and the platform
    # default encoding is not UTF-8 everywhere.
    with open(path, 'r', encoding='utf-8') as handle:
        for lineNo, line in enumerate(handle, start=1):
            stripped = line.strip()
            if not stripped:
                continue  # tolerate blank lines, e.g. a trailing newline
            tokens = stripped.split(' ')
            if len(tokens) < _VECTOR_DIM:
                raise ValueError(
                    "%s:%d: expected at least %d tokens, found %d"
                    % (path, lineNo, _VECTOR_DIM, len(tokens)))
            try:
                vector = [float(x) for x in tokens[-_VECTOR_DIM:]]
            except ValueError as err:
                raise ValueError(
                    "%s:%d: non-numeric vector component"
                    % (path, lineNo)) from err
            # Words are joined without a separator, as in the original code.
            sentence = "".join(tokens[:-_VECTOR_DIM])
            sen2vec[sentence] = vector
    return sen2vec


def _index_sentences(sen2vec):
    """Return ``(sentenceDatabase, idx2sen)`` for ``sen2vec``.

    Row ``i`` of the matrix holds the vector of ``idx2sen[i]``; rows follow
    the dictionary's iteration order.
    """
    idx2sen = dict(enumerate(sen2vec))
    sentenceDatabase = np.zeros((len(sen2vec), _VECTOR_DIM))
    for idx, sen in idx2sen.items():
        sentenceDatabase[idx] = sen2vec[sen][:_VECTOR_DIM]
    return sentenceDatabase, idx2sen


def prepare_model():
    """Load (or build and cache) the sentence vectors and index them.

    If ``sentenceDictFile`` exists, the pickled dictionary is loaded from it.
    Otherwise ``sentenceVectorFile`` is parsed and the result is pickled to
    ``sentenceDictFile``. Parsing validates every line first, so a malformed
    input file is never written to the cache.

    Returns:
        tuple: ``(sentenceDatabase, idx2sen, sen2vec)`` where
        ``sentenceDatabase`` is a ``(len(sen2vec), 300)`` float array,
        ``idx2sen`` maps a row index to its sentence, and ``sen2vec`` maps a
        sentence to its vector.

    Raises:
        ValueError: if the vector file contains a malformed line.
    """
    if os.path.isfile(sentenceDictFile):
        # The cache is a pickle written by this program; do not point
        # sentenceDictFile at untrusted files.
        sen2vec = load_object(sentenceDictFile)
    else:
        sen2vec = _read_sentence_vectors(sentenceVectorFile)
        save_object(sen2vec, sentenceDictFile)
    sentenceDatabase, idx2sen = _index_sentences(sen2vec)
    return sentenceDatabase, idx2sen, sen2vec

### Define talkVec function
This function compares inputSentenceVector (expected to be a NumPy array of shape (300,)) with every row of sentenceDatabase using cosine similarity, and outputs the closest sentence in the database.

In [3]:
def talkVec(sentenceDatabase, idx2sen, sen2vec, inputSentenceVector):
    """Return the database sentence most similar to ``inputSentenceVector``.

    Similarity is cosine similarity: the dot product divided by the product
    of the Euclidean (L2) norms. Rows whose norm is zero, and every row when
    the query norm is zero, get a score of ``-inf`` instead of NaN so they
    cannot win the argmax by accident.

    Args:
        sentenceDatabase: 2-D array with one sentence vector per row.
        idx2sen: mapping from row index to sentence.
        sen2vec: unused; kept so existing callers keep working.
        inputSentenceVector: 1-D query vector whose length equals the number
            of columns of ``sentenceDatabase``.

    Returns:
        The sentence stored in ``idx2sen`` for the best-scoring row (the
        first such row on ties).

    Raises:
        ValueError: if ``sentenceDatabase`` has no rows.
    """
    query = np.asarray(inputSentenceVector, dtype=float)
    dots = sentenceDatabase.dot(query)
    # L2 norms make this a true cosine; L1 norms rank rows incorrectly.
    denom = np.linalg.norm(sentenceDatabase, axis=1) * np.linalg.norm(query)
    similarity = np.full(dots.shape, -np.inf)
    np.divide(dots, denom, out=similarity, where=denom > 0)
    # No division by the sum of scores: it cannot improve the argmax, and a
    # negative sum would invert the ranking.
    outIdx = int(np.argmax(similarity))
    print(outIdx)
    return idx2sen[outIdx]

### Define main function
As of right now, this is for testing only.

In [4]:
def main():
    """Smoke test: look up sentence number 10 and query the model with its own vector.

    Prints the chosen sentence, then the sentence returned by ``talkVec``.
    """
    database, index_to_sentence, vectors = prepare_model()
    probe = 10
    probe_sentence = index_to_sentence[probe]
    print(probe_sentence)
    query = np.array(vectors[probe_sentence])
    print(talkVec(database, index_to_sentence, vectors, query))

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

วันนี้ว่างหมาย555แต่อยากชวนมาบ้านสเตลล่าชวนทำเค้ก555
10
วันนี้ว่างหมาย555แต่อยากชวนมาบ้านสเตลล่าชวนทำเค้ก555
