# Implementing IR Model

### Set up commonly used functions and constants

In [1]:
import pickle
import numpy as np
import os
import subprocess
import time 

def save_object(obj, filename):
    """Pickle ``obj`` into the file ``filename``.

    The file is opened in binary write mode, so existing content is
    replaced. The highest pickle protocol available is used.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_object(filename):
    """Return the object unpickled from the file ``filename``.

    NOTE: unpickling can run arbitrary code, so only load files that this
    project wrote itself.
    """
    with open(filename, mode='rb') as source:
        return pickle.load(source)

# File names shared by the cells below (all relative to the working directory).
# fastText output written by getSentenceVector(file=...) and parsed by prepare_model().
sentenceVectorFile = 'senVec.txt'
# Pickled {sentence: vector} dict that prepare_model() writes once and reloads afterwards.
sentenceDictFile = 'sen2vec.pkl'
# Input corpus with '|' between tokens; only the odd-indexed (0-based) lines are used.
tokenizeFile = 'tokenized_out.txt'
# Space-separated sentences written by prepare_memory() and fed to fastText.
sentencesFile = 'sentences.txt'

### Prepare memory

In [2]:
def getSentenceVector(file = None, text = None, Format = True, command = None):
    """Run fastText ``print-sentence-vectors`` on a string or on a file.

    Exactly one of ``text`` / ``file`` is expected; ``text`` wins when both
    are given.

    Args:
        file: Path of a text file piped to the command's stdin. The
            command's stdout is written to ``sentenceVectorFile``
            (overwriting it) and nothing is returned.
        text: String sent (UTF-8 encoded) to the command's stdin.
        Format: Unused; kept so existing callers keep working.
        command: Optional argument list to run instead of the default
            fastText binary and model under ``/data2``.

    Returns:
        For ``text``: a float ``numpy`` array with one row per output line.
        For ``file`` (or when neither is given): ``None``.

    The command's exit status is not checked, matching the previous
    behaviour.
    """
    if command is not None:
        args = list(command)
    else:
        args = ["/data2/fasttext/fasttext", "print-sentence-vectors", "/data2/cc.th.300.bin"]

    if text is not None:
        completed = subprocess.run(args, input=text.encode('utf8'), stdout=subprocess.PIPE)
        lines = completed.stdout.decode('utf8').split('\n')[:-1]
        # Each line is split on single spaces and the final piece is dropped;
        # this assumes every line ends with a trailing space (presumably
        # fastText's output format -- confirm against the binary in use).
        # ``float`` replaces ``np.float``, which was only an alias for the
        # builtin and no longer exists in NumPy >= 1.24.
        return np.array([line.split(' ')[:-1] for line in lines], dtype=float)

    if file is not None:
        # Binary mode: the handles are only passed to the child process, and
        # the context manager closes them even if launching the command fails.
        with open(file, 'rb') as source, open(sentenceVectorFile, 'wb') as sink:
            subprocess.run(args, stdin=source, stdout=sink)

    return None

In [3]:
from pythainlp.tokenize import word_tokenize

def sentenceTokenize(inputSentence):
    """Return ``inputSentence`` as space-separated tokens.

    The sentence is tokenized with ``word_tokenize`` using its default
    engine, then every resulting token is tokenized again with the
    ``'newmm'`` engine; the pieces are joined with single spaces.
    """
    pieces = []
    for coarse in word_tokenize(inputSentence):
        # Second pass over each first-pass token.
        pieces.extend(word_tokenize(coarse, engine='newmm'))
    return " ".join(pieces)

In [4]:
def prepare_memory():
    """Write the sentence file and run fastText over it.

    Reads ``tokenizeFile``, replaces the '|' token separators with spaces,
    writes every odd-indexed (0-based) line to ``sentencesFile`` and then
    calls ``getSentenceVector`` on that file.
    """
    with open(tokenizeFile, 'r') as source:
        spaced = [" ".join(raw.strip().split('|')) for raw in source]

    with open(sentencesFile, 'w') as sink:
        # Lines 1, 3, 5, ... are the ones kept.
        for sen in spaced[1::2]:
            sink.write(sen + "\n")

    getSentenceVector(file = sentencesFile)

In [5]:
%%time
# Rewrites the sentence file and the fastText vector file; the call blocks
# until the fastText subprocess started by getSentenceVector() has finished.
prepare_memory()

CPU times: user 12.8 ms, sys: 8.31 ms, total: 21.1 ms
Wall time: 10.1 s


### Define prepare_model function
This function checks whether the file named by the `sentenceDictFile` variable exists. If it does not, the function builds the dictionary from the fastText output and saves it to that file; otherwise it loads the dictionary from the file. The function returns three values. `sentenceDatabase` is a NumPy array with one row per sentence in the database and 300 columns; each row is a sentence vector from the precompiled fastText. `sen2vec` holds the same vectors as `sentenceDatabase`, but it is indexed by the sentence itself instead of by a number. Lastly, `idx2sen` bridges the gap between the two by mapping each row index to its sentence.

In [6]:
def prepare_model():
    """Load (or build and cache) the sentence vectors and their indexes.

    If ``sentenceDictFile`` exists it is unpickled. Otherwise the
    odd-indexed (0-based) lines of ``tokenizeFile`` are paired, line by
    line, with the vectors in ``sentenceVectorFile`` and the resulting dict
    is pickled to ``sentenceDictFile``. ``tokenizeFile`` is only read when
    the cache has to be built.

    Returns:
        tuple ``(sentenceDatabase, idx2sen, sen2vec)`` where
        ``sen2vec`` maps sentence -> list of 300 floats,
        ``idx2sen`` maps row index -> sentence (dict iteration order), and
        ``sentenceDatabase`` is a ``(len(sen2vec), 300)`` float array whose
        row ``i`` is the vector of ``idx2sen[i]``.

    Raises:
        IndexError: if ``sentenceVectorFile`` has more lines than there are
            sentences.
        ValueError: if a vector field is not a valid float.

    Note: identical sentences share one dict key, so a repeated sentence
    keeps its first position and its last vector; row indexes therefore do
    not necessarily match line positions in the input files.
    """
    if os.path.isfile(sentenceDictFile):
        sen2vec = load_object(sentenceDictFile)
    else:
        with open(tokenizeFile, 'r') as file:
            # Keys are the sentences with the '|' separators removed.
            sentencesTokenized = [
                "".join(line.strip().split("|"))
                for line in file.read().splitlines()[1::2]
            ]

        sen2vec = {}
        with open(sentenceVectorFile, 'r') as file:
            for idx, line in enumerate(file):
                # Only the last 300 fields are taken as the vector, so any
                # leading text on the line is ignored.
                vector = line.strip().split(' ')[-300:]
                sen2vec[sentencesTokenized[idx]] = [float(x) for x in vector]
        save_object(sen2vec, sentenceDictFile)

    idx2sen = dict(enumerate(sen2vec))
    sentenceDatabase = np.zeros((len(sen2vec), 300))
    for row, sentence in enumerate(sen2vec):
        # One row assignment replaces the former 300-iteration inner loop.
        sentenceDatabase[row] = sen2vec[sentence][:300]
    return sentenceDatabase, idx2sen, sen2vec

In [7]:
# Sanity check: print every line of /data2/test.txt with its 0-based index
# (splitlines() drops the line terminators).
with open('/data2/test.txt') as f:
    for i,e in enumerate(f.read().splitlines()):
        print(i,e)

0 สวัสดี
1 ครับ


### Define talkVec function
This function compares inputSentenceVector (expected to be a NumPy array of shape (300,)) with every row of sentenceDatabase using cosine similarity, and outputs the closest sentence in the database.

In [8]:
def talkVec(sentenceDatabase, idx2sen, sen2vec, inputSentenceVector):
    """Return the database sentence most similar to the input vector.

    Similarity is the cosine of the angle between the input vector and
    each row of ``sentenceDatabase`` (dot product divided by the product of
    the Euclidean norms).

    Args:
        sentenceDatabase: 2-D ``numpy`` array, one sentence vector per row.
        idx2sen: Mapping from row index to sentence.
        sen2vec: Unused; kept for interface compatibility.
        inputSentenceVector: 1-D vector with as many entries as
            ``sentenceDatabase`` has columns.

    Returns:
        ``idx2sen[i]`` for the row ``i`` with the highest cosine similarity
        (the first such row on ties).

    Raises:
        ValueError: if ``sentenceDatabase`` has no rows.
    """
    query = np.asarray(inputSentenceVector, dtype=float)
    dots = sentenceDatabase.dot(query)
    # Euclidean norms: the previous ord=1 norms did not give cosine
    # similarity and could rank a different vector above an exact match.
    denominators = np.linalg.norm(sentenceDatabase, axis=1) * np.linalg.norm(query)

    # Rows with an undefined cosine (zero row or zero query) score -inf so
    # they never beat a real match and no NaN reaches argmax.
    similarities = np.full(denominators.shape, -np.inf)
    np.divide(dots, denominators, out=similarities, where=denominators > 0)

    # The scores are deliberately not divided by their sum: that cannot
    # improve the ranking and reverses it whenever the sum is negative.
    return idx2sen[int(np.argmax(similarities))]

### Define main function
As of right now, this is for testing only.

In [11]:
%%time
def main():
    """Interactive test loop for the retrieval model.

    Rebuilds the sentence and vector files with ``prepare_memory``, loads
    the model with ``prepare_model``, then repeatedly reads a line from
    stdin, tokenizes it, embeds it with ``getSentenceVector`` (which starts
    a new subprocess for every query) and prints the closest database
    sentence together with the elapsed time. An empty line or end of input
    ends the loop.
    """
    print('preparing...')
    start = time.time()
    prepare_memory()
    sentenceDatabase, idx2sen, sen2vec = prepare_model()
    print('elapsed', time.time() - start)
    while True:
        print('>', end= ' ')
        try:
            text = input()
        except EOFError:
            # stdin was closed (e.g. piped input ran out): stop cleanly.
            break
        if text == '':
            break
        start = time.time()
        text = sentenceTokenize(text)
        sent_vec = getSentenceVector(text = text)[0]
        print(talkVec(sentenceDatabase, idx2sen, sen2vec, sent_vec))
        print("elapsed", time.time() - start)

# Start the interactive loop; submitting an empty line ends it.
main()

preparing...
elaped 9.72899580001831
> แก้ม
หาเรื่องพี่แก้ม เดะมีสวยยย
elasped 9.233753442764282
> แก้ม ทำไรอยู่
อวาตาร์555 ไม่รู้จะทำไง คือหน้าrepo จะต้องทำให้ดึงมาจากดาต้าเบส โชวดีไวส์เป็นช่องๆ (เป็นrowอ่ะ)
elasped 9.284680366516113
> ทำอะไีอยู่อะ
ชั่ยแล้น มันมีสยามกะเซนเวิล อ่อมีเทอมินอลด้วย
elasped 9.241783857345581
> github
/home/aroundy/Documents/wongnai_rating1.csv: No such file or directory ERROR: (gcloud.compute.scp) [/usr/bin/scp] exited with return code [1]. =____= คือชั้นต้องสร้างไฟล์มารองก่อนเรอะ
elasped 9.224480152130127
> train model
อ่อ ก็ว่ามันพัง 555 พังอยู่ดี output มันต้องเป็นอะไรอ่ะ ValueError: Error when checking target: expected dense_4 to have shape (None, 5) but got array with shape (28000, 1)
elasped 9.222252130508423
> พน มีเรียนอะไรบ้าง
Nsc จารย์ให้รื้อuiละก็ทำไรเพิ่มเยอะอยู่ คือส่ง14 แต่อจขอดูศุกร์นี้ก่อน
elasped 9.277666330337524
> แก้ม พนเรียนอะไรอะ
อวาตาร์555 ไม่รู้จะทำไง คือหน้าrepo จะต้องทำให้ดึงมาจากดาต้าเบส โชวดีไวส์เป็นช่องๆ (เป็นrowอ่ะ)
elasped 9.4

# Implementing another IR Model

In [12]:
# Text file read by prepare_model_human(): each line is parsed as sentence
# tokens followed by 300 vector values. Nothing in the visible code writes it.
sentenceHumanVectorFile = 'senHumanVec.txt'
# Pickled {sentence: vector} dict cached by prepare_model_human().
sentenceHumanDictFile = 'senHuman2vec.pkl'

In [15]:
def prepare_model_human():
    """Load (or build and cache) the "human" sentence vectors and indexes.

    If ``sentenceHumanDictFile`` exists it is unpickled. Otherwise every
    line of ``sentenceHumanVectorFile`` is split on spaces: the last 300
    fields become the vector and the fields before them, concatenated
    without separators, become the sentence. The resulting dict is pickled
    to ``sentenceHumanDictFile``.

    Returns:
        tuple ``(sentenceHumanDatabase, idx2senHuman, senHuman2vec)`` where
        ``senHuman2vec`` maps sentence -> list of 300 floats,
        ``idx2senHuman`` maps row index -> sentence (dict iteration order),
        and ``sentenceHumanDatabase`` is a ``(len(senHuman2vec), 300)``
        float array whose row ``i`` is the vector of ``idx2senHuman[i]``.

    Raises:
        FileNotFoundError: if neither the cache nor
            ``sentenceHumanVectorFile`` exists.
        ValueError: if a vector field is not a valid float.

    Note: identical sentences share one dict key, so a repeated sentence
    keeps its first position and its last vector.
    """
    if os.path.isfile(sentenceHumanDictFile):
        senHuman2vec = load_object(sentenceHumanDictFile)
    else:
        senHuman2vec = {}
        with open(sentenceHumanVectorFile, 'r') as file:
            for line in file:
                fields = line.strip().split(' ')
                # Everything before the trailing 300 values is the sentence.
                sentence = "".join(fields[:len(fields) - 300])
                senHuman2vec[sentence] = [float(x) for x in fields[-300:]]
        save_object(senHuman2vec, sentenceHumanDictFile)

    idx2senHuman = dict(enumerate(senHuman2vec))
    sentenceHumanDatabase = np.zeros((len(senHuman2vec), 300))
    for row, sentence in enumerate(senHuman2vec):
        # One row assignment replaces the former 300-iteration inner loop.
        sentenceHumanDatabase[row] = senHuman2vec[sentence][:300]
    return sentenceHumanDatabase, idx2senHuman, senHuman2vec

In [13]:
def talkHmanVec(sentenceHumanDatabase, idx2senHuman, senHuman2vec, idx2sen, inputSentenceVector):
    """Return the sentence that follows the best match for the input vector.

    The input vector is compared with every row of
    ``sentenceHumanDatabase`` by cosine similarity (dot product divided by
    the product of the Euclidean norms). For the best row ``i`` the
    function returns ``idx2sen[i + 1]``.

    Args:
        sentenceHumanDatabase: 2-D ``numpy`` array, one vector per row.
        idx2senHuman: Unused; kept for interface compatibility.
        senHuman2vec: Unused; kept for interface compatibility.
        idx2sen: Mapping from index to sentence used for the answer.
        inputSentenceVector: 1-D vector with as many entries as
            ``sentenceHumanDatabase`` has columns.

    Raises:
        KeyError: if ``idx2sen`` has no entry for ``i + 1`` (e.g. the best
            match is the last indexed sentence).
        ValueError: if ``sentenceHumanDatabase`` has no rows.

    NOTE(review): the ``+ 1`` presumably selects the reply that follows the
    matched sentence -- confirm that ``idx2sen`` is ordered that way.
    """
    query = np.asarray(inputSentenceVector, dtype=float)
    dots = sentenceHumanDatabase.dot(query)
    # Euclidean norms: the previous ord=1 norms did not give cosine
    # similarity and could rank a different vector above an exact match.
    denominators = np.linalg.norm(sentenceHumanDatabase, axis=1) * np.linalg.norm(query)

    # Rows with an undefined cosine (zero row or zero query) score -inf so
    # they never beat a real match and no NaN reaches argmax.
    similarities = np.full(denominators.shape, -np.inf)
    np.divide(dots, denominators, out=similarities, where=denominators > 0)

    # The scores are deliberately not divided by their sum: that cannot
    # improve the ranking and reverses it whenever the sum is negative.
    outIdx = int(np.argmax(similarities))
    return idx2sen[outIdx + 1]

In [16]:
def main():
    """Smoke test: print sentence 300 and the answer ``talkHmanVec`` gives for it.

    Loads both models, prints ``idx2sen[300]`` and then prints the result
    of querying ``talkHmanVec`` with that sentence's own vector.
    """
    sentenceDatabase, idx2sen, sen2vec = prepare_model()
    # The returned values are not used below; the call still builds or
    # loads the human model and raises if its input file is missing.
    # NOTE(review): the query below runs against sentenceDatabase rather
    # than the human database -- confirm this is intended.
    prepare_model_human()
    testIdx = 300
    probe = idx2sen[testIdx]
    print(probe)
    answer = talkHmanVec(sentenceDatabase, idx2sen, sen2vec, idx2sen, np.array(sen2vec[probe]))
    print(answer)

# Run the smoke test when this code executes as the top-level module.
if __name__ == "__main__":
    main()

FileNotFoundError: [Errno 2] No such file or directory: 'senHumanVec.txt'