
#### Check requirements below before start:
requirements: 

    pip install pandas

    pip install torch
    
    pip install -U sentence_transformers
    
    pip install googletrans==4.0.0-rc1
    
## And Just run 'ALL of cells'!


### 1. Load model

In [1]:
import numpy as np
import torch

model = torch.load('model_large.pth') # 이미 생성된(저장된) 모델을 불러옴.
model.eval() ## 이걸 해야 evaluation mode로 진입함. 안 했을 경우 dropout처럼 traning단계에서 확률적으로 변하는 요소들이 반영되어 일관적인 아웃풋이 나오지 않음.

# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('sentence-transformers/stsb-bert-large')
# torch.save(model, 'model_large.pth')

# Sentences we want to encode. Example:
sentence = ['This framework generates embeddings for each input sentence']

print("-----------")
# Sentences are encoded by calling model.encode()
embedding = model.encode(sentence)
print(f"shape:{embedding.shape}, type: {type(embedding)}")

embedding = model.encode(sentence, convert_to_tensor=True)
print(f"shape:{embedding.shape}, type: {type(embedding)}")

  from .autonotebook import tqdm as notebook_tqdm


-----------
shape:(1, 1024), type: <class 'numpy.ndarray'>
shape:torch.Size([1, 1024]), type: <class 'torch.Tensor'>


In [2]:
ExampleText1 = 'hello. Im a undergraduate student of computer science. and i have a question for Django framework'
ExampleText2 = 'I am trying to follow this tutorial using pip to install a python package locally. Per the tutorial in the bacnet-restful directory when I run pip install wheel I get this error:'
ExampleText3 = 'pip is a replacement for easy_install. But should I install pip using easy_install on Windows? Is there a better way?'

embedded_vector1 = model.encode([ExampleText1])[0] # 지금은 numpy로 불러오게 되는데, 옵션을 바꾸면 torch로 불러올 수도 있음.
embedded_vector2 = model.encode([ExampleText2])[0]
embedded_vector3 = model.encode([ExampleText3])[0]

# print(torch.norm(embedded_vector1), torch.norm(embedded_vector2), torch.norm(embedded_vector3))
def cos_theta(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
print("similaliry between 1 and 2:",cos_theta(embedded_vector1, embedded_vector2))
print("similaliry between 1 and 3:",cos_theta(embedded_vector1, embedded_vector3))
print("similaliry between 2 and 3:",cos_theta(embedded_vector2, embedded_vector3))

similaliry between 1 and 2: 0.3606457
similaliry between 1 and 3: 0.13557953
similaliry between 2 and 3: 0.53019726


### 2. Load StackOverflow Data & calculate model embeddings

In [3]:
# SOF_dbms.csv is a preprocessed stackoverflow csv file. The preprocess is in 'post_analysis.ipynb'

import pandas as pd
df = pd.read_csv('SOF_dbms copy.csv')
print(df.shape)
df.head()

(9161, 4)


Unnamed: 0.1,Unnamed: 0,id,title,body
0,0,69783030,"""ATAL: password authentication failed for user...",I install postgres12 with pgadmin 4 on windows...
1,1,25370769,"""Backend Error"" when load to BigQuery table","It appears this has been a common issue, it ha..."
2,2,51301363,"""CAST"" function with ""DISTINCT ON"" not changin...",I have two tables parent and child . I need to...
3,3,52036705,"""CLUSTER BY expression must be groupable, but ...",I've created a table using the web UI as:<pre>...
4,4,67614474,"""CREATE USER PASSWORD"" vs ""CREATE USER WITH PA...","Based on <a href=""https://www.postgresql.org/d..."


In [4]:
import pickle, os
from utils import gen_embedding
title = [text for text in df['title']]
body = [text for text in df['body']]
title_embedding = gen_embedding(title, 'SO_title.pickle')
body_embedding = gen_embedding(body, 'SO_body.pickle')

print("title_shape:", title_embedding.shape)
print("body_shape:", body_embedding.shape)

### 3. Generate Hash function, and calculate the hash value of embeddings

In [None]:
import yaml, pickle
from mips_ALSH import Mips, HashFt, Hash_Table
from utils import gen_DataTable
with open('config.yaml', 'r') as f:
    params = yaml.load(f, Loader = yaml.FullLoader)

hashft_class = HashFt(params)
hashft = hashft_class.hash_functions

title_Data = gen_DataTable(title_embedding, 'SO_title_dataTable.pickle', hashft)
body_Data = gen_DataTable(body_embedding, 'SO_body_dataTable.pickle', hashft)


search_engine = Mips(hashft, params)


SO_title_dataTable.pickle already exist. Loaded fileLoc
SO_body_dataTable.pickle already exist. Loaded fileLoc


In [None]:
text = 'postgresql'
ranking, sorted = search_engine.search(text, model.encode([text], convert_to_tensor=True), title_Data, body_Data)
print("\n------------------------")
print("ranking index: ",ranking)
print()
for i in range(len(ranking)):
    print(sorted[-i-1], " : ", df.iloc[int(ranking[-i-1]), 1], ":", df.iloc[int(ranking[-i-1]), 2])
    print()


------------------------
ranking index:  tensor([4580, 5307, 3688, 8225, 6784])

tensor(1097)  :  68471431 : How to change .csv file from one folder to another after import to postgreSQL?

tensor(1092)  :  60642774 : How to pass a value from Before Trigger to After Trigger

tensor(1092)  :  50167517 : Did Postgresq vacuum execute when performing JDBC transaction?

tensor(1090)  :  71759507 : Google BigQuery -Revert to the old table view in BigQuery WEB UI?

tensor(1090)  :  31993896 : Failure on CSV import into Neo4j 2.2.4 using neo4j-import



In [None]:
import googletrans
translator = googletrans.Translator()

Kor_text = '''안녕하세요. 빅쿼리를 디비버에 연결하려는데 엑세스 문제가 있어서 문의 드립니다. 편집자로만 역할을 부여했을 때에도 엑세스 문의가 있어서 새로운 프로젝트를 만들고 소유자와 편집자 역할 모두 부여했는데도 오류코드를 받았습니다.
어떻게 해야 연결할 수 있을까요?
확인 부탁드립니다ㅠㅠ
감사합니다.'''

translated_text = translator.translate(Kor_text, dest='en')
print("Kor2Eng:", translated_text.text)

Eng_text = translated_text.text
translated_text = translator.translate(Eng_text, dest='en')
print("Eng2Eng:", translated_text.text)

Kor2Eng: hello.I want to connect the big query to Diviber, so I have an access problem.When I was only an editor, I had access inquiries, so I created a new project and received both the owner and the role of the editor.
How can I connect?
Please confirm it.
thank you
Eng2Eng: hello.I want to connect the big query to Diviber, so I have an access problem.When I was only an editor, I had access inquiries, so I created a new project and received both the owner and the role of the editor.
How can I connect?
Please confirm it.
thank you


In [None]:
ranking, sorted = search_engine.search(translated_text.text, model.encode([translated_text.text], convert_to_tensor=True), title_Data, body_Data)
print("\n------------------------")
print("ranking index: ",ranking)
print()
for i in range(len(ranking)):
    print(df.iloc[int(ranking[-i]), 1], ":", df.iloc[int(ranking[-i]), 2])
    # print(df.iloc[int(ranking[-i]), 3])
    print()


------------------------
ranking index:  tensor([2966, 5049, 2235, 7860, 8047])

38387679 : Convert interval to microseconds as number type in PostgreSQL?

15359029 : How to list the train operators that use the second oldest trains (PostgreSQL)

50247890 : How to implement PRAGMA EXCEPTION_INIT in Postgres?

58289672 : Can I upgrade PostgreSql 10 to PostgreSql 12

38406184 : Get each value's difference from overall min value in Postgres



In [None]:
a = torch.rand(size = (3, 4))
print(a)
maxnorm = torch.max(torch.norm(a, dim = 1))

print(torch.norm(a/maxnorm, dim = 1))

b = a/maxnorm

added = torch.Tensor([[0.5-torch.norm(v)**(2**i) for i in range(1, 60+1)] for v in b])
        
expanded = torch.concat([b, added], dim = 1)

print(torch.norm(expanded, dim = 1))
      

tensor([[0.2649, 0.9184, 0.9000, 0.6577],
        [0.8096, 0.6223, 0.3485, 0.1301],
        [0.0627, 0.4490, 0.7404, 0.2592]])
tensor([1.0000, 0.7401, 0.6170])
tensor([4.0000, 3.8730, 3.8730])
