
#### Check requirements below before start:
requirements: 

    pip install pandas

    pip install torch
    
    pip install -U sentence_transformers
    
    pip install googletrans==4.0.0-rc1
    
## And Just run 'ALL of cells'!


### 1. Load model

In [1]:
import numpy as np
import torch

model = torch.load('model.pth') # 이미 생성된(저장된) 모델을 불러옴.
model.eval() ## 이걸 해야 evaluation mode로 진입함. 안 했을 경우 dropout처럼 traning단계에서 확률적으로 변하는 요소들이 반영되어 일관적인 아웃풋이 나오지 않음.

# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('sentence-transformers/stsb-bert-large')
# torch.save(model, 'model_large.pth')

# Sentences we want to encode. Example:
sentence = ['This framework generates embeddings for each input sentence']

print("-----------")
# Sentences are encoded by calling model.encode()
embedding = model.encode(sentence)
print(f"shape:{embedding.shape}, type: {type(embedding)}")

embedding = model.encode(sentence, convert_to_tensor=True)
print(f"shape:{embedding.shape}, type: {type(embedding)}")

  from .autonotebook import tqdm as notebook_tqdm


-----------
shape:(1, 384), type: <class 'numpy.ndarray'>
shape:torch.Size([1, 384]), type: <class 'torch.Tensor'>


In [2]:
ExampleText1 = 'hello. Im a undergraduate student of computer science. and i have a question for Django framework'
ExampleText2 = 'I am trying to follow this tutorial using pip to install a python package locally. Per the tutorial in the bacnet-restful directory when I run pip install wheel I get this error:'
ExampleText3 = 'pip is a replacement for easy_install. But should I install pip using easy_install on Windows? Is there a better way?'

embedded_vector1 = model.encode([ExampleText1])[0] # 지금은 numpy로 불러오게 되는데, 옵션을 바꾸면 torch로 불러올 수도 있음.
embedded_vector2 = model.encode([ExampleText2])[0]
embedded_vector3 = model.encode([ExampleText3])[0]

# print(torch.norm(embedded_vector1), torch.norm(embedded_vector2), torch.norm(embedded_vector3))
def cos_theta(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
print("similaliry between 1 and 2:",cos_theta(embedded_vector1, embedded_vector2))
print("similaliry between 1 and 3:",cos_theta(embedded_vector1, embedded_vector3))
print("similaliry between 2 and 3:",cos_theta(embedded_vector2, embedded_vector3))

similaliry between 1 and 2: 0.2828502
similaliry between 1 and 3: 0.18575291
similaliry between 2 and 3: 0.6597288


### 2. Load StackOverflow Data & calculate model embeddings

In [3]:
# SOF_dbms.csv is a preprocessed stackoverflow csv file. The preprocess is in 'post_analysis.ipynb'

import pandas as pd
df = pd.read_csv('SOF_dbms.csv')
print(df.shape)
df.head()

(500, 5)


Unnamed: 0.1,Unnamed: 0,id,title,body,tags
0,0,70257495,PostgreSQL: How can I check if table was creat...,I want to upgrade PostgreSQL database to new v...,sql|postgresql
1,1,70081753,Check if value is not null and not empty in a ...,I have created a function in Postgresql and sp...,postgresql
2,2,70028788,Host for postgresql db hosted on GCloud,Given a postgresql database instance hosted in...,postgresql
3,3,70027637,Get rows from A table with multiple records wh...,"I have 2 tables in mariadb\nTable 1\nid,did,st...",mysql
4,4,70113318,Multiple MySQL authentification options,"I, I was wondering if it was possible to use m...",mysql


In [4]:
import pickle, os
from utils import gen_embedding
title = [text for text in df['title']]
body = [text for text in df['body']]
title_embedding = gen_embedding(title, 'SO_title.pickle')
body_embedding = gen_embedding(body, 'SO_body.pickle')

print("title_shape:", title_embedding.shape)
print("body_shape:", body_embedding.shape)

SO_title.pickle already exist. Loaded fileLoc
SO_body.pickle already exist. Loaded fileLoc
title_shape: torch.Size([500, 384])
body_shape: torch.Size([500, 384])


### 3. Generate Hash function, and calculate the hash value of embeddings

In [5]:
import yaml, pickle
from mips_ALSH import Mips, HashFt, Hash_Table
from utils import gen_DataTable
with open('config.yaml', 'r') as f:
    params = yaml.load(f, Loader = yaml.FullLoader)

hashft_class = HashFt(params)
hashft = hashft_class.hash_functions

title_Data = gen_DataTable(title_embedding, 'SO_title_dataTable.pickle', hashft)
body_Data = gen_DataTable(body_embedding, 'SO_body_dataTable.pickle', hashft)


search_engine = Mips(hashft, params)


SO_title_dataTable.pickle already exist. Loaded SO_title_dataTable.pickle
SO_body_dataTable.pickle already exist. Loaded SO_body_dataTable.pickle


In [6]:
text = 'postgresql'
ranking, sorted = search_engine.search(text, model.encode([text], convert_to_tensor=True), title_Data, body_Data)
print("\n------------------------")
print("ranking index: ",ranking)
print()
for i in range(len(ranking)):
    print(sorted[-i-1], " : ", df.iloc[int(ranking[-i-1]), 1], ":", df.iloc[int(ranking[-i-1]), 2])
    print()


------------------------
ranking index:  tensor([329, 450,  82, 147,  63, 123,  31, 141, 447,  26, 284, 330, 430, 272,
        270, 230, 177, 274,  40, 487,  79, 378, 387, 370,  90, 441, 379, 325,
        472, 349, 445, 309, 275, 443,  87, 297, 236, 398, 227, 103, 102,  38,
        420, 463, 421, 126, 442, 428,  98, 204])

tensor(2179)  :  70262470 : All unique combinations of a column based on another one

tensor(2173)  :  70146079 : Count removes all but one value, instead of showing one of each value, and distinct replaces names with '0'

tensor(2172)  :  70063931 : calculate date in 1 min interval postgres

tensor(2171)  :  70124780 : Get black friday dates in Postgres

tensor(2171)  :  70366959 : Use many call apoc.do.when

tensor(2168)  :  70337480 : Need MYSQL uquery solution

tensor(2167)  :  70253133 : Convert oracle pivot to Postgres query

tensor(2165)  :  70313183 : Join with aggregate function on empty resultset

tensor(2165)  :  70190330 : In Postgres 10 how to roll mult

In [7]:
from utils import ranking_postprocess
ranking_dict = {"/detail/"+str(df['id'][int(ranking[-i-1])]):df['title'][int(ranking[-i-1])] for i in range(len(ranking))} # Not Determined Yet.
re_ranking = ranking_postprocess(text, ranking_dict, params['ranking_size'])
print("\n------------------------")
print()
for key, val in re_ranking.items():
    print("key:", key)
    print("val:", val)
    print()


------------------------

key: /detail/70233870
val: custom field field in postgreSQL

key: /detail/70244373
val: PostgreSQL select IN with duplicate

key: /detail/70342055
val: How to share a postgreSQL database?

key: /detail/70094882
val: PostgreSQL array_agg returns null if first param is null

key: /detail/70076332
val: Is there a way to improve BigQuery's performance over PostgreSQL



In [16]:
import googletrans
translator = googletrans.Translator()

Kor_text = '''
Neo4j 실행 오류
'''

translated_text = translator.translate(Kor_text, dest='en')
print("Kor2Eng:", translated_text.text)

Eng_text = translated_text.text
translated_text = translator.translate(Eng_text, dest='en')
print("Eng2Eng:", translated_text.text)

Kor2Eng: NEO4J execution error
Eng2Eng: NEO4J execution error


In [17]:
ranking, sorted = search_engine.search(translated_text.text, model.encode([translated_text.text], convert_to_tensor=True), title_Data, body_Data)
ranking_dict = {"/detail/"+str(df['id'][int(ranking[-i-1])]):df['title'][int(ranking[-i-1])] for i in range(len(ranking))} # Not Determined Yet.
re_ranking = ranking_postprocess(text, ranking_dict, params['ranking_size'])
print("\n------------------------")
print()
for key, val in re_ranking.items():
    print("val:", val)
    print()


------------------------

key: /detail/70217899
val: Postgresql Create Type

key: /detail/70207577
val: PostgreSQL query for tvshows with multiple genres

key: /detail/70094882
val: PostgreSQL array_agg returns null if first param is null

key: /detail/70186922
val: How to combine a date and string in Postgresql

key: /detail/70287201
val: Automatically set SUM calculation's result as column value during / after row insert



In [10]:
a = torch.rand(size = (3, 4))
print(a)
maxnorm = torch.max(torch.norm(a, dim = 1))

print(torch.norm(a/maxnorm, dim = 1))

b = a/maxnorm

added = torch.Tensor([[0.5-torch.norm(v)**(2**i) for i in range(1, 60+1)] for v in b])
        
expanded = torch.concat([b, added], dim = 1)

print(torch.norm(expanded, dim = 1))
      

tensor([[0.1361, 0.3817, 0.7655, 0.5168],
        [0.3004, 0.4099, 0.1195, 0.2574],
        [0.4602, 0.8081, 0.1052, 0.3067]])
tensor([1.0000, 0.5771, 0.9764])
tensor([3.8730, 3.8730, 3.8730])
