# 🧠 Vector Search: GitHub Issue Similarity Search with pgvector

In [None]:
# ✅ 1. 필요한 라이브러리 설치
!pip install -q sentence-transformers psycopg2-binary

# ✅ 2. Load the embedding model
from sentence_transformers import SentenceTransformer
import pandas as pd

# Identifier of the model handed to SentenceTransformer; every embedding in
# this script is produced by this one model instance.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# ✅ 3. Load the GitHub issue CSV and generate embeddings
df = pd.read_csv('github_issues_sample.csv')

# Encode every description in a single batched model.encode() call instead of
# one call per row. Missing descriptions (NaN cells) are embedded as empty
# strings so one blank cell does not abort the whole run; the 'description'
# column itself is left untouched.
description_texts = df['description'].fillna('').astype(str).tolist()
embedding_matrix = model.encode(description_texts)

# Store one plain Python list of floats per row (object column) so each cell
# can later be handed to the database driver as-is.
df['embedding'] = pd.Series(embedding_matrix.tolist(), index=df.index)
df.head()

# ✅ 4. Connect to PostgreSQL and load the data
import psycopg2

# Database connection settings, collected in one mapping and expanded into
# keyword arguments for psycopg2.connect().
db_settings = {
    "dbname": "your_db",
    "user": "your_user",
    "password": "your_password",
    "host": "your_host",
    "port": "5432",
}
conn = psycopg2.connect(**db_settings)

# Single cursor reused by the statements that follow.
cursor = conn.cursor()

# Create the table (requires pgvector to be installed)
# The DDL enables the vector extension, drops any existing `issues` table and
# recreates it with a 384-dimensional embedding column.
create_issues_table_sql = """
CREATE EXTENSION IF NOT EXISTS vector;
DROP TABLE IF EXISTS issues;
CREATE TABLE issues (
    id SERIAL PRIMARY KEY,
    title TEXT,
    description TEXT,
    embedding vector(384)
);
"""

# All three statements are sent in one execute() call, then committed.
cursor.execute(create_issues_table_sql)
conn.commit()

# Insert the data
from psycopg2.extras import execute_values

# Send the rows as batched multi-row INSERT statements instead of one round
# trip per row. Each embedding is a Python list, which psycopg2 renders as a
# Postgres ARRAY literal, so the template casts it to the pgvector type
# explicitly rather than relying on an implicit conversion.
issue_rows = list(zip(df['title'], df['description'], df['embedding']))
execute_values(
    cursor,
    "INSERT INTO issues (title, description, embedding) VALUES %s",
    issue_rows,
    template="(%s, %s, %s::vector)",
)
conn.commit()

# ✅ 5. Similar-issue search query
import numpy as np

query = "Upload fails when sending large files"
query_vec = model.encode(query).tolist()

# psycopg2 sends a Python list as a Postgres ARRAY literal, and pgvector's
# distance operators are defined on the vector type, so the parameter must be
# cast with ::vector or operator resolution fails.
#
# `<#>` yields the *negative* inner product (smaller = more similar), so rows
# are ordered by the raw operator value in ascending order, while the value
# reported as "similarity" is negated back to a positive-is-better score.
cursor.execute(
    """
    SELECT id,
           title,
           description,
           (embedding <#> %(query_vec)s::vector) * -1 AS similarity
    FROM issues
    ORDER BY embedding <#> %(query_vec)s::vector
    LIMIT 3
    """,
    {"query_vec": query_vec},
)
results = cursor.fetchall()

# Each row is (id, title, description, similarity).
for r in results:
    print(f"Issue #{r[0]}: {r[1]} (similarity: {r[3]:.4f})\n{r[2]}\n")

# Release the database handles: the cursor first, then its connection.
for db_resource in (cursor, conn):
    db_resource.close()