# 🧠 Vector Search + RAG + FastAPI + PCA 시각화
GitHub 이슈를 벡터화하고 AI 기반 유사 이슈를 추천하는 실습

In [None]:

!pip install -q sentence-transformers psycopg2-binary fastapi uvicorn openai matplotlib scikit-learn


In [None]:

from sentence_transformers import SentenceTransformer
import pandas as pd

# Load the sentence-embedding model and the sample issue data.
model = SentenceTransformer('all-MiniLM-L6-v2')
df = pd.read_csv('github_issues_sample.csv')

# Encode every description in one batched call instead of one call per row:
# the encoder batches internally, which is much faster than `.apply`.
# Missing descriptions are embedded as empty strings so that a blank CSV cell
# does not reach the encoder as a float NaN.
descriptions = df['description'].fillna('').astype(str).tolist()
embeddings = model.encode(descriptions)

# Store each vector as a plain Python list (one list per row) so it can be
# passed straight to the database driver later.
df['embedding'] = pd.Series(embeddings.tolist(), index=df.index, dtype=object)
df.head()


In [None]:

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Project the issue embeddings onto their first two principal components.
X = df['embedding'].tolist()
pca = PCA(n_components=2)
reduced = pca.fit_transform(X)

# Scatter the 2D projection.
plt.figure(figsize=(8, 6))
plt.scatter(reduced[:, 0], reduced[:, 1])

# Label every point with the first 20 characters of its issue title, nudged
# slightly up and to the right so the text does not sit on top of the marker.
for (pc1, pc2), issue_title in zip(reduced, df['title']):
    plt.text(pc1 + 0.02, pc2 + 0.02, issue_title[:20], fontsize=9)

plt.title("GitHub Issue Embedding (PCA 2D)")
plt.xlabel("PCA1")
plt.ylabel("PCA2")
plt.grid(True)
plt.show()


In [None]:

import psycopg2

# Connection settings below are placeholders; replace them with real values.
conn = psycopg2.connect(
    dbname="your_db",
    user="your_user",
    password="your_password",
    host="your_host",
    port="5432"
)
try:
    # `with conn` commits when the block succeeds and rolls back when it
    # raises, so a failed load never leaves the table dropped but empty.
    # The cursor's own context manager closes it in either case.
    with conn, conn.cursor() as cursor:
        # NOTE: this recreates the table from scratch; existing rows are lost.
        cursor.execute("""
CREATE EXTENSION IF NOT EXISTS vector;
DROP TABLE IF EXISTS issues;
CREATE TABLE issues (
    id SERIAL PRIMARY KEY,
    title TEXT,
    description TEXT,
    embedding vector(384)
);
""")

        # psycopg2 sends a Python list as a SQL array, so cast it to `vector`
        # explicitly rather than relying on an implicit assignment cast.
        cursor.executemany(
            "INSERT INTO issues (title, description, embedding) VALUES (%s, %s, %s::vector)",
            list(zip(df['title'], df['description'], df['embedding']))
        )
finally:
    # `with conn` ends the transaction but does not close the connection.
    conn.close()


In [None]:

from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn
import psycopg2

# FastAPI application object; the route below is registered on it via decorator.
app = FastAPI()

# Request body schema for POST /search.
class Query(BaseModel):
    # Free-text query; it is embedded and compared against the stored issue vectors.
    text: str

@app.post("/search")
def search(query: Query):
    """Return the three stored issues most similar to the query text.

    The query text is embedded with the module-level `model`, then compared
    against `issues.embedding` using pgvector's `<#>` operator.

    Args:
        query: Request body carrying the free-text query in `text`.

    Returns:
        A dict `{"results": [...]}` where each item has `id`, `title`,
        `description` and `similarity` (inner product; higher means more
        similar), ordered from most to least similar.
    """
    vector = model.encode(query.text).tolist()
    conn = psycopg2.connect(
        dbname="your_db",
        user="your_user",
        password="your_password",
        host="your_host",
        port="5432"
    )
    try:
        with conn.cursor() as cur:
            # psycopg2 sends the Python list as a SQL array; pgvector's
            # operators only accept `vector` operands, so the parameter must
            # be cast explicitly or the query fails with "operator does not
            # exist".
            # `<#>` yields the NEGATIVE inner product, so ascending order
            # puts the most similar rows first.
            cur.execute(
                "SELECT id, title, description, embedding <#> %s::vector AS neg_inner_product "
                "FROM issues ORDER BY neg_inner_product LIMIT 3",
                (vector,)
            )
            rows = cur.fetchall()
    finally:
        # Release the connection even when the query raises.
        conn.close()

    # Flip the sign so the reported "similarity" grows with closeness.
    return {
        "results": [
            {
                "id": issue_id,
                "title": title,
                "description": description,
                "similarity": -neg_inner_product,
            }
            for issue_id, title, description, neg_inner_product in rows
        ]
    }

# To run the API locally:
# uvicorn.run(app, host="0.0.0.0", port=8000)


In [None]:

import openai

# Example query: fetch the most similar stored issues directly from the database.
query_text = "Upload fails on large file"
query_vec = model.encode(query_text).tolist()

conn = psycopg2.connect(
    dbname="your_db",
    user="your_user",
    password="your_password",
    host="your_host",
    port="5432"
)
try:
    with conn.cursor() as cur:
        # The list parameter arrives as a SQL array and must be cast to
        # `vector` for pgvector's `<#>` operator to resolve. `<#>` is the
        # negative inner product, so ascending order is most-similar-first.
        cur.execute(
            "SELECT title, description FROM issues ORDER BY embedding <#> %s::vector LIMIT 3", (query_vec,)
        )
        top_issues = cur.fetchall()
finally:
    # Release the connection even when the query raises.
    conn.close()

# Flatten the retrieved issues into a bullet list used as the prompt context.
context = "\n".join(f"- {title}: {desc}" for title, desc in top_issues)

# Generate the RAG answer. `openai.ChatCompletion` was removed in openai>=1.0,
# so use the client interface; the API key is read from OPENAI_API_KEY.
client = openai.OpenAI()
response = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You are an AI assistant for software issue analysis."},
        {"role": "user", "content": f"New issue: {query_text}\nRelated issues:\n{context}\nWhat can I do?"}
    ]
)

print(response.choices[0].message.content)
