In [9]:
import pandas as pd
from sqlalchemy import create_engine

df = pd.read_csv("./국토교통부_전국 법정동_20250415.csv")
filtered_df = df[df["시군구명"].notna() & df["읍면동명"].isna()]
filtered_df["시도코드"] = filtered_df["법정동코드"].astype(str).str[:2]
filtered_df["시군구코드"] = filtered_df["법정동코드"].astype(str).str[2:5]

final_df = filtered_df[["시도코드", "시군구코드", "시도명", "시군구명", "법정동코드"]]
keep_sido = ["서울특별시", "부산광역시", "제주특별자치도", "인천광역시"]
keep_sigungu = ["강릉시"]

final_df = filtered_df[
    (filtered_df["시도명"].isin(keep_sido)) |
    (filtered_df["시군구명"].isin(keep_sigungu))
]

final_df = final_df[["시도코드", "시군구코드", "시도명", "시군구명", "법정동코드"]]

final_df["시도_시군구"] = final_df["시도명"] + " " + final_df["시군구명"]

final_df=final_df[["시도코드", "시군구코드", "시도_시군구"]]
final_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["시도코드"] = filtered_df["법정동코드"].astype(str).str[:2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["시군구코드"] = filtered_df["법정동코드"].astype(str).str[2:5]


Unnamed: 0,시도코드,시군구코드,시도_시군구
1,11,110,서울특별시 종로구
94,11,140,서울특별시 중구
179,11,170,서울특별시 용산구
229,11,200,서울특별시 성동구
332,11,215,서울특별시 광진구
...,...,...,...
3713,28,720,인천광역시 옹진군
11496,42,150,강원도 강릉시
46295,50,110,제주특별자치도 제주시
46433,50,130,제주특별자치도 서귀포시


In [10]:
import pandas as pd
import numpy as np
import faiss
from openai import OpenAI
from dotenv import load_dotenv
import os
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


texts = final_df["시도_시군구"].tolist()

# OpenAI 임베딩 생성
embeddings = []
for text in texts:
    emb = client.embeddings.create(
        model="text-embedding-3-small",  # 혹은 text-embedding-3-large
        input=text
    )
    embeddings.append(emb.data[0].embedding)

embeddings = np.array(embeddings, dtype="float32")

# 메타데이터 준비
metadata = final_df[["시도코드", "시군구코드", "시도_시군구"]].to_dict(orient="records")

# FAISS 인덱스 생성
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

print(f"총 벡터 수: {index.ntotal}")

# 검색 함수
def search_region(query, top_k=1):
    q_emb = client.embeddings.create(
        model="text-embedding-3-small",
        input=query
    ).data[0].embedding
    q_emb = np.array([q_emb], dtype="float32")
    distances, indices = index.search(q_emb, top_k)
    return [metadata[i] for i in indices[0]]

# 테스트
print(search_region("서울 종로구"))

총 벡터 수: 61
[{'시도코드': '11', '시군구코드': '110', '시도_시군구': '서울특별시 종로구'}]


In [None]:
import pandas as pd
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv
import psycopg2
import os

load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# PostgreSQL 연결
conn = psycopg2.connect(
    host="localhost",
    dbname="postgres",
    user="postgres",
    password="1234",
    port=5432
)
cur = conn.cursor()

# 임베딩 생성 및 DB 저장
for _, row in final_df.iterrows():
    emb = client.embeddings.create(
        model="text-embedding-3-small",
        input=row["시도_시군구"]
    ).data[0].embedding
    
    cur.execute(
        """
        INSERT INTO region_embeddings (sido_code, sigungu_code, sido_sigungu, embedding)
        VALUES (%s, %s, %s, %s)
        """,
        (row["시도코드"], row["시군구코드"], row["시도_시군구"], emb)
    )

conn.commit()

print("모든 임베딩 저장 완료!")

# 검색 함수
def search_region_pg(query, top_k=1):
    # 질의어 임베딩 생성
    q_emb = client.embeddings.create(
        model="text-embedding-3-small",
        input=query
    ).data[0].embedding
    
    # pgvector 유사도 검색 (L2 거리)
    cur.execute(
        """
        SELECT sido_code, sigungu_code, sido_sigungu,
               embedding <-> %s AS distance
        FROM region_embeddings
        ORDER BY embedding <-> %s
        LIMIT %s
        """,
        (q_emb, q_emb, top_k)
    )
    return cur.fetchall()



OperationalError: connection to server at "localhost" (127.0.0.1), port 5432 failed: Connection refused
	Is the server running on that host and accepting TCP/IP connections?


NameError: name 'search_region_pg' is not defined

-- pgvector 확장 설치
CREATE EXTENSION IF NOT EXISTS vector;

-- 테이블 생성
CREATE TABLE region_embeddings (
    id SERIAL PRIMARY KEY,
    sido_code INT,
    sigungu_code INT,
    sido_sigungu TEXT,
    embedding vector(1536)  -- text-embedding-3-small 크기
);
