In [16]:
from dotenv import load_dotenv
import os

load_dotenv()

OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
PINECONE_API_KEY = os.environ['PINECONE_API_KEY']

In [17]:
from pinecone import Pinecone, ServerlessSpec
pc = Pinecone(api_key=PINECONE_API_KEY)

In [18]:
index_name = "wine-reviews"
for idx in pc.list_indexes():
    if idx.name == index_name:
        pc.delete_index(idx.name)

In [19]:
pc.create_index(
    name=index_name,
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

{
    "name": "wine-reviews",
    "metric": "cosine",
    "host": "wine-reviews-147ea7t.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1536,
    "deletion_protection": "disabled",
    "tags": null
}

In [20]:
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [28]:
from pinecone import Pinecone
import os
from dotenv import load_dotenv

load_dotenv()

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index(os.getenv("PINECONE_INDEX_NAME"))

print(index.describe_index_stats())


{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [29]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone
from openai import OpenAI

# .env 로드
load_dotenv()

# OpenAI & Pinecone 클라이언트 설정
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Pinecone Index 불러오기
index_name = os.getenv("PINECONE_INDEX_NAME")  # wine-reviews
index = pc.Index(index_name)

# 예시 데이터 (보통 CSV/JSON에서 읽어옴)
wine_reviews = [
    {"id": "1", "text": "A fresh Chardonnay with citrus and apple notes.", "metadata": {"wine": "Chardonnay", "country": "France"}},
    {"id": "2", "text": "A bold Cabernet Sauvignon with dark fruit and oak.", "metadata": {"wine": "Cabernet Sauvignon", "country": "USA"}},
]

# OpenAI 임베딩 생성 + Pinecone 업서트
vectors = []
for review in wine_reviews:
    embedding = client.embeddings.create(
        model="text-embedding-3-small",  # dimension 1536
        input=review["text"]
    ).data[0].embedding
    
    vectors.append({
        "id": review["id"],
        "values": embedding,
        "metadata": review["metadata"]
    })

# Pinecone에 업서트 (저장)
index.upsert(vectors=vectors)

print("업로드 완료!")


업로드 완료!


In [30]:
print(index.describe_index_stats())


{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [31]:
{"id": "1001", 
 "text": "Aromas of ripe black cherry and spice. Medium-bodied with soft tannins.", 
 "metadata": {"wine": "Merlot", "country": "Italy", "points": 90}}


{'id': '1001',
 'text': 'Aromas of ripe black cherry and spice. Medium-bodied with soft tannins.',
 'metadata': {'wine': 'Merlot', 'country': 'Italy', 'points': 90}}

In [35]:
import os
import pandas as pd
from dotenv import load_dotenv
from pinecone import Pinecone
from openai import OpenAI
from tqdm import tqdm

# 1. 환경변수 로드
load_dotenv()
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

index_name = os.getenv("PINECONE_INDEX_NAME")
index = pc.Index(index_name)

# 2. 데이터 불러오기 (CSV)
df = pd.read_csv("data/winemag-data-130k-v2.csv")

# 필요한 컬럼만 추출
# description: 리뷰 텍스트
# title: 와인 이름
# country: 생산 국가
# variety: 품종
# points: 점수
df = df[["description", "title", "country", "variety", "points"]].dropna()

# 3. 벡터 업로드 함수
def upload_batch(batch, batch_size=100):
    vectors = []
    for i, row in batch.iterrows():
        # OpenAI 임베딩 생성
        embedding = client.embeddings.create(
            model="text-embedding-3-small",  # dim=1536
            input=row["description"]
        ).data[0].embedding
        
        vectors.append({
            "id": str(i),  # row index를 id로 사용
            "values": embedding,
            "metadata": {
                "title": row["title"],
                "country": row["country"],
                "variety": row["variety"],
                "points": int(row["points"])
            }
        })
    
    # Pinecone에 업서트
    index.upsert(vectors=vectors)

# 4. 배치 단위 업로드 (tqdm으로 진행 상황 표시)
batch_size = 100  # GPU/네트워크 상황 따라 조정 가능
for start in tqdm(range(0, len(df), batch_size)):
    end = start + batch_size
    batch = df.iloc[start:end]
    upload_batch(batch, batch_size=batch_size)

print("✅ 모든 데이터 업로드 완료!")


100%|██████████| 1300/1300 [14:35:29<00:00, 40.41s/it]  

✅ 모든 데이터 업로드 완료!



