In [None]:
!pip install -qU openai pinecone-client datasets

In [None]:
import os, csv
from openai import OpenAI
from tqdm.autonotebook import tqdm
import pinecone
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()


# Build the OpenAI client and read the Pinecone API key from the environment
client = OpenAI(
  api_key=os.getenv("OPENAI_API_KEY"),  # this is also the default, it can be omitted
)
pinecone_api_key = os.getenv("PINECONE_API_KEY")
# Never print the keys themselves: cell outputs are saved with the notebook
# and would leak the secrets. Only report whether each key was found.
print("OPENAI_API_KEY set:", bool(client.api_key))
print("PINECONE_API_KEY set:", bool(pinecone_api_key))

In [None]:
# Read the first column of every CSV row into raw_data (one text per row)
csv_path = '../data/data.csv'

with open(csv_path, mode='r', encoding='UTF-8', newline='') as file:
    # csv.reader yields [] for blank lines, so row[0] would raise IndexError;
    # rows whose first column is empty are skipped too, because the embeddings
    # endpoint rejects empty input strings.
    raw_data = [row[0] for row in csv.reader(file) if row and row[0].strip()]

if not raw_data:
    raise ValueError(f"No usable rows found in {csv_path}")

# Show the loaded texts together with their lengths
for row in raw_data:
    print(len(row), row)

print(raw_data[0])

In [3]:
# Name of the OpenAI embedding model used throughout this notebook
embed_model = 'EMBED_MODEL'

In [8]:
# Try the embeddings endpoint on a single row first
data = raw_data[0]
res = client.embeddings.create(model=embed_model, input=[data])


In [None]:
# Inspect the embedding that was just created (optional, can be skipped):
# number of items, vector dimension, the vector itself, then the raw response
print(
    len(res.data),
    len(res.data[0].embedding),
    res.data[0].embedding,
    res,
    sep="\n",
)

In [4]:
# Pinecone setup: choose the index name and open the connection
# (API keys are issued at app.pinecone.io)
index_name = "sandbox"
pinecone.init(environment="PINECONE_ENV", api_key=pinecone_api_key)

In [5]:
# List the indexes visible to the Pinecone connection initialised above
print(pinecone.list_indexes())

['sandbox']


In [None]:
# Make sure the target index exists: on first run it is created with cosine
# similarity and a dimension equal to the length of the embedding held in `res`
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, metric='cosine', dimension=len(res.data[0].embedding))

# Open a handle to the index and display its current statistics
index = pinecone.Index(index_name)
index.describe_index_stats()

In [None]:
# Embed every CSV row in a single request; `vectors` starts out empty here
# and is filled by the upsert cell
vectors = []
res = client.embeddings.create(model=embed_model, input=raw_data)
print(len(res.data), res, sep="\n")


In [None]:
# Upsert the OpenAI embeddings into Pinecone, one vector per CSV row.
#
# `res` must still be the embeddings response for `raw_data`: the query cell
# further down rebinds `res`, so running cells out of order would otherwise
# pair rows with the wrong data. Fail loudly instead.
embedded = getattr(res, "data", None)
if embedded is None or len(embedded) != len(raw_data):
    raise ValueError(
        "`res` does not hold one embedding per row of raw_data; "
        "re-run the embedding cell before upserting"
    )

# Build the payload from scratch so that re-running this cell never appends
# duplicate entries to a list left over from a previous run.
# IDs are the row positions as strings; the source text is kept as metadata.
vectors = [
    {
        'id': str(i),
        'values': item.embedding,
        'metadata': {
            'text': text
        }
    }
    for i, (text, item) in enumerate(zip(raw_data, embedded))
]
# Report only the count; printing every full vector floods the output
print(len(vectors), "vectors prepared")

index.upsert(vectors)

In [7]:
# Embed the query text, then look up its nearest neighbours in Pinecone
query = "QUERY_TEXT"
xq = client.embeddings.create(model=embed_model, input=[query]).data[0].embedding

# Retrieve the two closest matches together with their stored metadata
res = index.query(vector=xq, top_k=2, include_metadata=True)

In [None]:
# Display the response returned by the Pinecone query
res

In [None]:
# Core of retrieval-augmented generation (RAG):
# ask the chat model the question, grounded on the texts retrieved from Pinecone
model = 'GPT_MODEL'
question = "QUESTION_TO_RAG_APP"
# Concatenate the retrieved texts into one context string, separated by rules
context = "\n\n---\n\n".join(match['metadata']['text'] for match in res['matches'])
ans = client.chat.completions.create(
    model=model,
    messages=[
        # System prompt (Japanese): act as an accurate baseball reporter, answer
        # from the context, and otherwise reply that no answer can be given.
        {"role": "system", "content": """あなたは、精確な野球の記者です。下記のコンテキストをもとに、質問に回答してください。
                    もしコンテキスト内に回答がなければ、次のように回答してください。 \"私が知っている限りにおいて、回答することはできません。\"\n\n 
                    では、深呼吸をして、この問題に一歩一歩取り組んでいきましょう。\n\n"""},
        {"role": "user", "content": f"コンテキスト: {context}\n\n---\n\n質問: {question}\n回答:"},
    ],
    temperature=0,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    stop=None,
)
# Full response object first, then just the generated answer text
print(ans, ans.choices[0].message.content, sep="\n")

In [None]:
# Fetch a single record from Pinecone by id.
# NOTE(review): the upsert cell assigns ids as str(i) counting from 0, so id '9'
# only exists if at least 10 rows were upserted -- confirm this is the intended id.
index.fetch(['9'])

In [None]:
# Delete every record this notebook upserted.
# The upsert cell assigns ids as str(i) for each row of raw_data, so derive the
# id list from len(raw_data) instead of a hard-coded count, which would leave
# records behind (or name ids that do not exist) whenever the CSV size changes.
id_ttl = len(raw_data)
id_list = [str(i) for i in range(id_ttl)]
print(id_list)
index.delete(ids=id_list)