<a href="https://colab.research.google.com/github/karaage0703/ai-karaage-sensei-bot/blob/main/notebooks/Llama_2_LangChain_RetrievalQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Llama 2 + LangChain の RetrievalQA

以下のサイトを参考にNotebook化しました。このNotebookでは、テキストファイルからFAISSインデックスを作成し、Google Driveに保存します。

- https://note.com/npaka/n/n3164e8b24539
- https://note.com/npaka/n/n6d33c2181050

ライブラリのインストール

In [None]:
!pip -qq install langchain accelerate bitsandbytes sentence_transformers
!pip -qq install faiss-gpu

インポート

In [None]:
import logging
import sys
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import LlamaCpp
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.llms import OpenAIChat

ログレベル設定

In [None]:
# Send log records of level DEBUG and above to stdout so they show up in the
# cell output. force=True drops any handlers already attached to the root
# logger, so re-running this cell reconfigures logging instead of being ignored.
logging.basicConfig(
    level=logging.DEBUG,
    stream=sys.stdout,
    force=True,
)

Google Driveとの連携

In [None]:
# Mount Google Drive so that files under /content/drive can be read and
# written by the cells below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

データコピー

In [None]:
# Copy the three source text files from the mounted Google Drive folder
# (MyDrive/colab_data) into the current working directory, where the
# loading cell opens them by bare file name.
!cp /content/drive/MyDrive/colab_data/analysis_normalized_tweet_1.txt ./
!cp /content/drive/MyDrive/colab_data/analysis_normalized_tweet_2.txt ./
!cp /content/drive/MyDrive/colab_data/blog_text.txt ./

ドキュメントの読み込みとチャンクの分割・確認

In [None]:
# Load the documents.
# The files are read in this order and concatenated (with no separator) into a
# single corpus string.
DOCUMENT_PATHS = [
    "analysis_normalized_tweet_1.txt",
    "analysis_normalized_tweet_2.txt",
    "blog_text.txt",
]

# The name `test_all` is kept as-is because the chunk-splitting cell reads it.
test_all = ""
for document_path in DOCUMENT_PATHS:
    with open(document_path, encoding="utf-8") as f:
        test_all += f.read()

    # Running total of characters loaded so far (one line per file).
    print(len(test_all))

In [None]:
# Split the corpus into chunks.
# chunk_size / chunk_overlap are passed straight through to the splitter.
splitter_options = {"chunk_size": 514, "chunk_overlap": 20}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
texts = text_splitter.split_text(test_all)

# Inspect the chunks: total count first, then one line per chunk showing its
# first 10 characters (newlines shown as a literal "\n") and its length.
print(len(texts))
for text in texts:
    preview = text[:10].replace("\n", "\\n")
    print(preview, ":", len(text))

インデックスの作成

In [None]:
# Embedding model used to vectorize every chunk.
# NOTE(review): the E5 model family's documentation recommends prefixing inputs
# with "query: " / "passage: "; the chunks are embedded here without a prefix —
# confirm this is intended.
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")

# Embed all chunks and build a FAISS vector store from them.
index = FAISS.from_texts(texts=texts, embedding=embedding_model)

インデックスのセーブ

In [None]:
# Persist the FAISS index to a local directory so it can be archived and
# copied to Google Drive.
INDEX_DIR = 'storage'
index.save_local(INDEX_DIR)

インデックスをzip圧縮し、Google ColabからGoogle Driveへコピー

In [None]:
# Archive the saved index directory (recursively) into storage.zip.
# NOTE(review): if storage.zip already exists from an earlier run, zip updates
# it in place rather than recreating it — delete it first if a clean archive
# is required.
!zip -r storage.zip storage

In [None]:
# Copy the archived index into the Google Drive data folder (MyDrive/colab_data).
# This is a copy, not a move: storage.zip also remains in the working directory.
!cp storage.zip /content/drive/MyDrive/colab_data/

# 参考リンク

- https://qiita.com/taka_yayoi/items/c8c612b18ebc9f5cae8c