In [10]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
import tiktoken
from langchain.text_splitter import TokenTextSplitter
# from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores import FAISS
import pandas as pd

load_dotenv(find_dotenv())
openai.api_key = os.environ['OPENAI_API_KEY']

In [3]:
# Check encoding model for our llm
llm = "gpt-3.5-turbo"
tiktoken.encoding_for_model(llm)

<Encoding 'cl100k_base'>

In [4]:
# Create tokenizer based on our llm's encoding model
tokenizer = tiktoken.get_encoding('cl100k_base')
tokenizer

<Encoding 'cl100k_base'>

In [5]:
encoder_name = "cl100k_base"
text_splitter = TokenTextSplitter.from_tiktoken_encoder(
    # encoding_name=encoder_name,
    chunk_size=500,
    chunk_overlap=100,
    allowed_special={'<|endoftext|>'}
    # separators=["\n\n", "\n", " ", ""]
)

In [9]:
data = pd.read_json("small_arxiv_cleaned.json")
data = data[data['text'] != '']
print(len(data))
data.head()

56


Unnamed: 0,authors,categories,comment,doi,entry_id,journal_ref,pdf_url,primary_category,published,summary,title,updated,text
0,"[{'name': 'Zhuang Liu'}, {'name': 'Hanzi Mao'}...",[cs.CV],CVPR 2022; Code: https://github.com/facebookre...,,http://arxiv.org/abs/2201.03545v2,,http://arxiv.org/pdf/2201.03545,cs.CV,1641841150000,"The ""Roaring 20s"" of visual recognition began ...",A ConvNet for the 2020s,1646233696000,"Looking back at the 2010s, the decade was mar..."
1,"[{'name': 'Ze Liu'}, {'name': 'Yutong Lin'}, {...","[cs.CV, cs.LG]",,,http://arxiv.org/abs/2103.14030v2,,http://arxiv.org/pdf/2103.14030,cs.CV,1616695171000,"This paper presents a new vision Transformer, ...",Swin Transformer: Hierarchical Vision Transfor...,1629218494000,Modeling in computer vision has long been dom...
2,"[{'name': 'Alexey Dosovitskiy'}, {'name': 'Luc...","[cs.CV, cs.AI, cs.LG]",Fine-tuning code and pre-trained models are av...,,http://arxiv.org/abs/2010.11929v2,,http://arxiv.org/pdf/2010.11929,cs.CV,1603389359000,While the Transformer architecture has become ...,An Image is Worth 16x16 Words: Transformers fo...,1622725736000,While the Transformer architecture has become...
3,"[{'name': 'Dan Friedman'}, {'name': 'Alexander...","[cs.LG, cs.CL]","Our code, and example Transformer Programs, ar...",,http://arxiv.org/abs/2306.01128v1,,http://arxiv.org/pdf/2306.01128,cs.LG,1685651221000,Recent research in mechanistic interpretabilit...,Learning Transformer Programs,1685651221000,Recent research in mechanistic interpretabili...
4,"[{'name': 'Lingjiao Chen'}, {'name': 'Matei Za...","[cs.LG, cs.AI, cs.CL, cs.SE]",,,http://arxiv.org/abs/2305.05176v1,,http://arxiv.org/pdf/2305.05176,cs.LG,1683609062000,There is a rapidly growing number of large lan...,FrugalGPT: How to Use Large Language Models Wh...,1683609062000,There is a rapidly growing number of large la...


In [11]:
model_name = 'text-embedding-ada-002'

# embed = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
embed = OpenAIEmbeddings(
    model=model_name,
)

In [12]:
# Need to split the 'text' column into chunks for each row

metadatas = []
for i, text in enumerate(data['text']):
    metadata = {
        "authors": [author['name'] for author in data.iloc[i]['authors']],
        "pdf_url": data.iloc[i]['pdf_url'],
        "summary": data.iloc[i]['summary'],
        "title": data.iloc[i]['title']
    }
    chunks = text_splitter.split_text(
        text
    )

    chunks_metadata = [{
        "chunk_id": j, "chunk": text, **metadata
    } for j, text in enumerate(chunks)]

    metadatas.extend(chunks_metadata)

In [13]:
data_chunks = pd.DataFrame(metadatas)
print(len(data_chunks))
data_chunks.head()

2247


Unnamed: 0,chunk_id,chunk,authors,pdf_url,summary,title
0,0,"Looking back at the 2010s, the decade was mar...","[Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christop...",http://arxiv.org/pdf/2201.03545,"The ""Roaring 20s"" of visual recognition began ...",A ConvNet for the 2020s
1,1,"is translation equivariance, which is a desir...","[Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christop...",http://arxiv.org/pdf/2201.03545,"The ""Roaring 20s"" of visual recognition began ...",A ConvNet for the 2020s
2,2,a quadratic complexity with respect to the in...,"[Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christop...",http://arxiv.org/pdf/2201.03545,"The ""Roaring 20s"" of visual recognition began ...",A ConvNet for the 2020s
3,3,cantly in the training procedure and macro/mic...,"[Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christop...",http://arxiv.org/pdf/2201.03545,"The ""Roaring 20s"" of visual recognition began ...",A ConvNet for the 2020s
4,4,B regime which has FLOPs around 15.0 × 109. F...,"[Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christop...",http://arxiv.org/pdf/2201.03545,"The ""Roaring 20s"" of visual recognition began ...",A ConvNet for the 2020s


In [18]:
# Convert 'chunk' column to lower-case and strip leading/trailing whitespace
data_chunks['chunk'] = data_chunks['chunk'].str.lower().str.strip()

# Replace '<|endoftext|>' with empty string
data_chunks['chunk'] = data_chunks['chunk'].str.replace('<|endoftext|>', '', regex=False)

The substring <|endoftext|> occurs 1210 times before replacement.
The substring <|endoftext|> occurs 1204 times after replacement.


In [19]:
data_docs = DataFrameLoader(data_chunks, page_content_column='chunk')
docs = data_docs.load()

In [20]:
len(docs)

2247

In [21]:
docs_batch1 = docs[:1000]
docs_batch2 = docs[1000:2000]
docs_batch3 = docs[2000:]

In [22]:
# Create vector store
vectorstore = FAISS.from_documents(docs_batch1, embed)

In [23]:
# Add to vector store (work around from OpenAI rate limit error)
vectorstore.add_documents(docs_batch2)

['40253046-044f-4f0e-897f-e6a3c6d1ddf8',
 '4ff0680b-f5f0-49c8-b07d-e98d1861fc47',
 '79afa99d-b509-49d3-82e6-6272e8ba3c6f',
 'c04abaf0-a401-40f2-8e01-8459de300bb8',
 '495afba0-4534-4e80-907b-5d16313945f8',
 'b6497ac6-e991-4301-8c7d-7bdcdbe3104e',
 'c7da2de3-0b25-4ce2-bcd0-4cf3589e8540',
 'b46176f9-27df-4a21-bc01-01f82a9d3801',
 '97b793f9-3b48-4823-b8d0-7bc8fb215b71',
 'ffeec08b-8ee5-431d-9934-fef85b7f9f14',
 '83faacea-aff0-4b25-82f3-3baf791c4913',
 '9cd5c649-4ff2-49e2-a1fa-3d869270a6d0',
 '2b99ef78-4694-4efa-8032-dad9d58b6380',
 'b29aeba7-00be-4888-a46c-0eab0f66d2e9',
 '756fce1f-058e-429b-bd54-186bcc4de278',
 'd365819a-f553-4005-9d53-8ed5360bb234',
 '799c6281-587b-4896-8d2b-d0ffa9417abc',
 '03e15067-fddd-4df6-bae4-00bd6a69964b',
 'd49c73f1-40f5-4807-b40c-69028530fc47',
 '441f6106-e425-43fc-a560-c2e311f3c908',
 '840644a8-7fd0-4ae4-bde6-3fbc639e7a8f',
 '6475a27a-f40b-412c-91ec-9b5e79b93857',
 'c61b726f-f6ca-45eb-a6f8-62f3e26cdd1f',
 '8045e9c8-2658-4db4-81c2-9b6c4c182d22',
 'bdd98afa-4f33-

In [24]:
vectorstore.add_documents(docs_batch3)

['6e14300a-abe6-4316-ac3a-09dfd4c53b5f',
 '29fdfe50-d049-4695-af8c-dd4bd2893353',
 '1b030d79-b9e1-4e9a-bbf8-4acad3a22541',
 '0249df15-06f4-43d8-b223-8202560224a8',
 '4d2fb11c-e1ac-44c9-9166-15075b29604a',
 'd4804742-76bf-4a45-b41d-e1edf86dafea',
 '30c12b7b-b48d-423c-babf-8751f1689e5c',
 '210f896f-5a0d-495a-a6f1-7970faf44e96',
 '8fbf380c-44a4-4203-800d-ac4b9272b1ea',
 'f46d7ac1-82bf-40ed-a1e0-80be2df09fbb',
 '676b7828-ff8d-4aed-963c-2fa112dd4955',
 '1c5af501-1f0c-4abb-a0a4-4db7c7f8d488',
 '3efbd8a9-4d56-4888-bae1-19abd554378b',
 '2a6da4fb-8e0b-4440-a827-2bd04eb16166',
 '507dc4ce-0111-4793-9e5c-e166985e11dc',
 'ca37a044-9cd7-412c-b4c2-4d03322c2238',
 'a75eca72-adc7-436e-91d3-76c52942f2de',
 '895faa16-50bd-4993-bb48-31de29ac6ae6',
 'ff4bfc8c-475f-45df-9405-afd3323d1f7f',
 'ef54309c-8bca-42c5-909b-c0db4c47dba0',
 'a6a82172-55ef-42f7-aabe-a99d680694db',
 '31e26a13-58d4-466b-ac07-4b417190a71b',
 '8cba1636-eafe-4d71-946c-a91e508b26d3',
 '5e52bafd-1c37-4c2f-9bc7-6665cce598fa',
 'd0c703c4-47c8-

In [25]:
len(vectorstore.index_to_docstore_id)

2247

In [26]:
vectorstore.save_local("arxiv_vectorstore")