In [28]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
import tiktoken
from langchain.text_splitter import TokenTextSplitter
# from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores import FAISS
import pandas as pd
import re

load_dotenv(find_dotenv())
openai.api_key = os.environ['OPENAI_API_KEY']

In [29]:
# Check encoding model for our llm
llm = "gpt-3.5-turbo"
tiktoken.encoding_for_model(llm)

<Encoding 'cl100k_base'>

In [30]:
# Create tokenizer based on our llm's encoding model
tokenizer = tiktoken.get_encoding('cl100k_base')
tokenizer

<Encoding 'cl100k_base'>

In [31]:
encoder_name = "cl100k_base"
text_splitter = TokenTextSplitter.from_tiktoken_encoder(
    # encoding_name=encoder_name,
    chunk_size=500,
    chunk_overlap=100,
    allowed_special={'<|endoftext|>'}
    # separators=["\n\n", "\n", " ", ""]
)

In [32]:
data = pd.read_json("small_arxiv_cleaned.json")
data = data[data['text'] != '']
print(len(data))
data.head()

56


Unnamed: 0,authors,categories,comment,doi,entry_id,journal_ref,pdf_url,primary_category,published,summary,title,updated,text
0,"[{'name': 'Zhuang Liu'}, {'name': 'Hanzi Mao'}...",[cs.CV],CVPR 2022; Code: https://github.com/facebookre...,,http://arxiv.org/abs/2201.03545v2,,http://arxiv.org/pdf/2201.03545,cs.CV,1641841150000,"The ""Roaring 20s"" of visual recognition began ...",A ConvNet for the 2020s,1646233696000,"Looking back at the 2010s, the decade was mar..."
1,"[{'name': 'Ze Liu'}, {'name': 'Yutong Lin'}, {...","[cs.CV, cs.LG]",,,http://arxiv.org/abs/2103.14030v2,,http://arxiv.org/pdf/2103.14030,cs.CV,1616695171000,"This paper presents a new vision Transformer, ...",Swin Transformer: Hierarchical Vision Transfor...,1629218494000,Modeling in computer vision has long been dom...
2,"[{'name': 'Alexey Dosovitskiy'}, {'name': 'Luc...","[cs.CV, cs.AI, cs.LG]",Fine-tuning code and pre-trained models are av...,,http://arxiv.org/abs/2010.11929v2,,http://arxiv.org/pdf/2010.11929,cs.CV,1603389359000,While the Transformer architecture has become ...,An Image is Worth 16x16 Words: Transformers fo...,1622725736000,While the Transformer architecture has become...
3,"[{'name': 'Dan Friedman'}, {'name': 'Alexander...","[cs.LG, cs.CL]","Our code, and example Transformer Programs, ar...",,http://arxiv.org/abs/2306.01128v1,,http://arxiv.org/pdf/2306.01128,cs.LG,1685651221000,Recent research in mechanistic interpretabilit...,Learning Transformer Programs,1685651221000,Recent research in mechanistic interpretabili...
4,"[{'name': 'Lingjiao Chen'}, {'name': 'Matei Za...","[cs.LG, cs.AI, cs.CL, cs.SE]",,,http://arxiv.org/abs/2305.05176v1,,http://arxiv.org/pdf/2305.05176,cs.LG,1683609062000,There is a rapidly growing number of large lan...,FrugalGPT: How to Use Large Language Models Wh...,1683609062000,There is a rapidly growing number of large la...


In [33]:
model_name = 'text-embedding-ada-002'

# embed = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
embed = OpenAIEmbeddings(
    model=model_name,
)

In [34]:
# Need to split the 'text' column into chunks for each row

metadatas = []
for i, text in enumerate(data['text']):
    metadata = {
        "authors": [author['name'] for author in data.iloc[i]['authors']],
        "pdf_url": data.iloc[i]['pdf_url'],
        "summary": data.iloc[i]['summary'],
        "title": data.iloc[i]['title']
    }
    chunks = text_splitter.split_text(
        text
    )

    chunks_metadata = [{
        "chunk_id": j, "chunk": text, **metadata
    } for j, text in enumerate(chunks)]

    metadatas.extend(chunks_metadata)

In [44]:
data_chunks = pd.DataFrame(metadatas)
print(len(data_chunks))
data_chunks.head()

2247


Unnamed: 0,chunk_id,chunk,authors,pdf_url,summary,title
0,0,"Looking back at the 2010s, the decade was mar...","[Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christop...",http://arxiv.org/pdf/2201.03545,"The ""Roaring 20s"" of visual recognition began ...",A ConvNet for the 2020s
1,1,"is translation equivariance, which is a desir...","[Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christop...",http://arxiv.org/pdf/2201.03545,"The ""Roaring 20s"" of visual recognition began ...",A ConvNet for the 2020s
2,2,a quadratic complexity with respect to the in...,"[Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christop...",http://arxiv.org/pdf/2201.03545,"The ""Roaring 20s"" of visual recognition began ...",A ConvNet for the 2020s
3,3,cantly in the training procedure and macro/mic...,"[Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christop...",http://arxiv.org/pdf/2201.03545,"The ""Roaring 20s"" of visual recognition began ...",A ConvNet for the 2020s
4,4,B regime which has FLOPs around 15.0 × 109. F...,"[Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christop...",http://arxiv.org/pdf/2201.03545,"The ""Roaring 20s"" of visual recognition began ...",A ConvNet for the 2020s


In [45]:
# Convert 'chunk' column to lower-case and strip leading/trailing whitespace
data_chunks['chunk'] = data_chunks['chunk'].str.lower().str.strip()
data_chunks['summary'] = data_chunks['summary'].str.lower().str.strip()

# Replace '<|endoftext|>' with empty string
data_chunks['chunk'] = data_chunks['chunk'].str.replace('<|endoftext|>', '', regex=False)
data_chunks['summary'] = data_chunks['summary'].str.replace('<|endoftext|>', '', regex=False)

# Remove all newline and non-alphanumeric characters from the 'chunk' and 'summary' columns
data_chunks['chunk'] = data_chunks['chunk'].apply(lambda x: re.sub('\W+',' ', x))
data_chunks['chunk'] = data_chunks['chunk'].apply(lambda x: x.replace('\n', ' '))
data_chunks['summary'] = data_chunks['summary'].apply(lambda x: re.sub('\W+',' ', x))
data_chunks['summary'] = data_chunks['summary'].apply(lambda x: x.replace('\n', ' '))

In [46]:
data_docs = DataFrameLoader(data_chunks, page_content_column='chunk')
docs = data_docs.load()

In [47]:
len(docs)

2247

In [48]:
docs_batch1 = docs[:1000]
docs_batch2 = docs[1000:2000]
docs_batch3 = docs[2000:]

In [49]:
# Create vector store
vectorstore = FAISS.from_documents(docs_batch1, embed)

In [50]:
# Add to vector store (work around from OpenAI rate limit error)
vectorstore.add_documents(docs_batch2)

['6185eab9-de3b-4f7f-a781-b047c0b56f22',
 'ae80b088-da93-49a1-b3ba-6f45804ff14a',
 'de5135d5-1312-400a-931e-72453045c3c1',
 '84fb2b09-7588-4b6f-b735-9a8f0efff4da',
 '7326ad56-2417-4f43-8489-bb24ded45db6',
 '0dad5b28-e412-48e6-af1b-f6f6bfc5966d',
 'fc35f731-3099-4809-80a7-250ac07569a2',
 '343952d6-cafc-4309-bb06-1170ab61e06d',
 'e66747d9-438a-424f-975c-e42ec99b85fc',
 'e3d04bda-19b9-4556-af1f-1ed9419064ea',
 'cd19f9da-6677-4ef3-a9ee-5a84125de2d9',
 '6e24ee46-c3d4-460d-b157-cbe71cbbaf74',
 'f01b7212-c66d-429d-9347-36f53ef9b68c',
 '3033d253-8e80-4514-b2ce-88fc75218e7d',
 'b4d2a474-6911-47af-bbce-39372e503695',
 '5c88a530-1942-4c8b-9964-97083ed190eb',
 '33304104-8060-48d0-a825-5a6f9a8f8ef5',
 '31e73204-4d8d-490c-a968-eaea74ea34e7',
 '74e5e2e8-de42-4b8e-ab90-249fe0167de8',
 '9c02b640-11b1-4e55-95e9-8c624f8ce3d0',
 'ed7eaf10-d54b-4c51-a3ef-3cea4cc4f0e5',
 '941cf26a-dcc7-4b97-b0eb-182a109478d4',
 '233b5227-8bd2-4f05-bf47-a43ab87fdd9b',
 'db19b477-6203-40c3-8c04-a5625dfbe0bb',
 'd5b72f0c-4c46-

In [51]:
vectorstore.add_documents(docs_batch3)

['bb729145-a6c4-488b-a413-fb481d80b0a4',
 'c0420598-e072-4d13-b3e7-1e2c9840b1cd',
 '136d76e8-5721-4f8c-b1f1-2f129fb43522',
 '13db155b-694d-49e8-b129-9ef903d8cbd3',
 '02ff4c76-b4bd-4d6d-9804-d9ce5a7f0114',
 'eff7679c-92be-453f-97b8-a58bb8c14201',
 'a187a3fd-e077-4903-96a5-a56b3094f0bc',
 'e01043b8-174e-4130-ae71-f26dffad4b9e',
 '4bfbc825-ccd6-4798-9d7d-f0961ff127e6',
 '45aef774-2670-445e-9c36-45ad7ceb1640',
 '1200f505-83d0-483b-a5ef-1a64c1f72067',
 'ecdd6ed2-9a1d-4ccb-9d96-a50a88d94e8b',
 'a269a328-f5ea-48ff-a294-612a8f64c04a',
 'fe7e5cbf-4043-4411-9c05-58b0516f263e',
 'f63470d3-2958-4de9-a6b7-86881ab89aed',
 '40a30f03-921e-4e3e-b569-a0710614abaa',
 '9b48bd06-37bb-4fad-acac-d93de5dcf51d',
 'dbd94de8-d41f-4f63-9b3d-6907d949d531',
 'e0cd79e8-7871-45fa-9c16-660c8e5b1938',
 'b73f9c5e-f27c-4b16-b17e-7a90928f854d',
 'f92b62ff-b859-4803-8da6-4180761cf4ea',
 'c92f44a1-1690-410b-a489-735fa5cf0299',
 'fb1ccaef-1ca1-40e4-9b07-f0e2839bbc82',
 'af356117-5470-4bd1-981f-8b79bc75fb80',
 '3980a800-1ab2-

In [52]:
len(vectorstore.index_to_docstore_id)

2247

In [53]:
vectorstore.save_local("arxiv_vectorstore")