In [1]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores import FAISS
import pandas as pd

load_dotenv(find_dotenv())
openai.api_key = os.environ['OPENAI_API_KEY']

In [2]:
# Check encoding model for our llm
llm = "gpt-3.5-turbo"
tiktoken.encoding_for_model(llm)

<Encoding 'cl100k_base'>

In [3]:
# Create tokenizer based on our llm's encoding model
tokenizer = tiktoken.get_encoding('cl100k_base')
tokenizer

<Encoding 'cl100k_base'>

In [4]:
encoder_name = "cl100k_base"
text_splitter = TokenTextSplitter.from_tiktoken_encoder(
    # encoding_name=encoder_name,
    chunk_size=500,
    chunk_overlap=100,
    allowed_special={'<|endoftext|>'}
    # separators=["\n\n", "\n", " ", ""]
)

In [5]:
data = pd.read_json("arxiv_data_add_clean.json")
print(len(data))
data.head()

56


Unnamed: 0,authors,categories,comment,doi,entry_id,journal_ref,pdf_url,primary_category,published,summary,title,updated,text
0,"[{'name': 'Zhuang Liu'}, {'name': 'Hanzi Mao'}...",[cs.CV],CVPR 2022; Code: https://github.com/facebookre...,,http://arxiv.org/abs/2201.03545v2,,http://arxiv.org/pdf/2201.03545,cs.CV,1641841150000,"The ""Roaring 20s"" of visual recognition began ...",A ConvNet for the 2020s,1646233696000,"A ConvNet for the 2020s\n\nZhuang Liu1,2* Hanz..."
1,"[{'name': 'Ze Liu'}, {'name': 'Yutong Lin'}, {...","[cs.CV, cs.LG]",,,http://arxiv.org/abs/2103.14030v2,,http://arxiv.org/pdf/2103.14030,cs.CV,1616695171000,"This paper presents a new vision Transformer, ...",Swin Transformer: Hierarchical Vision Transfor...,1629218494000,Swin Transformer: Hierarchical Vision Transfor...
2,"[{'name': 'Alexey Dosovitskiy'}, {'name': 'Luc...","[cs.CV, cs.AI, cs.LG]",Fine-tuning code and pre-trained models are av...,,http://arxiv.org/abs/2010.11929v2,,http://arxiv.org/pdf/2010.11929,cs.CV,1603389359000,While the Transformer architecture has become ...,An Image is Worth 16x16 Words: Transformers fo...,1622725736000,1\n2\n0\n2\n\nn\nu\nJ\n\n3\n\n]\n\nV\nC\n.\ns\...
3,"[{'name': 'Dan Friedman'}, {'name': 'Alexander...","[cs.LG, cs.CL]","Our code, and example Transformer Programs, ar...",,http://arxiv.org/abs/2306.01128v1,,http://arxiv.org/pdf/2306.01128,cs.LG,1685651221000,Recent research in mechanistic interpretabilit...,Learning Transformer Programs,1685651221000,3\n2\n0\n2\n\nn\nu\nJ\n\n1\n\n]\n\nG\nL\n.\ns\...
4,"[{'name': 'Lingjiao Chen'}, {'name': 'Matei Za...","[cs.LG, cs.AI, cs.CL, cs.SE]",,,http://arxiv.org/abs/2305.05176v1,,http://arxiv.org/pdf/2305.05176,cs.LG,1683609062000,There is a rapidly growing number of large lan...,FrugalGPT: How to Use Large Language Models Wh...,1683609062000,3\n2\n0\n2\n\ny\na\nM\n9\n\n]\n\nG\nL\n.\ns\nc...


In [6]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from uuid import uuid4
import time
model_name = 'text-embedding-ada-002'

# embed = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
embed = OpenAIEmbeddings(
    model=model_name,
)

In [28]:
# Need to split the 'text' column into chunks for each row

batch_limit = 3
texts = []
metadatas = []
for i, text in enumerate(data['text']):
    metadata = {
        # "authors": [author['name'] for author in data.iloc[i]['authors']],
        "pdf_url": data.iloc[i]['pdf_url'],
        # "summary": data.iloc[i]['summary'],
        "title": data.iloc[i]['title']
    }
    chunks = text_splitter.split_text(
        text
    )

    chunks_metadata = [{
        "chunk_id": j, "chunk": text, **metadata
    } for j, text in enumerate(chunks)]

    texts.extend(chunks)
    metadatas.extend(chunks_metadata)
    # if i == 10:
    #     break

In [46]:
data_chunks = pd.DataFrame(metadatas)
data_chunks.head()

Unnamed: 0,chunk_id,chunk,pdf_url,title
0,0,"A ConvNet for the 2020s\n\nZhuang Liu1,2* Hanz...",http://arxiv.org/pdf/2201.03545,A ConvNet for the 2020s
1,1,\nexploration is a family of pure ConvNet mode...,http://arxiv.org/pdf/2201.03545,A ConvNet for the 2020s
2,2,"[59], ushering in a new\nera of computer visi...",http://arxiv.org/pdf/2201.03545,A ConvNet for the 2020s
3,3,", as the Transformers replaced recurrent neura...",http://arxiv.org/pdf/2201.03545,A ConvNet for the 2020s
4,4,"work in this\ndirection, demonstrating for th...",http://arxiv.org/pdf/2201.03545,A ConvNet for the 2020s


In [49]:
# Convert 'chunk' column to lower-case and strip leading/trailing whitespace
data_chunks['chunk'] = data_chunks['chunk'].str.lower().str.strip()

# Count occurrences of '<hello>' before replacement
count_before = data_chunks['chunk'].str.count('<|endoftext|>').sum()
print(f'The substring <|endoftext|> occurs {count_before} times before replacement.')

# Replace '<|endoftext|>' with empty string
data_chunks['chunk'] = data_chunks['chunk'].str.replace('<|endoftext|>', '', regex=False)

# Count occurrences of '<|endoftext|>' after replacement
count_after = data_chunks['chunk'].str.count('<|endoftext|>').sum()
print(f'The substring <|endoftext|> occurs {count_after} times after replacement.')

The substring <|endoftext|> occurs 1864 times before replacement.
The substring <|endoftext|> occurs 1864 times after replacement.


In [50]:
data_docs = DataFrameLoader(data_chunks, page_content_column='chunk')
docs = data_docs.load()

In [52]:
len(docs)

4434

In [54]:
docs_batch1 = docs[:10]
docs_batch2 = docs[10:20]
docs_batch3 = docs[2000:3000]
docs_batch4 = docs[3000:4000]
docs_batch5 = docs[4000:]

In [55]:
# Create vector store
vectorstore = FAISS.from_documents(docs_batch1, embed)

# Add to vector store (work around from OpenAI rate limit error)
vectorstore.aadd_documents(docs_batch2) # do this for all batches

In [70]:
query = "what can you tell me about GPT?"

# Similarity search
test_search = vectorstore.similarity_search_with_score(
    query,  # our search query
    k=3  # return 3 most relevant docs
)

# Extract text content and metadata from query results
first_chunk = test_search[0][0].page_content
first_metadata = test_search[0][0].metadata

In [72]:
vectorstore.save_local("test_index")

In [73]:
loaded_vectorstore = FAISS.load_local("test_index", embeddings=embed)

In [74]:
query = "what can you tell me about BERT?"

loaded_vectorstore.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)

[Document(page_content=', as the transformers replaced recurrent neural\nnetworks to become the dominant backbone architecture.\ndespite the disparity in the task of interest between language\nand vision domains, the two streams surprisingly converged\nin the year 2020, as the introduction of vision transformers\n(vit) completely altered the landscape of network architec-\nture design. except for the initial “patchify” layer, which\nsplits an image into a sequence of patches, vit introduces no\nimage-speciﬁc inductive bias and makes minimal changes\nto the original nlp transformers. one primary focus of\nvit is on the scaling behavior: with the help of larger model\nand dataset sizes, transformers can outperform standard\nresnets by a signiﬁcant margin. those results on image\nclassiﬁcation tasks are inspiring, but computer vision is not\nlimited to image classiﬁcation. as discussed previously,\nsolutions to numerous computer vision tasks in the past\ndecade depended signiﬁcantly on a 