In [1]:
from langchain.vectorstores import DeepLake
from langchain.text_splitter import CharacterTextSplitter
import deeplake
import pandas as pd
from langchain.document_loaders import DataFrameLoader

In [2]:
'''
Read the subsections CSV into the `df` dataframe used by the cells below
'''

df = pd.read_csv(filepath_or_buffer='../data/subsections.csv')

In [None]:
'''
Initialize the embedding model

The Hugging Face Hub API token is read from the HUGGINGFACEHUB_API_TOKEN
environment variable so that no credential is stored in the notebook.
'''

import os

from langchain.embeddings import HuggingFaceHubEmbeddings

# Never hardcode the token in the notebook: anything committed here is
# readable by everyone with access to the repository. Any token that was
# previously committed in this cell should be treated as leaked and revoked.
hugginfacehub_api_token = os.environ.get('HUGGINGFACEHUB_API_TOKEN')
if not hugginfacehub_api_token:
    raise RuntimeError(
        'Set the HUGGINGFACEHUB_API_TOKEN environment variable before '
        'running this cell.'
    )

# Sentence-transformers model hosted on the Hugging Face Hub.
repo_id = "sentence-transformers/all-MiniLM-L12-v2"


embedding = HuggingFaceHubEmbeddings(
    repo_id=repo_id,
    task="feature-extraction",
    huggingfacehub_api_token=hugginfacehub_api_token,
)

## Generate Embeddings

In [None]:
'''
This cell generates sentence embeddings for each of the subsections of the book.
One Deep Lake dataset is written per (chapter, section) pair under EMBEDDING_PATH.

Safe to re-run: a dataset that already exists on disk is skipped, so documents
are never appended twice. To regenerate a dataset (for example after an
interrupted run), delete its directory and run the cell again.
'''

EMBEDDING_PATH = '../bin/embeddings/'

for chapter_id in df['chapter'].drop_duplicates():
    chapter_df = df[df['chapter'] == chapter_id]
    for section_id in chapter_df['section'].drop_duplicates():
        dataset_path = f"{EMBEDDING_PATH}{chapter_id}-{section_id}"

        # DeepLake(...) loads an already existing dataset instead of
        # recreating it, so adding documents again would duplicate every
        # subsection. Skip datasets that were produced by an earlier run.
        if deeplake.exists(dataset_path):
            continue

        section_df = chapter_df[chapter_df['section'] == section_id]
        db = DeepLake(dataset_path=dataset_path, embedding_function=embedding)
        # The 'clean_text' column becomes the document text; the loader is
        # given the whole section dataframe, not just that column.
        loader = DataFrameLoader(section_df, page_content_column='clean_text')
        db.add_documents(loader.load())

## Inference

In [13]:
# Chapter/section whose dataset is queried, and the summary text to compare
# against the stored subsection embeddings.
chapter, section = 1, 1
summary = 'Economics seeks to solve the problem of scarcity, which is when human wants for goods and services exceed the available supply. A modern economy displays a division of labor, in which people earn income by specializing in what they produce and then use that income to purchase the products they need or want. The division of labor allows individuals and firms to specialize and to produce more for several reasons: a) It allows the agents to focus on areas of advantage due to natural factors and skill levels; b) It encourages the agents to learn and invent; c) It allows agents to take advantage of economies of scale. Division and specialization of labor only work when individuals can purchase what they do not produce in markets. Learning about economics helps you understand the major problems facing the world today, prepares you to be a good citizen, and helps you become a well-rounded thinker.'

# Open the dataset for this chapter/section in read-only mode.
db = DeepLake(
    dataset_path="../bin/embeddings/{}-{}".format(chapter, section),
    read_only=True,
    embedding_function=embedding,
)

# Request up to 20 (document, score) pairs using the cosine metric.
docs = db.similarity_search_with_score(
    summary,
    k=20,
    distance_metric='cos',
)

../bin/embeddings/1-1 loaded successfully.







Deep Lake Dataset in ../bin/embeddings/1-1 already exists, loading from the storage
Dataset(path='../bin/embeddings/1-1', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype     shape     dtype  compression
  -------   -------   -------   -------  ------- 
 embedding  generic  (10, 384)  float32   None   
    ids      text     (10, 1)     str     None   
 metadata    json     (10, 1)     str     None   
   text      text     (10, 1)     str     None   


In [14]:
# Report the heading and score of the first (document, score) pair in `docs`.
print(f"The most similar subsection was {docs[0][0].metadata['heading']!s}")
print(f"The similarity score was {docs[0][1]!s}")

The most similar subsection was Overview
The similarity score was 0.6686459183692932
