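# How to set up this notebook

Create a virtual environment, activate it, and install the pinned dependencies into it:

```
python3 -m venv venv_embeddings_save
source venv_embeddings_save/bin/activate
pip install -r requirements_embeddings_save.txt
```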

In [1]:
import os
import json
import pickle
import time
import sys
import argparse
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index import (
    GPTVectorStoreIndex, StorageContext, ServiceContext, LangchainEmbedding)
from llama_index.vector_stores import ChromaVectorStore
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import chromadb
import pandas as pd

In [2]:
from embeddings_save import (
    save_dfs)

In [3]:
# Load the pipeline configuration (paths for targets, PDFs and embedding output).
# The path is a plain string: the original f-string had no placeholders.
config_file = "/app/config.json"
# JSON is UTF-8 by specification, so do not depend on the locale's default encoding.
with open(config_file, encoding="utf-8") as json_file:
    config_dict = json.load(json_file)

In [4]:
# Rows with era <= this year form the training pool; later eras form the test pool.
TRAIN_CUTOFF_YEAR = 2017

# Number of rows sampled from each pool.
# NOTE(review): the trailing values look like the full-run sizes — confirm.
NUM_SAMPLES_TRAIN = 10  # 1000
NUM_SAMPLES_TEST = 5  # 500

# Chunking parameters handed to ServiceContext.from_defaults when indexing a report.
CHUNK_OVERLAP = 32
CHUNK_SIZE = 512

In [5]:
# Load the targets DataFrame that make_targets.py generated.
# NOTE(review): unpickling can execute arbitrary code, so targets_df_path must
# only ever point at a file produced by this project.
targets_df_path = config_dict['targets_df_path']
df_targets = pd.read_pickle(targets_df_path)

In [6]:
# Chronological split: eras up to and including TRAIN_CUTOFF_YEAR go to the
# training pool, strictly later eras go to the test pool.
df_targets_train = \
    df_targets.loc[lambda x: x.era <= TRAIN_CUTOFF_YEAR].reset_index(drop=True)
df_targets_test = \
    df_targets.loc[lambda x: x.era > TRAIN_CUTOFF_YEAR].reset_index(drop=True)

# Fixed seed so that Restart & Run All draws the same rows every time; without
# it each run samples (and then embeds) a different set of annual reports.
SAMPLE_RANDOM_STATE = 42
df_targets_train_sampled = \
    df_targets_train.sample(
        n=NUM_SAMPLES_TRAIN, random_state=SAMPLE_RANDOM_STATE
    ).reset_index(drop=True)
df_targets_test_sampled = \
    df_targets_test.sample(
        n=NUM_SAMPLES_TEST, random_state=SAMPLE_RANDOM_STATE
    ).reset_index(drop=True)

In [7]:

# Hand both sampled frames and the loaded config to save_dfs (from embeddings_save).
# NOTE(review): presumably this persists the samples to paths read from
# config_dict — confirm against embeddings_save.save_dfs.
save_dfs(df_targets_train_sampled, df_targets_test_sampled, config_dict)

In [8]:
# Build the HuggingFace sentence-transformers embedder, then wrap it in
# LangchainEmbedding; the wrapped object is later passed as embed_model.
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2")
embedding_model = LangchainEmbedding(hf_embeddings)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
def save_index_02(embeddings_path, embedding_model,
                  symbol, ar_date, config_dict):
    """Embed one annual-report PDF and persist the vectors in a Chroma store.

    The PDF is read from
    ``<config_dict['annual_reports_pdf_save_directory']>/<symbol>/<ar_date>.pdf``
    and the store is written under ``<embeddings_path>/<symbol>/<ar_date>``.

    Args:
        embeddings_path: Root directory for the persisted Chroma stores.
        embedding_model: Embedding model passed to ServiceContext as embed_model.
        symbol: Ticker symbol; used as a sub-directory for both input and output.
        ar_date: Report date string; names the PDF file and the output directory.
        config_dict: Config mapping; must contain
            'annual_reports_pdf_save_directory'.

    Returns:
        None. The index is built only for its side effect of filling the store.

    Raises:
        Whatever SimpleDirectoryReader raises when the PDF cannot be loaded, and
        whatever chromadb / llama_index raise while creating or filling the store.
    """
    # Load the PDF BEFORE touching the output location. The caller treats an
    # existing <embeddings_path>/<symbol>/<ar_date> directory as "already
    # embedded", so a missing or unreadable PDF must fail before the store is
    # opened there (PersistentClient is expected to create that directory).
    ar_filing_path = os.path.join(
        config_dict['annual_reports_pdf_save_directory'], symbol)
    ar_fn = os.path.join(ar_filing_path, f"{ar_date}.pdf")
    documents = SimpleDirectoryReader(input_files=[ar_fn]).load_data()

    db = chromadb.PersistentClient(
        path=os.path.join(embeddings_path, symbol, ar_date))
    # NOTE(review): the collection name is the literal string "ar_date", not the
    # value of the ar_date argument — kept as-is so existing readers still match.
    chroma_collection = db.create_collection("ar_date")

    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # llm=None: only embeddings are needed here, no text generation.
    service_context = ServiceContext.from_defaults(
        llm=None,
        embed_model=embedding_model,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP)

    # The returned index object is not needed; building it fills the store.
    VectorStoreIndex.from_documents(
        documents, storage_context=storage_context,
        service_context=service_context
    )

def save_embeddings_02(df, embedding_model, save_directory, config_dict):
    """Embed every annual report listed in ``df`` that has no output directory yet.

    For each row the 'symbol' and 'report_date' columns are read, the date is
    formatted as YYYY-MM-DD, and save_index_02 is called unless
    ``<save_directory>/<symbol>/<date>`` already exists.

    Args:
        df: Frame with 'symbol' and 'report_date' columns; the index labels are
            printed as ``label + 1``, so they must support integer addition.
        embedding_model: Forwarded unchanged to save_index_02.
        save_directory: Root directory of the persisted embeddings.
        config_dict: Forwarded unchanged to save_index_02.

    Note:
        Any existing output directory counts as done, including one left behind
        by an interrupted run; delete such a directory to force a re-embed.
    """
    for row_label in df.index:
        started_at = time.time()
        row = df.loc[row_label]
        symbol, report_date = row['symbol'], row['report_date']
        ar_date = report_date.date().strftime('%Y-%m-%d')

        target_dir = os.path.join(save_directory, symbol, ar_date)
        if not os.path.exists(target_dir):
            # Only reports without an output directory are embedded and logged.
            save_index_02(
                save_directory, embedding_model, symbol, ar_date, config_dict)

            print("Completed: {}, {}, {} in {:.2f}s".format(
                row_label + 1, symbol, ar_date, time.time() - started_at))

In [10]:
# Embed the sampled training reports (slow: each PDF took minutes in the
# recorded run). Reports whose output directory already exists are skipped.
save_embeddings_02(
    df=df_targets_train_sampled,
    embedding_model=embedding_model,
    save_directory=config_dict['embeddings_for_training_directory'],
    config_dict=config_dict,
)

LLM is explicitly disabled. Using MockLLM.
Completed: 1, INTC, 2002-12-28 in 354.38s
LLM is explicitly disabled. Using MockLLM.
Completed: 2, INTC, 2003-12-27 in 259.87s
LLM is explicitly disabled. Using MockLLM.
Completed: 3, NVDA, 2005-01-30 in 195.48s
LLM is explicitly disabled. Using MockLLM.
Completed: 4, NVDA, 2007-01-28 in 232.52s
LLM is explicitly disabled. Using MockLLM.
Completed: 5, NVDA, 2006-01-29 in 204.14s
LLM is explicitly disabled. Using MockLLM.
Completed: 6, NVDA, 2002-01-27 in 137.89s
LLM is explicitly disabled. Using MockLLM.
Completed: 7, INTC, 2007-12-29 in 226.25s
LLM is explicitly disabled. Using MockLLM.
Completed: 8, NVDA, 2003-01-26 in 149.01s
LLM is explicitly disabled. Using MockLLM.
Completed: 9, INTC, 2004-12-25 in 202.72s
LLM is explicitly disabled. Using MockLLM.
Completed: 10, INTC, 2013-12-28 in 311.22s


In [11]:
# Embed the sampled test reports (slow: each PDF took minutes in the
# recorded run). Reports whose output directory already exists are skipped.
save_embeddings_02(
    df=df_targets_test_sampled,
    embedding_model=embedding_model,
    save_directory=config_dict['embeddings_for_testing_directory'],
    config_dict=config_dict,
)

LLM is explicitly disabled. Using MockLLM.
Completed: 1, NVDA, 2018-01-28 in 180.82s
LLM is explicitly disabled. Using MockLLM.
Completed: 2, INTC, 2021-12-25 in 236.55s
LLM is explicitly disabled. Using MockLLM.
Completed: 3, NVDA, 2019-01-27 in 172.84s
LLM is explicitly disabled. Using MockLLM.
Completed: 4, NVDA, 2020-01-26 in 186.80s
LLM is explicitly disabled. Using MockLLM.
Completed: 5, INTC, 2020-12-26 in 328.49s
