# Retrieval System
This notebook implements the retrieval system.

In [23]:
import ast
import os

import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class RetrievalSystem:
    """
    Similarity search over pre-computed sentence embeddings stored in a CSV file.

    The instance holds the loaded CSV (``self.data``), a SentenceTransformer
    model (``self.model``) and a spaCy pipeline (``self.nlp``) used for text
    preprocessing.
    """

    def __init__(self, path: str, model_type: str = 'bert-base-nli-mean-tokens'):
        """
        Constructor to initialize the RetrievalSystem with a CSV file.
        Args:
            path (str): The path to the CSV file to load. If the file does not
                exist, no data is loaded and ``self.data`` is ``None``; the
                instance can still be used to create the file with
                ``process_and_save_embeddings``.
            model_type (str): Name of the SentenceTransformer model to load.
        """
        self.model_type = model_type

        # A missing file is tolerated on purpose: the embeddings CSV may not
        # have been generated yet. Always define the attribute so later code
        # can report a clear error instead of failing with AttributeError.
        self.data = pd.read_csv(path) if os.path.exists(path) else None
        self.model = SentenceTransformer(self.model_type)
        self.nlp = spacy.load("en_core_web_sm")  # Load spaCy for preprocessing

    @staticmethod
    def _parse_embedding(value):
        """
        Converts one cell of the 'embedding' column into a Python list.
        Args:
            value: Either the string form of a list (as read from the CSV) or
                an already parsed value.
        Returns:
            The parsed literal for string input; any non-string value unchanged.
        Raises:
            ValueError: If a string cell is not a valid Python literal.
        """
        # Already parsed by an earlier call: keep as is so that repeated
        # queries on the same instance keep working.
        if not isinstance(value, str):
            return value
        try:
            # SECURITY: the original code used eval() here, which executes
            # arbitrary code found in the CSV. literal_eval only accepts
            # Python literals such as a list of floats.
            return ast.literal_eval(value)
        except (ValueError, SyntaxError) as err:
            raise ValueError(f"Invalid embedding value in CSV: {value[:50]!r}") from err

    def preprocess_text(self, text: str) -> str:
        """
        Preprocesses the input text by removing stop words and applying lemmatization.
        Args:
            text (str): The text to preprocess.
        Returns:
            str: The preprocessed text (lemmas joined by single spaces).
        """
        doc = self.nlp(text)
        # Remove stop words and punctuation, and apply lemmatization
        return " ".join(
            token.lemma_ for token in doc if not token.is_stop and not token.is_punct
        )

    def find_similar_entries(self, text: str, top_n: int = 5):
        """
        Embeds the input text with the loaded model, compares it with the entries
        in the CSV file, and returns the most similar entries based on cosine similarity.

        As a side effect, the 'embedding' column of ``self.data`` is replaced by
        its parsed form and a 'similarity' column is added or overwritten.
        Args:
            text (str): The input text to embed and compare.
            top_n (int): The number of most similar entries to return.
        Returns:
            tuple: ``(input_embedding, results)`` where ``input_embedding`` is the
            output of ``self.model.encode`` for the preprocessed text and
            ``results`` is a pd.DataFrame with the top-n most similar entries,
            sorted by descending similarity.
        Raises:
            ValueError: If no CSV data was loaded or it has no 'embedding' column.
        """
        # Validate the data first so that we fail before the costly encoding step
        if self.data is None:
            raise ValueError(
                "No embeddings data loaded: the CSV path given to the constructor does not exist."
            )
        if 'embedding' not in self.data.columns:
            raise ValueError("The CSV file must have an 'embedding' column.")

        # Preprocess the input text and generate its embedding
        text = self.preprocess_text(text)
        input_embedding = self.model.encode([text])

        # Convert embeddings from CSV into a list of lists (no-op once parsed)
        self.data['embedding'] = self.data['embedding'].apply(self._parse_embedding)
        embeddings = self.data['embedding'].tolist()

        # Compute cosine similarities between the query and every stored entry
        similarities = cosine_similarity(input_embedding, embeddings)[0]
        self.data['similarity'] = similarities

        # Sort by similarity and return the top N results
        return input_embedding, self.data.sort_values(by='similarity', ascending=False).head(top_n)

    def process_and_save_embeddings(self, path: str, output_path: str):
        """
        Embeds the 'business_description' column from a new CSV file, keeps only 'tickers' and 'embedding',
        and saves the results in a new CSV with 'tickers' as the index.
        Args:
            path (str): The path to the CSV file to process.
            output_path (str): The path to save the output CSV.
        Raises:
            ValueError: If the 'tickers' or 'business_description' column is missing.
        """
        # Load new data
        new_data = pd.read_csv(path)

        # Ensure required columns exist
        if 'tickers' not in new_data.columns:
            raise ValueError("The CSV file must have a 'tickers' column.")
        if 'business_description' not in new_data.columns:
            raise ValueError("The CSV file must have a 'business_description' column.")

        # Preprocess and embed the 'business_description' column
        new_data['processed_description'] = new_data['business_description'].apply(self.preprocess_text)
        new_data['embedding'] = new_data['processed_description'].apply(
            lambda x: self.model.encode([x])[0].tolist()
        )

        # Keep only 'tickers' and 'embedding', with 'tickers' as the index.
        # set_index returns a new frame here; the in-place variant on a column
        # slice can trigger pandas' SettingWithCopyWarning.
        processed_data = new_data[['tickers', 'embedding']].set_index('tickers')

        # Save the processed data
        processed_data.to_csv(output_path)


### Creation of Embedding dataset
We precompute the embeddings dataset so that the final user pipeline runs faster.

In [24]:
# Input/output locations, resolved against the current working directory
INPUT_PATH = "../Dataset/Data/normalized_real_company_stock_dataset_large.csv"
OUTPUT_PATH = "Embeddings/embeddings.csv"

# Switches selecting which steps of this cell are executed
CREATE_DATASET = False
TEST = True

if __name__ == "__main__":
    if CREATE_DATASET:
        # Build the embeddings CSV from the raw company dataset
        retrieval_system = RetrievalSystem(OUTPUT_PATH)
        retrieval_system.process_and_save_embeddings(INPUT_PATH, OUTPUT_PATH)

    if TEST:
        # A fresh instance is created so it loads whatever is stored at OUTPUT_PATH
        retrieval_system = RetrievalSystem(OUTPUT_PATH)
        # First sample query; it is overridden by the assignment right below
        idea = "Hello world program that can print hello world"
        idea = "American Assets Trust, Inc. is a full service, vertically integrated and self-administered real estate investment trust ('REIT'), headquartered in San Diego, California. The company has over 55 years of experience in acquiring, improving, developing and managing premier office, retail, and residential properties throughout the United States in some of the nation's most dynamic, high-barrier-to-entry markets primarily in Southern California, Northern California, Washington, Oregon, Texas and Hawaii. The company's office portfolio comprises approximately 4.1 million rentable square feet, and its retail portfolio comprises approximately 3.1 million rentable square feet. In addition, the company owns one mixed-use property (including approximately 94,000 rentable square feet of retail space and a 369-room all-suite hotel) and 2,110 multifamily units. In 2011, the company was formed to succeed to the real estate business of American Assets, Inc., a privately held corporation founded in 1967 and, as such, has significant experience, long-standing relationships and extensive knowledge of its core markets, submarkets and asset classes."
        # Query the ten closest entries and show the (embedding, results) pair
        result = retrieval_system.find_similar_entries(idea, top_n=10)
        print(result)

(array([[-2.36731485e-01,  1.10098708e+00, -2.49906912e-01,
        -1.00031659e-01,  1.21953046e+00, -7.87494540e-01,
        -1.74020171e-01,  7.41648078e-02,  2.57950872e-01,
        -2.96539009e-01,  8.13798830e-02,  6.56717658e-01,
         7.00928330e-01,  2.65893847e-01, -8.28124642e-01,
         5.61761022e-01,  2.65168667e-01,  4.05123621e-01,
        -3.32676619e-01, -2.96762049e-01, -4.34615791e-01,
        -2.02156082e-01, -6.79084063e-02,  7.30916262e-01,
         1.29366648e+00,  9.76140738e-01, -1.06735885e-01,
        -4.20050323e-01, -7.03854322e-01,  5.98804951e-01,
        -6.80833161e-01, -9.59067345e-02, -3.26854736e-01,
        -5.69263995e-01,  5.78484416e-01,  5.47018707e-01,
        -9.58418310e-01, -1.02933943e-01, -3.77555117e-02,
        -4.89337519e-02,  1.52033001e-01, -7.49350846e-01,
         4.65142787e-01, -3.36361855e-01, -1.29280043e+00,
        -1.91346928e-03, -5.25442839e-01,  4.11848187e-01,
         9.13666904e-01, -1.00674510e+00,  5.47371209e-