In [None]:
! pip install yfinance langchain_pinecone openai python-dotenv langchain-community sentence_transformers

In [None]:
from langchain_pinecone import PineconeVectorStore
from openai import OpenAI
import dotenv
import json
import yfinance as yf
import concurrent.futures
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
import numpy as np
import requests
import os
from dotenv import load_dotenv
load_dotenv()

In [2]:
def get_stock_info(symbol: str) -> dict:
    """
    Look up a ticker on Yahoo Finance and return a trimmed-down company profile.

    Args:
        symbol (str): The stock ticker symbol to look up.

    Returns:
        dict: Keys "Ticker", "Name", "Business Summary", "City", "State", "Country",
              "Industry", "Sector", "Market Cap" and "Volume". Fields Yahoo does not
              report fall back to 'Information not available', except
              "Business Summary", which is None when absent.
    """
    # Session carrying a Bearer token read from the YAHOO_ACCESS_TOKEN environment variable.
    session = requests.Session()
    session.headers.update({'Authorization': f'Bearer {os.getenv("YAHOO_ACCESS_TOKEN")}'})

    stock_info = yf.Ticker(symbol, session=session).info

    missing = 'Information not available'
    # (output key, yfinance key, fallback) in the order the keys appear in the result.
    field_specs = (
        ("Ticker", 'symbol', missing),
        ('Name', 'longName', missing),
        ('Business Summary', 'longBusinessSummary', None),
        ('City', 'city', missing),
        ('State', 'state', missing),
        ('Country', 'country', missing),
        ('Industry', 'industry', missing),
        ('Sector', 'sector', missing),
        ('Market Cap', 'marketCap', missing),
        ('Volume', 'volume', missing),
    )

    return {
        label: stock_info.get(source_key, fallback)
        for label, source_key, fallback in field_specs
    }
    # return stock_info

In [3]:
# SentenceTransformer instances keyed by model name, so repeated calls reuse an
# already-loaded model instead of constructing it again.
_EMBEDDING_MODEL_CACHE = {}


def get_huggingface_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2"):
    """
    Generates embeddings for the given text using a specified Hugging Face model.

    The model is constructed the first time a given model_name is requested and
    reused on subsequent calls.

    Args:
        text (str): The input text to generate embeddings for.
        model_name (str): The name of the Hugging Face model to use.
                          Defaults to "sentence-transformers/all-mpnet-base-v2".

    Returns:
        np.ndarray: The generated embeddings as a NumPy array.
    """
    model = _EMBEDDING_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODEL_CACHE[model_name] = model
    return model.encode(text)


def cosine_similarity_between_sentences(sentence1, sentence2):
    """
    Embed two sentences and report how similar they are.

    Args:
        sentence1 (str): The first sentence for similarity comparison.
        sentence2 (str): The second sentence for similarity comparison.

    Returns:
        float: The cosine similarity score between the two sentence embeddings,
               ranging from -1 (completely opposite) to 1 (identical).

    Notes:
        The score is also printed to the console, formatted to four decimals.
    """
    # Embed each sentence and shape it as a single-row matrix, since
    # cosine_similarity is called here with 2-D inputs.
    row_vectors = [
        np.array(get_huggingface_embeddings(sentence)).reshape(1, -1)
        for sentence in (sentence1, sentence2)
    ]

    # The result is a 1x1 matrix; pull out its only entry.
    similarity_score = cosine_similarity(row_vectors[0], row_vectors[1])[0][0]
    print(f"Cosine similarity between the two sentences: {similarity_score:.4f}")
    return similarity_score


# Quick demonstration on two short sentences
sentence1, sentence2 = "I like walking to the park", "I like running to the office"
similarity = cosine_similarity_between_sentences(sentence1, sentence2)

Cosine similarity between the two sentences: 0.6133


In [None]:
def get_company_tickers():
    """
    Downloads and parses the Stock ticker symbols from the GitHub-hosted SEC company tickers JSON file.

    On success the parsed content is also written to 'company_tickers.json' in the
    current working directory.

    Returns:
        dict | None: A dictionary containing company tickers and related information,
        or None when the download fails (network error or non-200 status).

    Notes:
        The data is sourced from the official SEC website via a GitHub repository:
        https://raw.githubusercontent.com/team-headstart/Financial-Analysis-and-Automation-with-LLMs/main/company_tickers.json
    """
    # URL to fetch the raw JSON file from GitHub
    url = "https://raw.githubusercontent.com/team-headstart/Financial-Analysis-and-Automation-with-LLMs/main/company_tickers.json"

    try:
        # A timeout keeps the cell from hanging indefinitely on a stalled connection.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Report network failures the same way as a bad status code: message + None.
        print(f"Failed to download file. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to download file. Status code: {response.status_code}")
        return None

    # Parse the JSON content directly
    company_tickers = json.loads(response.content.decode('utf-8'))

    # Save the content to a local file for future use
    with open("company_tickers.json", "w", encoding="utf-8") as file:
        json.dump(company_tickers, file, indent=4)

    print("File downloaded successfully and saved as 'company_tickers.json'")
    return company_tickers

# Download the ticker list once; later cells read it from this variable.
# The value is None if the download failed.
company_tickers = get_company_tickers()

In [None]:
# Preview only the first few entries; displaying all ~10k records floods the notebook output.
dict(list(company_tickers.items())[:5])

In [86]:
# Number of entries in the downloaded ticker list
len(company_tickers)

9998

In [None]:
# Pinecone settings shared by the indexing cells below.
# Environment variables were already loaded by load_dotenv() in the imports cell.

pinecone_api_key = os.getenv('PINECONE_API_KEY')

# Index and namespace that process_stock() writes stock descriptions into
index_name = "stocks"
namespace = "stock-descriptions"

# Embedding model with the library's default settings; passed to PineconeVectorStore when indexing
hf_embeddings = HuggingFaceEmbeddings()
# NOTE(review): this store is created without a namespace and is not referenced by the
# later notebook cells visible here — confirm it is still needed.
vectorstore = PineconeVectorStore(index_name=index_name, embedding=hf_embeddings)

In [None]:
# Sequential Processing
# This is a slow process, so we will use a more efficient method later

# for idx, stock in company_tickers.items():
#     stock_ticker = stock['ticker']
#     stock_data = get_stock_info(stock_ticker)
#     stock_description = stock_data['Business Summary']

#     print(f"Processing stock {idx} / {len(company_tickers)} :", stock_ticker)

#     vectorstore_from_documents = PineconeVectorStore.from_documents(
#         documents=[Document(page_content=stock_description, metadata=stock_data)],
#         embedding=hf_embeddings,
#         index_name=index_name,
#         namespace=namespace
#     )

In [None]:
# Resume support: reload the tickers recorded by earlier runs.
# To start from scratch, delete the successful_tickers.txt file.


# Tracking lists start empty and are replaced when a log file exists
successful_tickers = []
unsuccessful_tickers = []

# Read each log file, ignoring blank lines
try:
    with open('successful_tickers.txt', 'r') as f:
        successful_tickers = [entry for entry in map(str.strip, f) if entry]
except FileNotFoundError:
    print("No existing successful tickers file found")
else:
    print(f"Loaded {len(successful_tickers)} successful tickers")

try:
    with open('unsuccessful_tickers.txt', 'r') as f:
        unsuccessful_tickers = [entry for entry in map(str.strip, f) if entry]
except FileNotFoundError:
    print("No existing unsuccessful tickers file found")
else:
    print(f"Loaded {len(unsuccessful_tickers)} unsuccessful tickers")

In [None]:
# Report how many CPU cores this machine has; the count guides the parallel-processing settings.
# Rule of thumb: use all cores minus 1 as max_workers.
# The fewer cores you have, the smaller the batch size should be.
# Example: on an 8-core M1 MacBook Pro, 7 max_workers * 5 = a batch size of 35 is a good starting point.
import os
print(f"Number of CPU cores: {os.cpu_count()}")

In [None]:
# Parallel Processing
# This is a faster process, but it requires more memory

def process_stock(stock_ticker: str) -> str:
    """
    Index one ticker's business summary in Pinecone and record the outcome.

    Tickers already listed in successful_tickers are skipped. On success the ticker
    is appended to successful_tickers.txt and to the in-memory list; on any
    exception it is appended to unsuccessful_tickers.txt and its list instead.

    Args:
        stock_ticker (str): Ticker symbol to process.

    Returns:
        str: A status message describing what happened to the ticker.
    """
    if stock_ticker in successful_tickers:
        return f"Already processed {stock_ticker}"

    try:
        stock_data = get_stock_info(stock_ticker)

        # Substitute placeholder text when Yahoo supplies no summary, and keep the
        # metadata in sync with the text that gets embedded.
        summary = stock_data['Business Summary']
        if summary is None:
            summary = "No business summary available"
            stock_data['Business Summary'] = summary

        PineconeVectorStore.from_documents(
            documents=[Document(page_content=summary, metadata=stock_data)],
            embedding=hf_embeddings,
            index_name=index_name,
            namespace=namespace
        )

        # Record the success on disk first, then in memory
        with open('successful_tickers.txt', 'a') as log:
            log.write(f"{stock_ticker}\n")
        successful_tickers.append(stock_ticker)

        return f"Processed {stock_ticker} successfully"

    except Exception as e:
        # Record the failure so it can be inspected or retried later
        with open('unsuccessful_tickers.txt', 'a') as log:
            log.write(f"{stock_ticker}\n")
        unsuccessful_tickers.append(stock_ticker)

        return f"ERROR processing {stock_ticker}: {e}"

def parallel_process_stocks(tickers: list, batch_size=35, max_workers: int = 5) -> None:
    """
    Run process_stock over the tickers in batches, using a thread pool per batch.

    Args:
        tickers (list): Ticker symbols to process.
        batch_size (int): Number of tickers submitted to the pool at a time. Defaults to 35.
        max_workers (int): Maximum number of worker threads per batch. Defaults to 5.

    Returns:
        None. Each ticker's status message is printed as its task completes.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        # A zero step makes range() raise an opaque error and a negative one silently
        # processes nothing, so reject both up front with a clear message.
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    for start in range(0, len(tickers), batch_size):
        print(f"\nProcessing batch {start // batch_size + 1}")
        batch = tickers[start:start + batch_size]

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_ticker = {
                executor.submit(process_stock, ticker): ticker
                for ticker in batch
            }

            # Consume results inside the executor context so each outcome is printed
            # as soon as its task finishes, not after the whole batch has completed.
            for future in concurrent.futures.as_completed(future_to_ticker):
                ticker = future_to_ticker[future]
                try:
                    print(future.result())
                except Exception as exc:
                    print(f'ERROR processing {ticker}: {exc}')

# Collect every ticker symbol from the downloaded company list
tickers_to_process = [entry['ticker'] for entry in company_tickers.values()]

# Index them in parallel batches
parallel_process_stocks(tickers_to_process, max_workers=7)

In [None]:
# Remove tickers that were successfully processed from the unsuccessful_tickers.txt file.
# All stale entries are filtered out in a single read/rewrite of the file rather than
# re-reading and rewriting it once per recovered ticker.
succeeded = set(successful_tickers)
recovered = [ticker for ticker in unsuccessful_tickers if ticker in succeeded]
for ticker in recovered:
    print(f"Removing {ticker} from unsuccessful_tickers.txt")

if recovered:
    recovered_set = set(recovered)
    with open('unsuccessful_tickers.txt', 'r') as f:
        lines = f.readlines()
    with open('unsuccessful_tickers.txt', 'w') as f:
        f.writelines(line for line in lines if line.strip() not in recovered_set)

# Reset the in-memory lists and reload them from the (now cleaned) log files
successful_tickers = []
unsuccessful_tickers = []

try:
    with open('successful_tickers.txt', 'r') as f:
        successful_tickers = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(successful_tickers)} successful tickers")
except FileNotFoundError:
    print("No existing successful tickers file found")

try:
    with open('unsuccessful_tickers.txt', 'r') as f:
        unsuccessful_tickers = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(unsuccessful_tickers)} unsuccessful tickers")
except FileNotFoundError:
    print("No existing unsuccessful tickers file found")

In [10]:

# Create a Pinecone client from the API key in the environment
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api_key)

# Handle to the index named by index_name (assigned in an earlier cell)
pinecone_index = pc.Index(index_name)

In [11]:

# Re-read the .env file, then build an OpenAI client pointed at Groq's
# OpenAI-compatible endpoint, authenticated with GROQ_API_KEY.
load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')

client = OpenAI(
  base_url="https://api.groq.com/openai/v1",
  api_key=groq_api_key
)

In [24]:
def run_streamlit():
    """Launch the Streamlit app defined in app.py on port 8501 via a shell command."""
    command = "streamlit run app.py --server.port 8501"
    os.system(command)

In [40]:
%%writefile app.py
from langchain_pinecone import PineconeVectorStore
from openai import OpenAI
import json
import yfinance as yf
import concurrent.futures
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
import numpy as np
import requests
import os
import streamlit as st
from dotenv import load_dotenv
load_dotenv()

# API key read from the environment (populated by load_dotenv() above)
pinecone_api_key = os.getenv('PINECONE_API_KEY')

# Pinecone index name, and the namespace HandleQuery searches
index_name = "stocks"
namespace = "stock-descriptions"

# NOTE(review): hf_embeddings and vectorstore are not referenced anywhere else in this
# script — confirm they are still needed.
hf_embeddings = HuggingFaceEmbeddings()
vectorstore = PineconeVectorStore(index_name=index_name, embedding=hf_embeddings)


# Pinecone client and index handle used for similarity queries
pc = Pinecone(api_key=pinecone_api_key)

pinecone_index = pc.Index(index_name)

groq_api_key = os.getenv('GROQ_API_KEY')

# OpenAI client pointed at Groq's OpenAI-compatible endpoint
client = OpenAI(
  base_url="https://api.groq.com/openai/v1",
  api_key=groq_api_key
)

@st.cache_resource
def _load_sentence_transformer(model_name):
    """Load a SentenceTransformer once per model name and reuse it across Streamlit reruns."""
    return SentenceTransformer(model_name)


def get_huggingface_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2"):
    """
    Generate embeddings for the given text with a Hugging Face sentence-transformer model.

    The model is loaded through a cached helper so it is not rebuilt on every query.

    Args:
        text (str): The input text to embed.
        model_name (str): Name of the model to use.
                          Defaults to "sentence-transformers/all-mpnet-base-v2".

    Returns:
        The value returned by the model's encode() for the text.
    """
    return _load_sentence_transformer(model_name).encode(text)

def get_stock_info_all(symbol: str) -> dict:
    """
    Return the complete, unfiltered Yahoo Finance info dictionary for a ticker.

    Args:
        symbol (str): The stock ticker symbol to look up.

    Returns:
        dict: The `info` mapping yfinance reports for the symbol.
    """
    # Session carrying a Bearer token read from the YAHOO_ACCESS_TOKEN environment variable.
    session = requests.Session()
    session.headers.update({'Authorization': f'Bearer {os.getenv("YAHOO_ACCESS_TOKEN")}'})

    return yf.Ticker(symbol, session=session).info

def format_filter_conditions(filter_conditions):
    """
    Render a filter dictionary as a comma-separated, human-readable string.

    Args:
        filter_conditions (dict | None): Maps a field name either to a plain value or
            to a dict of {operator: operand} pairs (e.g. {"$gte": 100}).

    Returns:
        str: "" for an empty/None filter. Otherwise plain values are rendered as
        "field: value" and operator entries as "field is <phrase> operand", where
        unrecognised operators are shown verbatim.
    """
    if not filter_conditions:
        return ""

    operator_phrases = {
        "$gte": "greater than or equal to",
        "$lte": "less than or equal to",
        "$gt": "greater than",
        "$lt": "less than",
        "$eq": "equals",
        "$in": "in",
    }

    def describe(field, condition):
        # One phrase per operator for dict conditions, a single "field: value" otherwise.
        if not isinstance(condition, dict):
            return [f"{field}: {condition}"]
        return [
            f"{field} is {operator_phrases.get(op, op)} {operand}"
            for op, operand in condition.items()
        ]

    parts = []
    for field, condition in filter_conditions.items():
        parts.extend(describe(field, condition))

    return ", ".join(parts)


def HandleQuery(query, filter_conditions):
    """
    Answer a question about stocks using retrieved stock descriptions as context.

    The query is embedded, the 10 nearest records are fetched from the Pinecone
    index (restricted by filter_conditions when given), and their text is sent
    together with the question to the chat model.

    Args:
        query (str): The user's question.
        filter_conditions (dict): Metadata filter passed to the Pinecone query;
            an empty/falsy value means no filtering.

    Returns:
        The message content of the model's first completion choice.
    """
    filter_conditions_string = format_filter_conditions(filter_conditions)

    raw_query_embedding = get_huggingface_embeddings(query)

    top_matches = pinecone_index.query(
        vector=raw_query_embedding.tolist(),
        top_k=10,
        include_metadata=True,
        namespace=namespace,
        filter=filter_conditions if filter_conditions else None,
    )

    contexts = [item['metadata']['text'] for item in top_matches['matches']]

    augmented_query = (
        "<CONTEXT>\n"
        + "\n\n-------\n\n".join(contexts)
        + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n"
        + query
    )
    # Put the filter description in its own labelled section; appending it directly
    # to the question would fuse the two with no separator.
    if filter_conditions_string:
        augmented_query += "\n\nFILTERS:\n" + filter_conditions_string

    # Plain string: the prompt has no placeholders, so no f-prefix is needed.
    system_prompt = """You are an expert at providing answers about stocks. Please answer my question provided.

    When giving your response, please do not mention the context provided to you or the query.

    Please provide a detailed answer to the question.

    Please provide all of the answers that you receive from the context provided.

    Please provide the answers from most relevant to least relevant.

    Please provide the answer in a markdown format.

    Please be consistent in the markdown format for all of your answers.

    If no question is provided, please provide a list of all of the stocks that match the filters and their information.
    """

    llm_response = client.chat.completions.create(
        model="llama-3.1-70b-versatile",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": augmented_query}
        ]
    )

    return llm_response.choices[0].message.content

# ---- Streamlit page: filter inputs, question box, and the answer area ----
st.title('Stock Analysis')
st.warning("Keep in mind that more detailed your query and filters are, the more relevant and accurate the results will be.")

st.write("You can use the following filters to narrow down the results:")
st.write("Market Cap and Volume will return results that are greater than or equal to the value you enter.")
industry = st.text_input('Industry:',)
sector = st.text_input('Sector:',)
market_cap = st.number_input(
    'Market Cap:',
    min_value=0,
    max_value=1000000,
    step=1
)
volume = st.number_input(
    'Volume:',
    min_value=0,
    max_value=1000000,
    step=1
)

st.write("Ask general questions about stocks:")
query = st.text_input('Ask About Stocks:',)

# Include only the filters the user actually set. A blank text box would otherwise
# become an equality filter on "" and exclude every record; a threshold of 0 is
# treated as "no constraint".
# NOTE(review): these keys must match the metadata field names stored in the Pinecone
# index exactly (case-sensitive) — confirm against the code that populated the index.
filter_conditions = {}
if industry.strip():
    filter_conditions["industry"] = industry.strip()
if sector.strip():
    filter_conditions["sector"] = sector.strip()
if market_cap > 0:
    filter_conditions["marketCap"] = {"$gte": market_cap}
if volume > 0:
    filter_conditions["volume"] = {"$gte": volume}

if st.button('Get Stock Info'):
    st.write(f'Getting info for {query}...')
    response = HandleQuery(query, filter_conditions)

    # Show either the answer or the error, not an empty answer followed by an error.
    if response:
        st.write("### Response:")
        st.write(response)
    else:
        st.error("No information found for this query.")

    st.markdown("---")
    

Overwriting app.py
