In [None]:
!apt-get update && apt-get install ffmpeg tmux vim -y
!pip3 install moviepy openai python-dotenv pydub pytubefix openai-whisper llama-index llama-index-llms-openai llama-index-llms-ollama llama-index-embeddings-ollama 
!pip3 install flash-attn --no-build-isolation
!curl -fsSL https://ollama.com/install.sh | sh

# !chmod +x /usr/bin/ollama
# !useradd -r -s /bin/false -m -d /usr/share/ollama ollama
# !pip3 install openpyxl sentencepiece protobuf evaluate rouge_score absl-py tensorboardX bitsandbytes peft accelerate python-dotenv dspy-ai graspologic


In [None]:
import os, pickle, gc, subprocess
from dotenv import load_dotenv
from video_transcription import split_audio_into_chunks, video_to_audio, transcribe_audio_chunks, get_transcription_model, download_video
from llama_index.core import Document
# import torch

# Load secrets from the project's .env file into the process environment.
load_dotenv("/workspace/repos/agentic-ai/.env")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
else:
    # Assigning None into os.environ raises TypeError; the key is only needed
    # when an OpenAI-hosted model is selected, so warn instead of failing here.
    print("OPENAI_API_KEY is not set; OpenAI-hosted models will be unavailable.")

# Input video: a local file path, or a YouTube URL (see the commented example).
video_path="/workspace/data/video1512218125.mp4"
# video_path="https://www.youtube.com/watch?v=Z07Ewop7rQA"
# Where the extracted audio track is written.
audio_output_path="/workspace/data/video_audio.mp3"
# Directory handed to the audio splitter and the chunk transcriber.
transcribe_output_dir="/workspace/data"
# Pickle cache of the transcription; when present, transcription is skipped.
file_save_path="/workspace/data/transcription.pkl"

In [None]:
# Transcribe the video once, then reuse the pickled result on later runs.
if not os.path.exists(file_save_path):
    print("Loading transcription model...")
    transcribe_model = get_transcription_model(open_source_model=True)

    print("Processing video...")
    if 'youtube' in video_path:
        print("   Downloading video from youtube...")
        download_video(video_url=video_path, audio_output_path=audio_output_path)
    else:
        video_to_audio(video_path=video_path, audio_output_path=audio_output_path)

    print("Splitting audio into chunks...")
    split_audio_into_chunks(audio_output_path=audio_output_path, transcribe_output_dir=transcribe_output_dir, max_chunk_size_mb=24)

    print("Transcribing audio chunks...")
    # Use the same directory variable that was passed to the splitter rather
    # than a second hard-coded copy of the path, so the two cannot drift apart.
    documents = transcribe_audio_chunks(model=transcribe_model, chunk_dir=transcribe_output_dir, file_save_path=file_save_path)

    # Drop the model reference so its memory can be reclaimed before the LLM is loaded.
    del transcribe_model
    # torch.cuda.empty_cache()
    gc.collect()
else:
    # NOTE(security): pickle.load can execute arbitrary code; only load cache
    # files that this notebook produced itself.
    with open(file_save_path, 'rb') as f:
        documents = pickle.load(f)

# The transcript may be a list of dicts with a "text" key; normalize those to
# Document objects. The emptiness guard avoids an IndexError on an empty result.
if documents and isinstance(documents[0], dict):
    documents = [Document(text=chunk["text"]) for chunk in documents]

# Whole transcript as one string, used by the direct-summarization prompt.
full_doc = " ".join(doc.text for doc in documents)


In [None]:
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.llms.openai import OpenAI as LOpenAI

# Pick the model by uncommenting exactly one of these lines.
# model_name, ctx_len = "gpt-4o", 128000
# model_name, ctx_len = "llama3.1:latest", 128000
model_name, ctx_len = "hermes3:8b", 128000

if model_name == "gpt-4o":
    llm = LOpenAI(model=model_name, max_tokens=4000)
else:
    print("Pulling Ollama model...")
    try:
        sub_out = subprocess.run(['ollama', 'pull', model_name], capture_output=True, text=True)
    except OSError as e:
        # Raised when the `ollama` executable itself cannot be launched.
        print(f"Error pulling model: could not run the `ollama` CLI. Is Ollama installed?\n{e}")
    else:
        # subprocess.run does not raise on a non-zero exit status, so a failed
        # pull has to be detected from the return code.
        if sub_out.returncode != 0:
            print(f"Error pulling model: Is the Ollama server running?\n{sub_out.stderr.strip()}")

    addtion_kwargs = {"max_new_tokens": 4000}
    # system_prompt = "You are an expert at answering questions about rules and regulations regarding Title 17—Commodity and Securities Exchanges: CHAPTER II—SECURITIES AND EXCHANGE COMMISSION. Please provide a summary of the following text, and cite any sections, rules, acts or laws (e.g. § 230.503, § 240.13a-15, Act (15 U.S.C. 781), Investment Company Act of 1940) from context that support the answer. Be detailed in your response."
    # NOTE(review): confirm that `url`, `model_type` and the "max_new_tokens"
    # option are actually recognized by the installed Ollama integration.
    llm = Ollama(model=model_name, url="http://127.0.0.1:11434", context_window=ctx_len, model_type="chat", is_function_calling_model=False, 
                request_timeout=1000.0, additional_kwargs=addtion_kwargs) # system_prompt=system_prompt
    print(llm.metadata)

# Make this LLM the global default for LlamaIndex components.
Settings.llm = llm

## Tree summarizer

In [None]:
from llama_index.core.response_synthesizers import TreeSummarize
summarizer = TreeSummarize(llm=llm, verbose=True)
# prompt_summary = "You are a professional executive of AlphaTrAI. Your job is to summarize this text in great detail from a video transcription. The summary will be distributed to investors and stakeholders, so give a lot of details and examples from the transcription."
prompt_summary = f"""You are a professional executive at AlphaTrAI. Your job is to summarize the text from a video transcription. The summary will be a memo distributed to investors and stakeholders. Be sure it the memo has the following items:
1. Extract all the names of new hires and their position, and/or new advisors mentioned in the transcription.
2. Create a section to mention the personnel new to AlphaTrAI.
3. Include other highlights and progress made by AlphaTrAI.
4. Ensure the memo and ensure it is factual, optimistic, and any values mention come directly from the text. 

The transcription is as follows:\n{full_doc}"""

response = await summarizer.aget_response(prompt_summary, [doc.text for doc in documents])

In [None]:
print(response)

## LLM direct summarization

In [None]:
# Single-call summarization: the entire transcript (`full_doc`) is interpolated
# into one prompt and sent to the LLM in one completion request, with no chunking.
# This rebinds `prompt_summary`, replacing any value an earlier cell assigned.
prompt_summary = f"""You are a professional executive at AlphaTrAI. Your job is to summarize the text from a video transcription. The summary will be a memo distributed to investors and stakeholders. Be sure it the memo has the following items:
1. Extract all the names of new hires and their position, and/or new advisors mentioned in the transcription.
2. Include other highlights and progress made by AlphaTrAI.
3. Ensure the memo is professional, fluid, factual, and optimistic. 

The transcription is as follows:\n{full_doc}"""

# NOTE(review): `max_tokens` is passed as a per-call keyword; whether the
# active LLM backend honors it is not visible from this notebook — confirm.
response = llm.complete(prompt_summary, max_tokens=5000)

In [None]:
print(response.text)

## Agentic Summary

In [None]:
!pip3 install llama-index-embeddings-huggingface llama-index-vector-stores-neo4jvector llama-index-graph-stores-neo4j
!apt install dialog apt-utils -y (done above)
!wget -O - https://debian.neo4j.com/neotechnology.gpg.key | gpg --dearmor -o /etc/apt/keyrings/neotechnology.gpg
!echo 'deb [signed-by=/etc/apt/keyrings/neotechnology.gpg] https://debian.neo4j.com stable latest' | tee -a /etc/apt/sources.list.d/neo4j.list
!apt list -a neo4j
!add-apt-repository universe -y
!apt install neo4j=1:5.22.0 -y
!echo "neo4j-enterprise neo4j/question select I ACCEPT" | debconf-set-selections
!echo "neo4j-enterprise neo4j/license note" | debconf-set-selections
!apt install openjdk-17-jre -y
!cd /var/lib/neo4j/plugins/ && wget https://github.com/neo4j/apoc/releases/download/5.22.0/apoc-5.22.0-core.jar

In [None]:
# Imported in this cell so it runs on a fresh kernel without depending on a
# later cell having been executed first.
from rag_utils import set_neo4j_password, add_lines_to_conf

# Take the password from the environment (the same NEO4J_PWD variable used when
# connecting to the graph store) instead of hard-coding a credential here.
neo4j_password = os.getenv("NEO4J_PWD")
if not neo4j_password:
    raise RuntimeError("NEO4J_PWD is not set; define it in the .env file before configuring Neo4j.")

set_neo4j_password(neo4j_password)
add_lines_to_conf()


In [None]:
from dotenv import load_dotenv
load_dotenv()

from llama_index.core.agent import ReActAgent
# from llama_index.llms.openai import OpenAI
from llama_index.core.tools import FunctionTool
from rag_utils import create_neo4j_graph_store, create_neo4j_graphrag, neo4j_query, set_neo4j_password, add_lines_to_conf
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

import nest_asyncio
nest_asyncio.apply()



In [None]:

# Flip the function-calling flag on the LLM object created in an earlier cell.
# NOTE(review): the Ollama LLM was constructed with is_function_calling_model=False;
# confirm the selected model really supports tool calls before relying on this.
llm.is_function_calling_model = True

# Hugging Face embedding model, later passed to the graph-index builder.
embed_model_name = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
print("loading embed model...")
embed_model = HuggingFaceEmbedding(model_name=embed_model_name)

# Register the embedder and chunking parameters on the global LlamaIndex Settings.
Settings.embed_model = embed_model
Settings.chunk_size = 300
Settings.chunk_overlap = 50

In [None]:
from typing import Literal
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor

# Entity labels offered to the schema-guided path extractor.
entities = Literal["PEOPLE", 
                   "PLACE"
]

# Relation labels offered to the schema-guided path extractor.
relations = Literal[
    "ROLE",
    "COMPANY"
]

# Maps an entity label to the relation labels allowed for it.
# NOTE(review): the keys here ("People", "Place") differ in case from the entity
# labels above ("PEOPLE", "PLACE"). If the extractor looks keys up
# case-sensitively, this schema may not be applied as intended — confirm.
validation_schema = {
    "People": ["ROLE"],
    "Place": ["COMPANY"],
}


In [None]:
# Chunking parameters on the global LlamaIndex Settings (same values as set in the embedding cell).
Settings.chunk_size = 300
Settings.chunk_overlap = 50

# LLM-driven triple extractor restricted to the entity/relation labels defined above.
kg_extractor = SchemaLLMPathExtractor(
    llm=llm,
    possible_entities=entities,
    possible_relations=relations,
    kg_validation_schema=validation_schema,
    strict=True,  # if false, will allow triples outside of the schema
    num_workers=4,
    max_triplets_per_chunk=10,
)

# Connect to the local Neo4j instance over Bolt; the password comes from the NEO4J_PWD environment variable.
graph_store = create_neo4j_graph_store(neo_url="bolt://localhost:7687", 
                                       password=os.getenv("NEO4J_PWD"), 
                                       config={"connection_timeout": 240, "connection_acquisition_timeout": 240, "max_connection_pool_size": 1000})
# WARNING: destructive — this Cypher statement deletes every node (and its relationships) in the database.
neo4j_query(graph_store, query="""MATCH (n) DETACH DELETE n""")


# Build the graph index from the transcript documents.
# NOTE(review): the exact return type of create_neo4j_graphrag is defined in rag_utils and not visible here.
graph_index = create_neo4j_graphrag(documents, llm, embed_model, kg_extractor, graph_store)

In [None]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata

# Tool that exposes the knowledge graph to the agent.
# The statement must NOT end with a trailing comma: that would bind a 1-tuple
# instead of the tool, and code reading `.metadata` from it would then fail.
# NOTE(review): `graph_index` is whatever create_neo4j_graphrag returned; if
# that is an index rather than a query engine, pass
# `graph_index.as_query_engine()` here instead — confirm against rag_utils.
query_engine_tools = QueryEngineTool(
    query_engine=graph_index,
    metadata=ToolMetadata(
        name="graph_tool",
        description=(
            "Useful for finding people names and roles, and the company they work for."
        ),
    ),
)


In [None]:
!pip3 install llama-agents

In [None]:
from llama_agents import (
    AgentService,
    ToolService,
    MetaServiceTool,
    ControlPlaneServer,
    SimpleMessageQueue,
    AgentOrchestrator,
)

from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.agent import ReActAgentWorker, ReActAgent



# create our multi-agent framework components
# One message queue is shared by the control plane, the tool service and the agent service.
message_queue = SimpleMessageQueue()
control_plane = ControlPlaneServer(
    message_queue=message_queue,
    orchestrator=AgentOrchestrator(llm=llm),
)

# define Tool Service
# Each entry in `tools` is expected to be a tool object exposing `.metadata.name`.
tool_service = ToolService(
    message_queue=message_queue,
    tools=[query_engine_tools],#, adding_tool],
    running=True,
    step_interval=0.5,
)

# define meta-tools here
# One MetaServiceTool is created per tool, looked up by the tool's metadata name.
# The top-level `await` relies on the notebook's running event loop.
meta_tools = [
    await MetaServiceTool.from_tool_service(
        t.metadata.name,
        message_queue=message_queue,
        tool_service=tool_service,
    )
    for t in [query_engine_tools]#, adding_tool]
]


# define Agent and agent service
# A ReAct worker is used; the function-calling worker is left commented out as an alternative.
# worker1 = FunctionCallingAgentWorker.from_tools(
worker1 = ReActAgentWorker.from_tools(
    meta_tools,
    llm=llm,
)
agent1 = worker1.as_agent()
# Register the agent on the shared queue under a service name and description.
agent_server_1 = AgentService(
    agent=agent1,
    message_queue=message_queue,
    description="Summarize a transcription as a memo for investors and stakeholders.",
    service_name="summarize_transcription",
)

In [None]:
asset_list = ["Vanguard Group",	"BlackRock",	"State Street Global",	"Fidelity Investments",	"BNY Mellon",	"Goldman Sachs Group",	"J.P. Morgan Asset & Wealth",	"Legal & General Investment",	"Wellington Mgmt.",	"Amundi",	"Prudential Financial",	"Geode Capital Mgmt.",	"PIMCO",	"Northern Trust Asset Mgmt.",	"Nuveen",	"Capital Group",	"T. Rowe Price Associates",	"AXA Investment",	"Franklin Templeton",	"Federated Hermes",	"Invescos",	"Dimensional Fund Advisors",	"MetLife Investment Mgmt.",	"Morgan Stanley Inv. Mgmt.",	"New York Life Investments",	"Schroders",	"Principal Global Investors",	"KKR",	"DWS",	"Macquarie Asset Mgmt.",	"Brookfield Asset Mgmt.",	"Allspring Global Investments",	"BNP Paribas Asset Mgmt.",	"Asset Management One",	"Mercer",	"Barings",	"Aegon Asset Mgmt.",	"AllianceBernstein",	"Neuberger Berman",	"Ares Mgmt.",	"Columbia Threadneedle",	"NISA Investment",	"Voya Investment Mgmt.",	"MassMutual",	"Aviva Investors",	"SEI Investments",	"Manulife Investment",	"SLC Management",	"Russell Investments",	"Loomis, Sayles",	"Baillie Gifford Overseas",	"Dodge & Cox",	"TCW Group",	"MFS Investment",	"RBC Global Asset Mgmt.",	"Mesirow",	"Guggenheim Investments",	"Wilmington Trust",	"WTW Investment Services",	"Conning",	"Empower Investments",	"PFM Asset Mgmt.",	"CBRE Investment Mgmt.",	"TD Global Invest. 
Solutions",	"IFM Investors",	"Arrowstreet Capital",	"Nomura Asset Mgmt.",	"Oaktree Capital",	"Payden & Rygel",	"Lazard Asset Mgmt.",	"Victory Capital",	"Artisan Partners",	"PRIMECAP",	"Man Group",	"Robeco",	"Baird Advisors",	"Hamilton Lane",	"Ninety One",	"Partners Group",	"Starwood Capital",	"AQR Capital Mgmt.",	"Acadian Asset Mgmt.",	"Prologis",	"LSV Asset Mgmt.",	"StepStone Group",	"American Century",	"Charles Schwab Investment",	"RhumbLine Advisers",	"Pathway Capital",	"Boston Partners",	"Record Currency Mgmt.",	"Income Research & Mgmt.",	"AEW Capital",	"Pictet Asset Mgmt.",	"First Sentier Investors",	"Hines",	"New England Asset Mgmt.",	"Alan Biller and Associates",	"PPM America",	"LaSalle Investment",	"GCM Grosvenor",	"PineBridge Investments",	"PNC Financial",	"CC&L Financial Group",	"Fort Washington",	"Dai-ichi Life Holdings",	"Oak Hill Advisors",	"William Blair",	"Putnam Investments",	"Ashmore Group",	"Heitman",	"Harrison Street",	"Grantham, Mayo v. Otterloo",	"PAG",	"Harris Associates",	"Adams Street Partners",	"Sterling Capital",	"GoldenTree Asset Mgmt.",	"Mondrian Investment",	"Angelo, Gordon",	"Nikko Asset Mgmt.",	"Harding Loevner",	"Brown Advisory",	"Portfolio Advisors",	"Fisher Investments",	"Cohen & Steers",	"Marathon-London",	"Harbor Capital Advisors",	"Aristotle Capital Mgmt.",	"SECOR Asset Mgmt.",	"Stockbridge Capital Group",	"PanAgora Asset Mgmt.",	"Pzena Investment",	"Causeway Capital",	"Colchester Global Investors",	"MissionSquare Investments",	"Hayfin Capital Mgmt.",	"ORIX USA",	"CIBC Asset Mgmt.",	"Los Angeles Capital",	"Shenkman Group",	"Jarislowsky Fraser",	"EARNEST Partners",	"Knights of Columbus Asset",	"Strategic Investment Group",	"Commonfund",	"Rockpoint Group",	"Hotchkis & Wiley",	"AAM",	"CIM Group*",	"Beutel, Goodman",	"Nomura Corporate Research",	"Scout Investments",	"Calamos Advisors",	"ACORE Capital",	"PCCP",	"Guardian Capital",	"DuPont Capital",	"Canyon Partners",	"Kayne Anderson Rudnick",	"Polen Capital",	"TA Realty",	
"Sustainable Growth Advisers",	"MFG Asset Mgmt.",	"Unigestion",	"Intech",	"Eagle Capital",	"Garcia Hamilton",	"Sprucegrove Investment",	"Longfellow Investment",	"Axiom Investors",	"King Street Capital",	"Wasatch Global Investors",	"Boyd Watterson",	"Champlain Investment",	"Crestline Investors",	"Callan",	"Pacific Asset Mgmt.",	"Fuller & Thaler",	"Cantillon Capital Mgmt.",	"Jacobs Levy Equity",	"Brandes Investment",	"Fayez Sarofim",	"Sit Investment",	"Cliffwater",	"Intercontinental Real Estate",	"Walton Street Capital",	"Beacon Capital",	"Rockwood Capital",	"Breckinridge Capital",	"Beach Point Capital",	"Amalgamated Bank",	"American Realty Advisors",	"Abbott Capital",	"Eagle Asset Mgmt.",	"Westfield Capital",	"Driehaus Capital",	"CenterSquare Investment",	"Segall Bryant & Hamill",	"Polaris Capital",	"Grayscale Investments",	"GW&K Investment",	"CornerStone Partners",	"Westbrook Partners",	"Bahl & Gaynor",	"Sage Advisory Services",	"Yousif Capital",	"Commerce Trust",	"Zacks Investment",	"Stone Harbor Investment",	"Brown Capital",	"Cooke & Bieler",	"L&B Realty",	"Jensen Investment",	"Burgundy Asset Mgmt.",	"Pugh Capital",	"Mill Creek Residential",	"Global Endowment Mgmt.",	"Ullico Investment",	"London Co.",	"GAMCO Investors",	"Matthews Asia",	"Capital Fund Mgmt.",	"Waterfall Asset Mgmt.",	"Westwood Global",	"Frontier Capital",	"Christian Brothers",	"Manning & Napier",	"Ariel Investments",	"Washington Capital",	"TimesSquare Capital",	"Ramirez Asset Mgmt.",	"Altrinsic Global Advisors",	"National Real Estate",	"Advent Capital",	"CS McKee",	"WEDGE Capital",	"Newfleet Asset Mgmt.",	"National Investment",	"TT International",	"Prima Capital Advisors",	"Alger",	"Silver Creek Capital",	"River Road Asset Mgmt.",	"Agincourt Capital",	"Diamond Hill Capital",	"AGF Investments",	"Sentinel Real Estate",	"Ceredex Value Advisors",	"CoreCommodity",	"LCM Partners",	"Madison Realty",	"Silvercrest Asset Mgmt.",	"White Oak Global Advisors",	"Luther King Capital",	"Equus Capital",	"Hardman 
Johnston Global",	"AFL-CIO Housing Trust",	"Corbin Capital",	"City of London",	"Spider Mgmt.",	"M3 Capital",	"Davis Advisors",	"Torchlight Investors",	"Stephens Inv. Mgmt. Group",	"Great Lakes Advisors",	"Congress Asset Mgmt.",	"Parnassus Investments",	"Dana Investment",	"Martingale Asset Mgmt.",	"Madison Investments",	"Richmond Capital",	"Camden Asset Mgmt.",	"400 Capital Mgmt.",	"Glenmede Investment",	"Lyrical Asset Mgmt.",	"Gramercy",	"D.F. Dent",	"Resource Mgmt.",	"DePrince, Race & Zollo",	"Fiduciary Mgmt./Milwaukee",	"Duff & Phelps",	"AFL-CIO Building Trust",	"Johnson Asset Mgmt.",	"LM Capital Group",	"Conestoga Capital",	"Sierra Investment",	"Baird Equity Asset Mgmt.",	"Forest Investment",	"Carmel Partners",	"Atalanta Sosnoff Capital",	"Jackson Square Partners",	"Peregrine Capital",	"Todd Asset Mgmt.",	"Hoisington Investment",	"GlobeFlex Capital",	"Kornitzer Capital",	"Patron Capital",	"Emerald Advisers",	"Mar Vista Investment",	"Stacey Braun Associates",	"Leading Edge Investment",	"Kennedy Capital",	"Security Capital Research",	"Riverbridge Partners",	"Cardinal Capital",	"Granahan Investment",	"Dolan McEniry",	"Angel Oak Capital",	"Global Forest Partners",	"Channing Capital",	"ClariVest Asset Mgmt.",	"Twin Bridge Capital",	"Evanston Capital",	"Aristotle Capital Boston",	"Systematic Financial",	"Palisade Capital",	"Algert Global",	"Hillsdale Investment",	"Prime Group",	"3650 REIT",	"GTIS Partners",	"Bivium Capital",	"Molpus Woodlands Group",	"Winthrop Capital",	"Zevenbergen Capital",	"Dalton Investments",	"Hood River Capital",	"Trillium Asset Mgmt.",	"Bowen, Hanes",	"EAM Investors",	"Verger Capital",	"GIA Partners",	"Weatherbie Capital",	"Oberweis Asset Mgmt.",	"Zazove Associates",	"Wexford Capital",	"Timberland Inv. 
Resources",	"Sawgrass Asset Mgmt.",	"SSI Investment",	"NewSouth Capital",	"Foundry Partners",	"Sound Shore Mgmt.",	"KBS",	"HS Management",	"Silvant Capital",	"Millstreet Capital",	"Redwood Investments",	"Genter Capital",	"Smith Group Asset Mgmt.",	"Sarofim Realty",	"Edgar Lomax",	"Covenant Capital Group",	"Heartland Advisors",	"Adelante Capital",	"Rice Hall James",	"Cramer Rosenthal McGlynn",	"Quest Investment",	"Penn Capital",	"Ranger Investments",	"Associated Capital Group",	"Cornerstone Investment",	"Smith Affiliated Capital",	"Logan Capital",	"Wilbanks, Smith & Thomas",	"Orleans Capital",	"Thornburg Investment",	"Karpus Investment",	"StoneRidge PMG Advisors",	"Tributary Capital",	"Mairs & Power",	"Bridgeway Capital",	"Granite Investment",	"Ativo Capital Mgmt.",	"Nicholas Investment",	"Sasco Capital",	"CS Capital",	"TWIN Capital",	"CI Inverness",	"Miller/Howard Investments",	"Belle Haven Investments",	"Montag & Caldwell",	"Anchor Capital",	"Wedgewood Partners",	"Wright Investors' Service",	"Phocas Financial",	"TSCG Investors",	"Pier Capital",	"GLOBALT",	"Van Hulzen Asset Mgmt.",	"SKBA Capital Mgmt.",	"Domain Timber Advisors",	"Speece Thorson Capital",	"Redstone Advisors",	"Aristotle Credit Partners",	"TerraCotta Group",	"Farr, Miller & Washington",	"SouthernSun Asset Mgmt.",	"Gifford Fong Associates",	"Denali Advisors",	"KDP Asset Mgmt.",	"AMI Asset Mgmt.",	"Semper Capital",	"Renaissance Investment",	"ZWJ Investment Counsel",	"Campbell Newman Asset",	"Gateway Investment",	"SMH Capital Advisors",	"Argent Capital",	"Chicago Capital",	"Osborne Partners",	"Oak Associates",	"Windham Capital",	"Bridge City",	"Strategy Asset Managers",	"Kingdon Capital",	"Glovista Investments",	"Winslow Asset Mgmt.",	"Hahn Capital",	"Affinity Investment Advisors",	"Teton Advisors",	"Abner, Herrman & Brock",	"NovaPoint Capital",	"Paradigm Capital",	"Flippin, Bruce & Porter",	"Kestrel Investment",	"Tom Johnson Investment",	"Argus Investors' Counsel",	"Branson, Fowlkes/Russell",	"Robinson 
Value Mgmt.",	"Chase Investment Counsel",	"Nicholas Co.",	"Cadinha",	"Pacific West Land",]
import os, re, time
import numpy as np
from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')
from llama_index.llms.openai import OpenAI as LOpenAI
from llama_index.retrievers.you import YouRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama

# Web-search API key; a KeyError here means YDC_API_KEY is missing from the environment.
you_api_key = os.environ["YDC_API_KEY"]

# To answer with OpenAI instead of the local model, use these lines instead:
# model_name, ctx_len = "gpt-4o-2024-08-06", 128000
# openai_key = os.getenv("OPENAI_API_KEY")
# os.environ["OPENAI_API_KEY"] = openai_key
# llm = LOpenAI(model=model_name, max_tokens=8000)
# Settings.llm = llm

model_name, ctx_len = "llama3.1:8b-instruct-q8_0", 128000
addtion_kwargs = {"max_new_tokens": 200} #, 'temperature': 0.8}
# NOTE(review): `prompt` is defined but not passed to the LLM or query engine below.
prompt = "Your job is to extract information from the given text. Be concise in your answers. Do not provide any information that is not in the text."
# NOTE(review): the kwargs are splatted directly into the constructor here,
# whereas the earlier LLM cell passes them via `additional_kwargs=` — confirm
# which form the installed Ollama integration actually honors.
llm = Ollama(model=model_name, url="http://127.0.0.1:11434", context_window=ctx_len, model_type="chat", is_function_calling_model=False, 
                request_timeout=4000.0, **addtion_kwargs) #, system_prompt=system_prompt) additional_kwargs=addtion_kwargs,

# Query engine that answers each question from the top web-search results.
retriever = YouRetriever(api_key=you_api_key, endpoint="search", num_web_results=3)
query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)

ceos=[]
coos=[]
cios=[]
presidents=[]
execs = [ceos, coos, cios, presidents]
# Column titles: the first four are executive roles queried in a loop; the
# remaining four correspond to the follow-up questions, in the same order.
exec_strs = ["Chief Executive Officer (CEO)", "Chief Operating Officer (COO)", "Chief Investment Officer (CIO)", "President", "10-Year Performance", "Key Funds", "Assets Under Management (AUM)", "Primary Address"]
results = []


def _ask_query_engine(question):
    """Run one question through `query_engine` and return the answer text.

    A failing query (network error, rate limit, ...) is reported and recorded
    as an empty string so that one bad lookup does not abort the whole run and
    discard the rows already collected.
    """
    try:
        return query_engine.query(question).response
    except Exception as err:
        print(f"Query failed ({err!r}): {question.strip()}")
        return ""


start=time.time()
for i,a in enumerate(asset_list):
    # Row layout: company name, then one answer per entry of exec_strs.
    company_execs=[a]
    for exec_str in exec_strs[:4]:
        system_prompt = f"""
        Who is the current {exec_str} at {a}?
        Only return the name.
        """
        company_execs.append(_ask_query_engine(system_prompt))

    # Follow-up questions, in the same order as exec_strs[4:].
    return_prompt=f"""What is the 10-year annualized return for {a}? Return the annualized percentage in this format: "10-year return: 13.5%"."""
    funds_prompt = f"""What are the key funds for {a}? Only return the names of the funds."""
    aum_prompt = f"""What is the current Assets Under Management (AUM) for {a}? Only return the total amount in USD."""
    address_prompt = f"""What is the primary address for {a}? Only return the address."""
    for follow_up in (return_prompt, funds_prompt, aum_prompt, address_prompt):
        company_execs.append(_ask_query_engine(follow_up))

    # Progress report (elapsed minutes) every 25 companies.
    if i%25==0:
        end=time.time()
        print(f"{np.round((end-start)/60, 2)} Processed {i+1}/{len(asset_list)} companies...")

    results.append(company_execs)


In [None]:
import csv

# Destination file: one row per company.
filename = '/workspace/data/asset_execs2.csv'
# Column titles: the company name followed by one column per question asked.
header = ["Company Name"]+exec_strs

# newline='' lets the csv module control line endings. The explicit UTF-8
# encoding keeps non-ASCII characters in web/LLM answers from failing on
# systems whose default encoding is not UTF-8.
with open(filename, 'w', newline='', encoding='utf-8') as file:
    csv.writer(file).writerows([header] + results)

print(f"Data has been written to {filename}")

In [37]:
asset_list = ["Vanguard Group",	"BlackRock",	"State Street Global",	"Fidelity Investments",	"BNY Mellon",	"Goldman Sachs Group",	"J.P. Morgan Asset & Wealth",	"Legal & General Investment",	"Wellington Mgmt.",	"Amundi",	"Prudential Financial",	"Geode Capital Mgmt.",	"PIMCO",	"Northern Trust Asset Mgmt.",	"Nuveen",	"Capital Group",	"T. Rowe Price Associates",	"AXA Investment",	"Franklin Templeton",	"Federated Hermes",	"Invescos",	"Dimensional Fund Advisors",	"MetLife Investment Mgmt.",	"Morgan Stanley Inv. Mgmt.",	"New York Life Investments",	"Schroders",	"Principal Global Investors",	"KKR",	"DWS",	"Macquarie Asset Mgmt.",	"Brookfield Asset Mgmt.",	"Allspring Global Investments",	"BNP Paribas Asset Mgmt.",	"Asset Management One",	"Mercer",	"Barings",	"Aegon Asset Mgmt.",	"AllianceBernstein",	"Neuberger Berman",	"Ares Mgmt.",	"Columbia Threadneedle",	"NISA Investment",	"Voya Investment Mgmt.",	"MassMutual",	"Aviva Investors",	"SEI Investments",	"Manulife Investment",	"SLC Management",	"Russell Investments",	"Loomis, Sayles",	"Baillie Gifford Overseas",	"Dodge & Cox",	"TCW Group",	"MFS Investment",	"RBC Global Asset Mgmt.",	"Mesirow",	"Guggenheim Investments",	"Wilmington Trust",	"WTW Investment Services",	"Conning",	"Empower Investments",	"PFM Asset Mgmt.",	"CBRE Investment Mgmt.",	"TD Global Invest. 
Solutions",	"IFM Investors",	"Arrowstreet Capital",	"Nomura Asset Mgmt.",	"Oaktree Capital",	"Payden & Rygel",	"Lazard Asset Mgmt.",	"Victory Capital",	"Artisan Partners",	"PRIMECAP",	"Man Group",	"Robeco",	"Baird Advisors",	"Hamilton Lane",	"Ninety One",	"Partners Group",	"Starwood Capital",	"AQR Capital Mgmt.",	"Acadian Asset Mgmt.",	"Prologis",	"LSV Asset Mgmt.",	"StepStone Group",	"American Century",	"Charles Schwab Investment",	"RhumbLine Advisers",	"Pathway Capital",	"Boston Partners",	"Record Currency Mgmt.",	"Income Research & Mgmt.",	"AEW Capital",	"Pictet Asset Mgmt.",	"First Sentier Investors",	"Hines",	"New England Asset Mgmt.",	"Alan Biller and Associates",	"PPM America",	"LaSalle Investment",	"GCM Grosvenor",	"PineBridge Investments",	"PNC Financial",	"CC&L Financial Group",	"Fort Washington",	"Dai-ichi Life Holdings",	"Oak Hill Advisors",	"William Blair",	"Putnam Investments",	"Ashmore Group",	"Heitman",	"Harrison Street",	"Grantham, Mayo v. Otterloo",	"PAG",	"Harris Associates",	"Adams Street Partners",	"Sterling Capital",	"GoldenTree Asset Mgmt.",	"Mondrian Investment",	"Angelo, Gordon",	"Nikko Asset Mgmt.",	"Harding Loevner",	"Brown Advisory",	"Portfolio Advisors",	"Fisher Investments",	"Cohen & Steers",	"Marathon-London",	"Harbor Capital Advisors",	"Aristotle Capital Mgmt.",	"SECOR Asset Mgmt.",	"Stockbridge Capital Group",	"PanAgora Asset Mgmt.",	"Pzena Investment",	"Causeway Capital",	"Colchester Global Investors",	"MissionSquare Investments",	"Hayfin Capital Mgmt.",	"ORIX USA",	"CIBC Asset Mgmt.",	"Los Angeles Capital",	"Shenkman Group",	"Jarislowsky Fraser",	"EARNEST Partners",	"Knights of Columbus Asset",	"Strategic Investment Group",	"Commonfund",	"Rockpoint Group",	"Hotchkis & Wiley",	"AAM",	"CIM Group*",	"Beutel, Goodman",	"Nomura Corporate Research",	"Scout Investments",	"Calamos Advisors",	"ACORE Capital",	"PCCP",	"Guardian Capital",	"DuPont Capital",	"Canyon Partners",	"Kayne Anderson Rudnick",	"Polen Capital",	"TA Realty",	
"Sustainable Growth Advisers",	"MFG Asset Mgmt.",	"Unigestion",	"Intech",	"Eagle Capital",	"Garcia Hamilton",	"Sprucegrove Investment",	"Longfellow Investment",	"Axiom Investors",	"King Street Capital",	"Wasatch Global Investors",	"Boyd Watterson",	"Champlain Investment",	"Crestline Investors",	"Callan",	"Pacific Asset Mgmt.",	"Fuller & Thaler",	"Cantillon Capital Mgmt.",	"Jacobs Levy Equity",	"Brandes Investment",	"Fayez Sarofim",	"Sit Investment",	"Cliffwater",	"Intercontinental Real Estate",	"Walton Street Capital",	"Beacon Capital",	"Rockwood Capital",	"Breckinridge Capital",	"Beach Point Capital",	"Amalgamated Bank",	"American Realty Advisors",	"Abbott Capital",	"Eagle Asset Mgmt.",	"Westfield Capital",	"Driehaus Capital",	"CenterSquare Investment",	"Segall Bryant & Hamill",	"Polaris Capital",	"Grayscale Investments",	"GW&K Investment",	"CornerStone Partners",	"Westbrook Partners",	"Bahl & Gaynor",	"Sage Advisory Services",	"Yousif Capital",	"Commerce Trust",	"Zacks Investment",	"Stone Harbor Investment",	"Brown Capital",	"Cooke & Bieler",	"L&B Realty",	"Jensen Investment",	"Burgundy Asset Mgmt.",	"Pugh Capital",	"Mill Creek Residential",	"Global Endowment Mgmt.",	"Ullico Investment",	"London Co.",	"GAMCO Investors",	"Matthews Asia",	"Capital Fund Mgmt.",	"Waterfall Asset Mgmt.",	"Westwood Global",	"Frontier Capital",	"Christian Brothers",	"Manning & Napier",	"Ariel Investments",	"Washington Capital",	"TimesSquare Capital",	"Ramirez Asset Mgmt.",	"Altrinsic Global Advisors",	"National Real Estate",	"Advent Capital",	"CS McKee",	"WEDGE Capital",	"Newfleet Asset Mgmt.",	"National Investment",	"TT International",	"Prima Capital Advisors",	"Alger",	"Silver Creek Capital",	"River Road Asset Mgmt.",	"Agincourt Capital",	"Diamond Hill Capital",	"AGF Investments",	"Sentinel Real Estate",	"Ceredex Value Advisors",	"CoreCommodity",	"LCM Partners",	"Madison Realty",	"Silvercrest Asset Mgmt.",	"White Oak Global Advisors",	"Luther King Capital",	"Equus Capital",	"Hardman 
Johnston Global",	"AFL-CIO Housing Trust",	"Corbin Capital",	"City of London",	"Spider Mgmt.",	"M3 Capital",	"Davis Advisors",	"Torchlight Investors",	"Stephens Inv. Mgmt. Group",	"Great Lakes Advisors",	"Congress Asset Mgmt.",	"Parnassus Investments",	"Dana Investment",	"Martingale Asset Mgmt.",	"Madison Investments",	"Richmond Capital",	"Camden Asset Mgmt.",	"400 Capital Mgmt.",	"Glenmede Investment",	"Lyrical Asset Mgmt.",	"Gramercy",	"D.F. Dent",	"Resource Mgmt.",	"DePrince, Race & Zollo",	"Fiduciary Mgmt./Milwaukee",	"Duff & Phelps",	"AFL-CIO Building Trust",	"Johnson Asset Mgmt.",	"LM Capital Group",	"Conestoga Capital",	"Sierra Investment",	"Baird Equity Asset Mgmt.",	"Forest Investment",	"Carmel Partners",	"Atalanta Sosnoff Capital",	"Jackson Square Partners",	"Peregrine Capital",	"Todd Asset Mgmt.",	"Hoisington Investment",	"GlobeFlex Capital",	"Kornitzer Capital",	"Patron Capital",	"Emerald Advisers",	"Mar Vista Investment",	"Stacey Braun Associates",	"Leading Edge Investment",	"Kennedy Capital",	"Security Capital Research",	"Riverbridge Partners",	"Cardinal Capital",	"Granahan Investment",	"Dolan McEniry",	"Angel Oak Capital",	"Global Forest Partners",	"Channing Capital",	"ClariVest Asset Mgmt.",	"Twin Bridge Capital",	"Evanston Capital",	"Aristotle Capital Boston",	"Systematic Financial",	"Palisade Capital",	"Algert Global",	"Hillsdale Investment",	"Prime Group",	"3650 REIT",	"GTIS Partners",	"Bivium Capital",	"Molpus Woodlands Group",	"Winthrop Capital",	"Zevenbergen Capital",	"Dalton Investments",	"Hood River Capital",	"Trillium Asset Mgmt.",	"Bowen, Hanes",	"EAM Investors",	"Verger Capital",	"GIA Partners",	"Weatherbie Capital",	"Oberweis Asset Mgmt.",	"Zazove Associates",	"Wexford Capital",	"Timberland Inv. 
Resources",	"Sawgrass Asset Mgmt.",	"SSI Investment",	"NewSouth Capital",	"Foundry Partners",	"Sound Shore Mgmt.",	"KBS",	"HS Management",	"Silvant Capital",	"Millstreet Capital",	"Redwood Investments",	"Genter Capital",	"Smith Group Asset Mgmt.",	"Sarofim Realty",	"Edgar Lomax",	"Covenant Capital Group",	"Heartland Advisors",	"Adelante Capital",	"Rice Hall James",	"Cramer Rosenthal McGlynn",	"Quest Investment",	"Penn Capital",	"Ranger Investments",	"Associated Capital Group",	"Cornerstone Investment",	"Smith Affiliated Capital",	"Logan Capital",	"Wilbanks, Smith & Thomas",	"Orleans Capital",	"Thornburg Investment",	"Karpus Investment",	"StoneRidge PMG Advisors",	"Tributary Capital",	"Mairs & Power",	"Bridgeway Capital",	"Granite Investment",	"Ativo Capital Mgmt.",	"Nicholas Investment",	"Sasco Capital",	"CS Capital",	"TWIN Capital",	"CI Inverness",	"Miller/Howard Investments",	"Belle Haven Investments",	"Montag & Caldwell",	"Anchor Capital",	"Wedgewood Partners",	"Wright Investors' Service",	"Phocas Financial",	"TSCG Investors",	"Pier Capital",	"GLOBALT",	"Van Hulzen Asset Mgmt.",	"SKBA Capital Mgmt.",	"Domain Timber Advisors",	"Speece Thorson Capital",	"Redstone Advisors",	"Aristotle Credit Partners",	"TerraCotta Group",	"Farr, Miller & Washington",	"SouthernSun Asset Mgmt.",	"Gifford Fong Associates",	"Denali Advisors",	"KDP Asset Mgmt.",	"AMI Asset Mgmt.",	"Semper Capital",	"Renaissance Investment",	"ZWJ Investment Counsel",	"Campbell Newman Asset",	"Gateway Investment",	"SMH Capital Advisors",	"Argent Capital",	"Chicago Capital",	"Osborne Partners",	"Oak Associates",	"Windham Capital",	"Bridge City",	"Strategy Asset Managers",	"Kingdon Capital",	"Glovista Investments",	"Winslow Asset Mgmt.",	"Hahn Capital",	"Affinity Investment Advisors",	"Teton Advisors",	"Abner, Herrman & Brock",	"NovaPoint Capital",	"Paradigm Capital",	"Flippin, Bruce & Porter",	"Kestrel Investment",	"Tom Johnson Investment",	"Argus Investors' Counsel",	"Branson, Fowlkes/Russell",	"Robinson 
Value Mgmt.",	"Chase Investment Counsel",	"Nicholas Co.",	"Cadinha",	"Pacific West Land",]
import os, re
from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')
from llama_index.llms.openai import OpenAI as LOpenAI
from llama_index.retrievers.you import YouRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama

# --- Credentials ------------------------------------------------------------
# Both keys are expected to come from the .env file loaded above.
you_api_key = os.environ["YDC_API_KEY"]  # KeyError here means the You.com key is missing

openai_key = os.getenv("OPENAI_API_KEY")
if openai_key is not None:
    # os.environ only accepts strings; assigning None would raise TypeError.
    os.environ["OPENAI_API_KEY"] = openai_key

# --- LLM --------------------------------------------------------------------
# The original cell first set model_name to "gpt-4o-2024-08-06" and then
# immediately overwrote it; only the effective (local Ollama) value is kept.
model_name, ctx_len = "llama3.1:8b-instruct-q8_0", 128000
addtion_kwargs = {"max_new_tokens": 8000, 'temperature': 0.9}
# The LlamaIndex Ollama wrapper names the server address `base_url` (not `url`),
# takes `temperature` as its own field, and forwards `additional_kwargs` to the
# Ollama server as model options, where the generation cap is `num_predict`.
# The previous `url=`, `model_type=` and `max_new_tokens=` keywords are not
# fields of that class.
llm = Ollama(
    model=model_name,
    base_url="http://127.0.0.1:11434",
    context_window=ctx_len,
    is_function_calling_model=False,
    request_timeout=4000.0,
    temperature=addtion_kwargs["temperature"],
    additional_kwargs={"num_predict": addtion_kwargs["max_new_tokens"]},
)
# llm = LOpenAI(model=model_name, max_tokens=8000)
# Settings.llm = llm

# --- Web retriever ----------------------------------------------------------
retriever = YouRetriever(api_key=you_api_key, endpoint="search", num_web_results=3)
# query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)

# --- Per-role accumulators ----------------------------------------------------
# `execs` groups the four lists in the same order as the first four role titles.
ceos=[]
coos=[]
cios=[]
presidents=[]
execs = [ceos, coos, cios, presidents]

In [None]:
# Ask the web retriever for the executives of one company and keep the text
# of every hit in `collect` for manual inspection.
collect=[]
# Titles referenced by the prompt below. This list was commented out, which
# made the cell fail with NameError on a fresh kernel.
exec_strs = ["Fund Manager", "Chief Executive Officer", "Chief Operating Officer", "Chief Investment Officer", "President", "Performance", "Key Funds", "Assets Under Management (AUM)", "Primary Address"]
a="Capital Group"
# The prompt never depended on a loop variable, so the previous four-iteration
# loop issued the identical web query four times. Query once instead.
system_prompt = f"""
    Find the names of the following executives at {a}:
    1. {exec_strs[0]}
    2. {exec_strs[1]}
    3. {exec_strs[2]}
    4. {exec_strs[3]}
    Only return the names in order.
    """
# retrieve() returns a list of scored nodes, not a query-engine response, so
# there is no `.response` attribute; collect each node's text instead.
response = retriever.retrieve(system_prompt)
collect.extend(node.get_content() for node in response)
# response = query_engine.query(system_prompt)
# print(str(response))
# response = llm.complete(system_prompt)
# response_check = llm.complete(f"Extract and only output the name in the following text.\n\n{response.text}")

In [None]:
# Install the DuckDuckGo search client, langchain-community and Playwright,
# then download Playwright's browsers and their system dependencies.
!pip3 install duckduckgo-search langchain-community playwright && playwright install && playwright install-deps

In [1]:
import re
from duckduckgo_search import DDGS
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
import nest_asyncio
nest_asyncio.apply()

def get_page(urls):
    """Load the given URLs and reduce each page to its paragraph text.

    Args:
        urls: URLs handed to ``AsyncChromiumLoader``.

    Returns:
        The documents produced by ``BeautifulSoupTransformer`` when asked to
        extract ``<p>`` tags and remove ``<a>`` tags.
    """
    raw_documents = AsyncChromiumLoader(urls).load()
    return BeautifulSoupTransformer().transform_documents(
        raw_documents,
        tags_to_extract=["p"],
        remove_unwanted_tags=["a"],
    )


def truncate(text, max_words=400):
    """Return ``text`` limited to its first ``max_words`` words.

    Words are split on any whitespace and re-joined with single spaces, so
    runs of whitespace (including newlines) are collapsed as a side effect.

    Args:
        text: Input string.
        max_words: Maximum number of words to keep. Defaults to 400, the
            limit that used to be hard-coded.

    Returns:
        The space-joined leading words of ``text``.

    Raises:
        ValueError: If ``max_words`` is negative (a negative slice bound
            would silently drop words from the end instead of capping).
    """
    if max_words < 0:
        raise ValueError("max_words must be non-negative")
    return " ".join(text.split()[:max_words])

def ddg_search(query):
    """Search DuckDuckGo and return truncated text for each result page.

    Args:
        query: Search string passed to ``DDGS().text`` (at most 5 results).

    Returns:
        A list with one string per document returned by ``get_page``: the
        page content with blank-line runs collapsed, cut down by ``truncate``.
    """
    hits = DDGS().text(query, max_results=5)
    links = [hit['href'] for hit in hits]
    pages = get_page(links)
    return [truncate(re.sub("\n\n+", "\n", page.page_content)) for page in pages]

In [1]:
# Build a single prompt asking for four executives of company `a`.
# NOTE(review): this cell reads exec_strs[0..3], but no uncommented definition
# of exec_strs appears before it in the notebook -- confirm it is defined
# before running on a fresh kernel.
a="Capital Group"
system_prompt = f"""
Find the names of the following executives at {a}:
1. {exec_strs[0]}
2. {exec_strs[1]}
3. {exec_strs[2]}
4. {exec_strs[3]}
Only return the names in order.
"""

In [None]:
# Run one DuckDuckGo search with a fixed question; `results` becomes the list
# of truncated page texts returned by ddg_search (the prompt-based variant is
# kept commented out).
# results = ddg_search(system_prompt)
results = ddg_search("Who is the CEO at Capital Group?")

In [None]:
# Show the fifth page text. ddg_search requests at most 5 results, so this
# index raises IndexError whenever fewer than 5 pages came back.
print(results[4])

In [None]:
# Set up the environment: system tools (tmux, vim), the llama-index packages
# (force-reinstalled via --ignore-installed), supporting Python packages,
# flash-attn, and finally Ollama through its install script.
!apt-get update && apt-get install tmux vim -y
!pip3 install --ignore-installed llama-index llama-parse llama_deploy llama-index-llms-huggingface llama-index-embeddings-huggingface llama-index-llms-ollama llama-index-embeddings-ollama llama-index-vector-stores-neo4jvector llama-index-graph-stores-neo4j llama-index-finetuning llama-index-utils-workflow llama-index-readers-file llama-index-retrievers-you
!pip3 install sentencepiece protobuf evaluate rouge_score absl-py tensorboardX bitsandbytes peft accelerate python-dotenv graspologic fpdf2
!pip3 install flash-attn --no-build-isolation
!curl -fsSL https://ollama.com/install.sh | sh


In [None]:
# Packages behind the `fp.fp.FreeProxy` and `fake_useragent.UserAgent` imports
# used by the GoogleSearch scraper.
!pip3 install free-proxy fake-useragent

In [40]:
from bs4 import BeautifulSoup
import urllib, requests, random, time
from concurrent.futures import ThreadPoolExecutor, as_completed
from fp.fp import FreeProxy
from fake_useragent import UserAgent


# def extract_proxies(html):
#     soup = BeautifulSoup(html, 'html.parser')
#     text = soup.get_text()
    
#     # Regular expression to match IP addresses and ports
#     proxy_pattern = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}:[0-9]{1,5}\b')
#     proxies = proxy_pattern.findall(text)
    
#     return proxies

# def fetch_proxies():
#     url = "https://free-proxy-list.net/"
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     ips_ports = extract_proxies(soup.text)
#     proxies = []

#     for ip_port in ips_ports:
#         proxy = f"http://{ip_port}"
#         proxies.append(proxy)
    
#     return proxies


class GoogleSearch:
    """Run one Google query through a free proxy and download the top hits.

    All network work happens in the constructor: it fetches the Google
    results page (retrying with a new proxy on 429/400), extracts the result
    links, and downloads the first few result pages in a thread pool.

    Attributes:
        query: The raw search string.
        URL: Google search URL with the query percent-encoded.
        headers: Request headers; the User-Agent entry is replaced with a
            random one before every request.
        ua: ``UserAgent`` instance used to pick User-Agent strings.
        proxy: Proxy URL currently in use for both the search request and
            the result-page downloads.
        links: Result URLs extracted from the Google page.
        all_page_data: ``(url, page_text)`` tuples for the pages that were
            downloaded successfully.
    """

    RETRY_STATUS_CODES = (429, 400)  # responses that trigger a proxy switch
    MAX_RETRIES = 6                  # retries after the initial search request
    DEFAULT_RETRY_DELAY = 4          # seconds to wait without a usable Retry-After
    REQUEST_TIMEOUT = 30.0           # seconds; free proxies can otherwise hang forever
    MAX_PAGES = 3                    # number of result pages to download
    MAX_WORKERS = 4                  # thread-pool size for page downloads

    def __init__(self, query: str) -> None:
        # Imported here because the notebook only does `import urllib`, which
        # does not by itself guarantee that the `parse` submodule is loaded.
        from urllib.parse import quote_plus

        self.query = query
        self.URL = f"https://www.google.com/search?q={quote_plus(query)}"

        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3538.102 Safari/537.36"
        }
        self.ua = UserAgent()
        self.proxy = None
        self.links = self.get_initial_links()
        self.all_page_data = self.all_pages()

    def get_random_user_agent(self):
        """Replace the User-Agent header in place with a random one."""
        self.headers["User-Agent"] = self.ua.random
        return None

    def get_random_proxy(self):
        """Return a proxy URL obtained from ``FreeProxy``."""
        # Alternative tried earlier: FreeProxy(https=True, google=True, timeout=1.0).get(repeat=False)
        return FreeProxy(google=True, https=True, timeout=20.0).get()

    @staticmethod
    def _parse_retry_after(value, default):
        """Turn a Retry-After header value into a non-negative number of seconds.

        Retry-After may be absent or an HTTP date instead of an integer;
        both cases fall back to ``default``.
        """
        try:
            return max(0, int(value))
        except (TypeError, ValueError):
            return default

    def _proxies(self) -> dict:
        """Return the ``proxies`` mapping for requests, using the current proxy."""
        return {"https": self.proxy, "http": self.proxy}

    def _fetch_search_page(self):
        """GET the Google results page with a fresh User-Agent via the current proxy."""
        self.get_random_user_agent()
        return requests.get(
            self.URL,
            headers=self.headers,
            proxies=self._proxies(),
            timeout=self.REQUEST_TIMEOUT,
        )

    def clean_urls(self, anchors: list) -> list[str]:
        """Extract target URLs from Google result anchors.

        Args:
            anchors: Objects supporting ``anchor["href"]`` (e.g. the ``<a>``
                tags returned by ``soup.find_all``).

        Returns:
            The value of every ``url=http...`` query parameter found in the
            hrefs, in document order.
        """
        prefix = "url="
        links: list[str] = []
        for a in anchors:
            for param in a["href"].split("&"):
                if param.startswith("url=http"):
                    # Strip only the leading "url="; splitting on every
                    # "url=" would cut targets that contain it themselves
                    # (e.g. "...?redirect_url=...").
                    links.append(param[len(prefix):])
        return links

    def read_url_page(self, url: str) -> str:
        """Download ``url`` through the current proxy and return its visible text.

        Raises:
            requests.exceptions.RequestException: On connection or proxy
                failure, timeout, or a non-2xx status.
        """
        self.get_random_user_agent()
        response = requests.get(
            url,
            headers=self.headers,
            proxies=self._proxies(),
            timeout=self.REQUEST_TIMEOUT,
        )
        print("read page", response.status_code)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text(strip=True)

    def get_initial_links(self) -> list[str]:
        """
        scrape google for the query with keyword based search

        Retries with a new proxy (stored in ``self.proxy`` so later page
        downloads use it too) while Google answers 429/400, at most
        ``MAX_RETRIES`` times. Whatever response is held at the end is parsed.
        """
        self.proxy = self.get_random_proxy()
        response = self._fetch_search_page()
        print("inital links", response.status_code)

        retries = 0
        while response.status_code in self.RETRY_STATUS_CODES:
            if retries >= self.MAX_RETRIES:
                print("Too many 429s. Exiting.", response.status_code)
                break
            print(f"Got a {response.status_code}. Retrying.")
            retry_after = self._parse_retry_after(
                response.headers.get("Retry-After"), self.DEFAULT_RETRY_DELAY
            )
            print(f"Rate limited. Retrying after {retry_after} seconds.")
            time.sleep(retry_after)
            # Keep the new proxy on the instance: the old code used a local
            # variable, so page downloads kept using the rate-limited proxy.
            self.proxy = self.get_random_proxy()
            print(self.proxy)
            response = self._fetch_search_page()
            retries += 1
        else:
            print("Finally got through", response.status_code)

        soup = BeautifulSoup(response.text, "html.parser")
        anchors = soup.find_all("a", href=True)
        return self.clean_urls(anchors)

    def all_pages(self) -> list[tuple[str, str]]:
        """Download the first ``MAX_PAGES`` links concurrently.

        Returns:
            ``(url, text)`` tuples in completion order. Pages whose download
            fails are reported on stdout and skipped.
        """
        print(self.links)
        data: list[tuple[str, str]] = []
        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
            future_to_url = {
                executor.submit(self.read_url_page, url): url
                for url in self.links[: self.MAX_PAGES]
            }
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    data.append((url, future.result()))
                except requests.exceptions.RequestException as e:
                    # Covers HTTP errors as well as proxy, connection and
                    # timeout failures, so one bad page cannot abort the search.
                    print(e)

        return data

In [4]:
asset_list = ["Vanguard Group",	"BlackRock",	"State Street Global",	"Fidelity Investments",	"BNY Mellon",	"Goldman Sachs Group",	"J.P. Morgan Asset & Wealth",	"Legal & General Investment",	"Wellington Mgmt.",	"Amundi",	"Prudential Financial",	"Geode Capital Mgmt.",	"PIMCO",	"Northern Trust Asset Mgmt.",	"Nuveen",	"Capital Group",	"T. Rowe Price Associates",	"AXA Investment",	"Franklin Templeton",	"Federated Hermes",	"Invescos",	"Dimensional Fund Advisors",	"MetLife Investment Mgmt.",	"Morgan Stanley Inv. Mgmt.",	"New York Life Investments",	"Schroders",	"Principal Global Investors",	"KKR",	"DWS",	"Macquarie Asset Mgmt.",	"Brookfield Asset Mgmt.",	"Allspring Global Investments",	"BNP Paribas Asset Mgmt.",	"Asset Management One",	"Mercer",	"Barings",	"Aegon Asset Mgmt.",	"AllianceBernstein",	"Neuberger Berman",	"Ares Mgmt.",	"Columbia Threadneedle",	"NISA Investment",	"Voya Investment Mgmt.",	"MassMutual",	"Aviva Investors",	"SEI Investments",	"Manulife Investment",	"SLC Management",	"Russell Investments",	"Loomis, Sayles",	"Baillie Gifford Overseas",	"Dodge & Cox",	"TCW Group",	"MFS Investment",	"RBC Global Asset Mgmt.",	"Mesirow",	"Guggenheim Investments",	"Wilmington Trust",	"WTW Investment Services",	"Conning",	"Empower Investments",	"PFM Asset Mgmt.",	"CBRE Investment Mgmt.",	"TD Global Invest. 
Solutions",	"IFM Investors",	"Arrowstreet Capital",	"Nomura Asset Mgmt.",	"Oaktree Capital",	"Payden & Rygel",	"Lazard Asset Mgmt.",	"Victory Capital",	"Artisan Partners",	"PRIMECAP",	"Man Group",	"Robeco",	"Baird Advisors",	"Hamilton Lane",	"Ninety One",	"Partners Group",	"Starwood Capital",	"AQR Capital Mgmt.",	"Acadian Asset Mgmt.",	"Prologis",	"LSV Asset Mgmt.",	"StepStone Group",	"American Century",	"Charles Schwab Investment",	"RhumbLine Advisers",	"Pathway Capital",	"Boston Partners",	"Record Currency Mgmt.",	"Income Research & Mgmt.",	"AEW Capital",	"Pictet Asset Mgmt.",	"First Sentier Investors",	"Hines",	"New England Asset Mgmt.",	"Alan Biller and Associates",	"PPM America",	"LaSalle Investment",	"GCM Grosvenor",	"PineBridge Investments",	"PNC Financial",	"CC&L Financial Group",	"Fort Washington",	"Dai-ichi Life Holdings",	"Oak Hill Advisors",	"William Blair",	"Putnam Investments",	"Ashmore Group",	"Heitman",	"Harrison Street",	"Grantham, Mayo v. Otterloo",	"PAG",	"Harris Associates",	"Adams Street Partners",	"Sterling Capital",	"GoldenTree Asset Mgmt.",	"Mondrian Investment",	"Angelo, Gordon",	"Nikko Asset Mgmt.",	"Harding Loevner",	"Brown Advisory",	"Portfolio Advisors",	"Fisher Investments",	"Cohen & Steers",	"Marathon-London",	"Harbor Capital Advisors",	"Aristotle Capital Mgmt.",	"SECOR Asset Mgmt.",	"Stockbridge Capital Group",	"PanAgora Asset Mgmt.",	"Pzena Investment",	"Causeway Capital",	"Colchester Global Investors",	"MissionSquare Investments",	"Hayfin Capital Mgmt.",	"ORIX USA",	"CIBC Asset Mgmt.",	"Los Angeles Capital",	"Shenkman Group",	"Jarislowsky Fraser",	"EARNEST Partners",	"Knights of Columbus Asset",	"Strategic Investment Group",	"Commonfund",	"Rockpoint Group",	"Hotchkis & Wiley",	"AAM",	"CIM Group*",	"Beutel, Goodman",	"Nomura Corporate Research",	"Scout Investments",	"Calamos Advisors",	"ACORE Capital",	"PCCP",	"Guardian Capital",	"DuPont Capital",	"Canyon Partners",	"Kayne Anderson Rudnick",	"Polen Capital",	"TA Realty",	
"Sustainable Growth Advisers",	"MFG Asset Mgmt.",	"Unigestion",	"Intech",	"Eagle Capital",	"Garcia Hamilton",	"Sprucegrove Investment",	"Longfellow Investment",	"Axiom Investors",	"King Street Capital",	"Wasatch Global Investors",	"Boyd Watterson",	"Champlain Investment",	"Crestline Investors",	"Callan",	"Pacific Asset Mgmt.",	"Fuller & Thaler",	"Cantillon Capital Mgmt.",	"Jacobs Levy Equity",	"Brandes Investment",	"Fayez Sarofim",	"Sit Investment",	"Cliffwater",	"Intercontinental Real Estate",	"Walton Street Capital",	"Beacon Capital",	"Rockwood Capital",	"Breckinridge Capital",	"Beach Point Capital",	"Amalgamated Bank",	"American Realty Advisors",	"Abbott Capital",	"Eagle Asset Mgmt.",	"Westfield Capital",	"Driehaus Capital",	"CenterSquare Investment",	"Segall Bryant & Hamill",	"Polaris Capital",	"Grayscale Investments",	"GW&K Investment",	"CornerStone Partners",	"Westbrook Partners",	"Bahl & Gaynor",	"Sage Advisory Services",	"Yousif Capital",	"Commerce Trust",	"Zacks Investment",	"Stone Harbor Investment",	"Brown Capital",	"Cooke & Bieler",	"L&B Realty",	"Jensen Investment",	"Burgundy Asset Mgmt.",	"Pugh Capital",	"Mill Creek Residential",	"Global Endowment Mgmt.",	"Ullico Investment",	"London Co.",	"GAMCO Investors",	"Matthews Asia",	"Capital Fund Mgmt.",	"Waterfall Asset Mgmt.",	"Westwood Global",	"Frontier Capital",	"Christian Brothers",	"Manning & Napier",	"Ariel Investments",	"Washington Capital",	"TimesSquare Capital",	"Ramirez Asset Mgmt.",	"Altrinsic Global Advisors",	"National Real Estate",	"Advent Capital",	"CS McKee",	"WEDGE Capital",	"Newfleet Asset Mgmt.",	"National Investment",	"TT International",	"Prima Capital Advisors",	"Alger",	"Silver Creek Capital",	"River Road Asset Mgmt.",	"Agincourt Capital",	"Diamond Hill Capital",	"AGF Investments",	"Sentinel Real Estate",	"Ceredex Value Advisors",	"CoreCommodity",	"LCM Partners",	"Madison Realty",	"Silvercrest Asset Mgmt.",	"White Oak Global Advisors",	"Luther King Capital",	"Equus Capital",	"Hardman 
Johnston Global",	"AFL-CIO Housing Trust",	"Corbin Capital",	"City of London",	"Spider Mgmt.",	"M3 Capital",	"Davis Advisors",	"Torchlight Investors",	"Stephens Inv. Mgmt. Group",	"Great Lakes Advisors",	"Congress Asset Mgmt.",	"Parnassus Investments",	"Dana Investment",	"Martingale Asset Mgmt.",	"Madison Investments",	"Richmond Capital",	"Camden Asset Mgmt.",	"400 Capital Mgmt.",	"Glenmede Investment",	"Lyrical Asset Mgmt.",	"Gramercy",	"D.F. Dent",	"Resource Mgmt.",	"DePrince, Race & Zollo",	"Fiduciary Mgmt./Milwaukee",	"Duff & Phelps",	"AFL-CIO Building Trust",	"Johnson Asset Mgmt.",	"LM Capital Group",	"Conestoga Capital",	"Sierra Investment",	"Baird Equity Asset Mgmt.",	"Forest Investment",	"Carmel Partners",	"Atalanta Sosnoff Capital",	"Jackson Square Partners",	"Peregrine Capital",	"Todd Asset Mgmt.",	"Hoisington Investment",	"GlobeFlex Capital",	"Kornitzer Capital",	"Patron Capital",	"Emerald Advisers",	"Mar Vista Investment",	"Stacey Braun Associates",	"Leading Edge Investment",	"Kennedy Capital",	"Security Capital Research",	"Riverbridge Partners",	"Cardinal Capital",	"Granahan Investment",	"Dolan McEniry",	"Angel Oak Capital",	"Global Forest Partners",	"Channing Capital",	"ClariVest Asset Mgmt.",	"Twin Bridge Capital",	"Evanston Capital",	"Aristotle Capital Boston",	"Systematic Financial",	"Palisade Capital",	"Algert Global",	"Hillsdale Investment",	"Prime Group",	"3650 REIT",	"GTIS Partners",	"Bivium Capital",	"Molpus Woodlands Group",	"Winthrop Capital",	"Zevenbergen Capital",	"Dalton Investments",	"Hood River Capital",	"Trillium Asset Mgmt.",	"Bowen, Hanes",	"EAM Investors",	"Verger Capital",	"GIA Partners",	"Weatherbie Capital",	"Oberweis Asset Mgmt.",	"Zazove Associates",	"Wexford Capital",	"Timberland Inv. 
Resources",	"Sawgrass Asset Mgmt.",	"SSI Investment",	"NewSouth Capital",	"Foundry Partners",	"Sound Shore Mgmt.",	"KBS",	"HS Management",	"Silvant Capital",	"Millstreet Capital",	"Redwood Investments",	"Genter Capital",	"Smith Group Asset Mgmt.",	"Sarofim Realty",	"Edgar Lomax",	"Covenant Capital Group",	"Heartland Advisors",	"Adelante Capital",	"Rice Hall James",	"Cramer Rosenthal McGlynn",	"Quest Investment",	"Penn Capital",	"Ranger Investments",	"Associated Capital Group",	"Cornerstone Investment",	"Smith Affiliated Capital",	"Logan Capital",	"Wilbanks, Smith & Thomas",	"Orleans Capital",	"Thornburg Investment",	"Karpus Investment",	"StoneRidge PMG Advisors",	"Tributary Capital",	"Mairs & Power",	"Bridgeway Capital",	"Granite Investment",	"Ativo Capital Mgmt.",	"Nicholas Investment",	"Sasco Capital",	"CS Capital",	"TWIN Capital",	"CI Inverness",	"Miller/Howard Investments",	"Belle Haven Investments",	"Montag & Caldwell",	"Anchor Capital",	"Wedgewood Partners",	"Wright Investors' Service",	"Phocas Financial",	"TSCG Investors",	"Pier Capital",	"GLOBALT",	"Van Hulzen Asset Mgmt.",	"SKBA Capital Mgmt.",	"Domain Timber Advisors",	"Speece Thorson Capital",	"Redstone Advisors",	"Aristotle Credit Partners",	"TerraCotta Group",	"Farr, Miller & Washington",	"SouthernSun Asset Mgmt.",	"Gifford Fong Associates",	"Denali Advisors",	"KDP Asset Mgmt.",	"AMI Asset Mgmt.",	"Semper Capital",	"Renaissance Investment",	"ZWJ Investment Counsel",	"Campbell Newman Asset",	"Gateway Investment",	"SMH Capital Advisors",	"Argent Capital",	"Chicago Capital",	"Osborne Partners",	"Oak Associates",	"Windham Capital",	"Bridge City",	"Strategy Asset Managers",	"Kingdon Capital",	"Glovista Investments",	"Winslow Asset Mgmt.",	"Hahn Capital",	"Affinity Investment Advisors",	"Teton Advisors",	"Abner, Herrman & Brock",	"NovaPoint Capital",	"Paradigm Capital",	"Flippin, Bruce & Porter",	"Kestrel Investment",	"Tom Johnson Investment",	"Argus Investors' Counsel",	"Branson, Fowlkes/Russell",	"Robinson 
Value Mgmt.",	"Chase Investment Counsel",	"Nicholas Co.",	"Cadinha",	"Pacific West Land",]
import os, re
from dotenv import load_dotenv
load_dotenv('/workspace/repos/agentic-ai/.env')
from llama_index.llms.openai import OpenAI as LOpenAI
# from llama_index.retrievers.you import YouRetriever
# from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama

# OpenAI / You.com configuration is disabled in this variant of the cell.
# model_name, ctx_len = "gpt-4o-2024-08-06", 128000
# you_api_key = os.environ["YDC_API_KEY"]

# openai_key = os.getenv("OPENAI_API_KEY")
# os.environ["OPENAI_API_KEY"] = openai_key

# --- LLM --------------------------------------------------------------------
model_name, ctx_len = "llama3.1:8b-instruct-q8_0", 128000
addtion_kwargs = {"max_new_tokens": 8000, 'temperature': 0.8}
# The LlamaIndex Ollama wrapper names the server address `base_url` (not `url`),
# takes `temperature` as its own field, and forwards `additional_kwargs` to the
# Ollama server as model options, where the generation cap is `num_predict`.
# The previous `url=`, `model_type=` and `max_new_tokens=` keywords are not
# fields of that class.
llm = Ollama(
    model=model_name,
    base_url="http://127.0.0.1:11434",
    context_window=ctx_len,
    is_function_calling_model=False,
    request_timeout=4000.0,
    temperature=addtion_kwargs["temperature"],
    additional_kwargs={"num_predict": addtion_kwargs["max_new_tokens"]},
)
# llm = LOpenAI(model=model_name, max_tokens=8000)
# Settings.llm = llm

# retriever = YouRetriever(api_key=you_api_key, endpoint="search", num_web_results=3)
# query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)

# --- Per-role accumulators ----------------------------------------------------
# `execs` groups the four lists in the same order as the first four role titles.
ceos=[]
coos=[]
cios=[]
presidents=[]
execs = [ceos, coos, cios, presidents]

In [None]:

def get_first_page_context(search_prompt):
    """Search Google for ``search_prompt`` and return the first page's text.

    Returns:
        A 2-tuple ``(context, page_data)``: the text of the first downloaded
        page ("" when nothing was downloaded) and the full list of
        ``(url, text)`` tuples from ``GoogleSearch.all_page_data``.
    """
    fetched_pages = GoogleSearch(search_prompt).all_page_data
    first_page_text = fetched_pages[0][1] if fetched_pages else ""
    return first_page_text, fetched_pages

# For one company, search the web for each fact and let the LLM extract the
# answer from the first result page. `company_execs` ends up as
# [company, CEO, COO, CIO, President, 10-year return, funds, AUM, address].
#
# get_first_page_context returns (context, page_data). The tuple used to be
# assigned to `context` as a whole, so its repr (including every page's URL
# and text) was pasted into the prompts. It is unpacked everywhere below.
results = []
exec_strs = ["Chief Executive Officer (CEO)", "Chief Operating Officer (COO)", "Chief Investment Officer (CIO)", "President", "10-Year Performance", "Key Funds", "Assets Under Management (AUM)", "Primary Address"]


for i,a in enumerate(asset_list):
    # TODO(debug): the company is pinned to "Vanguard" and the loop stops
    # after one pass (see `break` below); remove both and re-enable the
    # `results.append` line to process the whole asset_list.
    a = "Vanguard"
    company_execs=[a]
    for exec_type, exec_str in zip(execs, exec_strs[:4]):
        search_prompt = f"""Who is the {exec_str} of {a}?"""
        context, _ = get_first_page_context(search_prompt)
        answer_prompt = f"""Given the following text, extract the name of the {exec_str}.
                            Only output the name.\n\n{context}"""
        response = llm.complete(answer_prompt)
        company_execs.append(response.text)

    return_prompt=f"""What is the 10-year annualized return at {a}?"""
    context, _ = get_first_page_context(return_prompt)
    answer_prompt = f"""Given the following text, extract the 10-year annualized return.
                        Only output the return percent.\n\n{context}"""
    company_execs.append(llm.complete(answer_prompt).text)
    funds_prompt = f"""What are the most popular funds at {a}?"""
    context, _ = get_first_page_context(funds_prompt)
    answer_prompt = f"""Given the following text, extract the fund names.
                        Only output a list of the fund names.\n\n{context}"""
    company_execs.append(llm.complete(answer_prompt).text)
    aum_prompt = f"""What is the total Assets Under Management (AUM) in USD at {a}?"""
    context, _ = get_first_page_context(aum_prompt)
    answer_prompt = f"""Given the following text, extract the AUM.
                        Only output the AUM in USD.\n\n{context}"""
    company_execs.append(llm.complete(answer_prompt).text)
    address_prompt = f"""What is the primary address at {a}?"""
    context, _ = get_first_page_context(address_prompt)
    answer_prompt = f"""Given the following text, extract the primary address.
                        Only output the address.\n\n{context}"""
    company_execs.append(llm.complete(answer_prompt).text)
    break
    # if i%25==0:
    #     print(f"Processed {i+1}/{len(asset_list)} companies...")
    # results.append(company_execs)

In [None]:
# Column titles for the per-company facts: four executive roles followed by
# performance, funds, AUM and address.
exec_strs = ["Chief Executive Officer (CEO)", "Chief Operating Officer (COO)", "Chief Investment Officer (CIO)", "President", "10-Year Performance", "Key Funds", "Assets Under Management (AUM)", "Primary Address"]

def get_first_page_context(search_prompt):
    """Search Google for ``search_prompt`` and return the first page's text.

    Returns:
        A 2-tuple ``(context, page_data)``: the text of the first downloaded
        page ("" when nothing was downloaded) and the full list of
        ``(url, text)`` tuples from ``GoogleSearch.all_page_data``.
    """
    fetched_pages = GoogleSearch(search_prompt).all_page_data
    first_page_text = fetched_pages[0][1] if fetched_pages else ""
    return first_page_text, fetched_pages

# Single-fact experiment: look up the popular funds of one company and append
# the LLM's answer to a fresh `company_execs` list. `page_data` keeps every
# downloaded (url, text) pair for inspection in later cells.
company_execs=[]
a = "Vanguard"
funds_prompt = f"""What are the most popular funds at {a}?"""
context, page_data = get_first_page_context(funds_prompt)
answer_prompt = f"""Given the following text, extract the fund names.
                    Only output a list of the fund names.\n\n{context}"""
company_execs.append(llm.complete(answer_prompt).text)

In [1]:
import requests
# Download a plain-text list of free proxies (one protocol://ip:port entry per
# line, per the query parameters) from the ProxyScrape API.
url = "https://api.proxyscrape.com/v4/free-proxy-list/get?request=display_proxies&proxy_format=protocolipport&format=text"
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3538.102 Safari/537.36'}
# requests has no default timeout; without one an unresponsive server blocks
# the kernel indefinitely.
response = requests.get(url, headers=header, timeout=30)

In [None]:
# {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'}
# http://50.207.199.81:80

In [None]:
# Print each collected value next to its column title. zip() stops at the
# shorter sequence, so titles without a value are silently omitted.
# NOTE(review): the labels only line up when company_execs starts with the
# company name followed by one value per exec_strs entry -- confirm which cell
# produced company_execs before trusting the output.
header = ["Company Name"]+exec_strs
for h,c in zip(header, company_execs):
    print(f"{h}: {c}")

In [28]:
# NOTE(review): this expects `results` to be a GoogleSearch instance, yet every
# module-level assignment to `results` visible in this notebook binds a list,
# which has no `all_page_data` attribute. Presumably it relied on leftover
# kernel state -- confirm before re-running.
page_data = results.all_page_data

In [None]:
# Number of (url, text) entries currently held in page_data.
len(page_data)

In [None]:
# Preview the first 2000 characters of up to four pages. The hard-coded
# page_data[3] raised IndexError whenever fewer than four pages were
# available; GoogleSearch.all_pages downloads at most three.
print("".join(page_text[:2000] for _, page_text in page_data[:4]))

In [48]:
# Concatenate the first 2000 characters of every downloaded page and ask the
# LLM to pull the CEO's name out of that text.
# context = page_data[0][1][:1000]+page_data[1][1][:1000]+page_data[2][1][:1000]
context = "".join(page[1][:2000] for page in page_data)
response = llm.complete(f"""Given the following text, extract the name of the CEO.
                         Only output the name.\n\n{context}""")

In [None]:
# Show the LLM completion produced by the previous cell.
print(response)

In [None]:

# Post-process the scraped rows: report how many rows contain a verbose
# answer, then shorten verbose executive names in place.
header=["Company Name"]+exec_strs

# A value longer than four words is treated as a sentence rather than a bare
# name. Count each affected row once and print the tail of the first offender.
bads=0
for r in results:
    for elem in r:
        if len(elem.split())>4:
            bads+=1
            print(" ".join(elem.split()[-3:]))
            break
print(bads)

# The loop above requires every row to be a list of strings, so the former
# `results=[r[0] for r in results]` replaced each row with just its first
# string (the company name) and broke the cleanup and the CSV export that
# follow. Rows are therefore kept as they are.
for row in results:
    # Columns 1-4 are the four executive roles; column 0 is the company name.
    # The upper bound is clamped so short rows do not raise IndexError.
    for j in range(1, min(5, len(row))):
        words = row[j].split()
        if len(words)>4:
            # Keep the last three words, e.g. "... is Jane Doe" -> "is Jane Doe",
            # and drop a leading "is" only when it is really there. The old
            # unconditional [2:] also chopped two letters off real names.
            tail = " ".join(words[-3:])
            if tail.startswith("is "):
                tail = tail[len("is "):]
            row[j] = tail.replace('"', '').strip()



In [None]:
import csv

# Export the collected rows to CSV: one header row, then one row per entry
# in `results`.
filename = '/workspace/data/asset_execs2.csv'
header = ["Company Name"]+exec_strs

# newline='' leaves line-ending handling to the csv module.
with open(filename, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(results)

print(f"Data has been written to {filename}")

In [None]:
# Spot-check the fourth entry of results.
results[3]

In [None]:
# Spot-check the first ten entries of results.
results[:10]

In [None]:
asset_list = ["Vanguard Group",	"BlackRock",	"State Street Global",	"Fidelity Investments",	"BNY Mellon",	"Goldman Sachs Group",	"J.P. Morgan Asset & Wealth",	"Legal & General Investment",	"Wellington Mgmt.",	"Amundi",	"Prudential Financial",	"Geode Capital Mgmt.",	"PIMCO",	"Northern Trust Asset Mgmt.",	"Nuveen",	"Capital Group",	"T. Rowe Price Associates",	"AXA Investment",	"Franklin Templeton",	"Federated Hermes",	"Invescos",	"Dimensional Fund Advisors",	"MetLife Investment Mgmt.",	"Morgan Stanley Inv. Mgmt.",	"New York Life Investments",	"Schroders",	"Principal Global Investors",	"KKR",	"DWS",	"Macquarie Asset Mgmt.",	"Brookfield Asset Mgmt.",	"Allspring Global Investments",	"BNP Paribas Asset Mgmt.",	"Asset Management One",	"Mercer",	"Barings",	"Aegon Asset Mgmt.",	"AllianceBernstein",	"Neuberger Berman",	"Ares Mgmt.",	"Columbia Threadneedle",	"NISA Investment",	"Voya Investment Mgmt.",	"MassMutual",	"Aviva Investors",	"SEI Investments",	"Manulife Investment",	"SLC Management",	"Russell Investments",	"Loomis, Sayles",	"Baillie Gifford Overseas",	"Dodge & Cox",	"TCW Group",	"MFS Investment",	"RBC Global Asset Mgmt.",	"Mesirow",	"Guggenheim Investments",	"Wilmington Trust",	"WTW Investment Services",	"Conning",	"Empower Investments",	"PFM Asset Mgmt.",	"CBRE Investment Mgmt.",	"TD Global Invest. 
Solutions",	"IFM Investors",	"Arrowstreet Capital",	"Nomura Asset Mgmt.",	"Oaktree Capital",	"Payden & Rygel",	"Lazard Asset Mgmt.",	"Victory Capital",	"Artisan Partners",	"PRIMECAP",	"Man Group",	"Robeco",	"Baird Advisors",	"Hamilton Lane",	"Ninety One",	"Partners Group",	"Starwood Capital",	"AQR Capital Mgmt.",	"Acadian Asset Mgmt.",	"Prologis",	"LSV Asset Mgmt.",	"StepStone Group",	"American Century",	"Charles Schwab Investment",	"RhumbLine Advisers",	"Pathway Capital",	"Boston Partners",	"Record Currency Mgmt.",	"Income Research & Mgmt.",	"AEW Capital",	"Pictet Asset Mgmt.",	"First Sentier Investors",	"Hines",	"New England Asset Mgmt.",	"Alan Biller and Associates",	"PPM America",	"LaSalle Investment",	"GCM Grosvenor",	"PineBridge Investments",	"PNC Financial",	"CC&L Financial Group",	"Fort Washington",	"Dai-ichi Life Holdings",	"Oak Hill Advisors",	"William Blair",	"Putnam Investments",	"Ashmore Group",	"Heitman",	"Harrison Street",	"Grantham, Mayo v. Otterloo",	"PAG",	"Harris Associates",	"Adams Street Partners",	"Sterling Capital",	"GoldenTree Asset Mgmt.",	"Mondrian Investment",	"Angelo, Gordon",	"Nikko Asset Mgmt.",	"Harding Loevner",	"Brown Advisory",	"Portfolio Advisors",	"Fisher Investments",	"Cohen & Steers",	"Marathon-London",	"Harbor Capital Advisors",	"Aristotle Capital Mgmt.",	"SECOR Asset Mgmt.",	"Stockbridge Capital Group",	"PanAgora Asset Mgmt.",	"Pzena Investment",	"Causeway Capital",	"Colchester Global Investors",	"MissionSquare Investments",	"Hayfin Capital Mgmt.",	"ORIX USA",	"CIBC Asset Mgmt.",	"Los Angeles Capital",	"Shenkman Group",	"Jarislowsky Fraser",	"EARNEST Partners",	"Knights of Columbus Asset",	"Strategic Investment Group",	"Commonfund",	"Rockpoint Group",	"Hotchkis & Wiley",	"AAM",	"CIM Group*",	"Beutel, Goodman",	"Nomura Corporate Research",	"Scout Investments",	"Calamos Advisors",	"ACORE Capital",	"PCCP",	"Guardian Capital",	"DuPont Capital",	"Canyon Partners",	"Kayne Anderson Rudnick",	"Polen Capital",	"TA Realty",	
"Sustainable Growth Advisers",	"MFG Asset Mgmt.",	"Unigestion",	"Intech",	"Eagle Capital",	"Garcia Hamilton",	"Sprucegrove Investment",	"Longfellow Investment",	"Axiom Investors",	"King Street Capital",	"Wasatch Global Investors",	"Boyd Watterson",	"Champlain Investment",	"Crestline Investors",	"Callan",	"Pacific Asset Mgmt.",	"Fuller & Thaler",	"Cantillon Capital Mgmt.",	"Jacobs Levy Equity",	"Brandes Investment",	"Fayez Sarofim",	"Sit Investment",	"Cliffwater",	"Intercontinental Real Estate",	"Walton Street Capital",	"Beacon Capital",	"Rockwood Capital",	"Breckinridge Capital",	"Beach Point Capital",	"Amalgamated Bank",	"American Realty Advisors",	"Abbott Capital",	"Eagle Asset Mgmt.",	"Westfield Capital",	"Driehaus Capital",	"CenterSquare Investment",	"Segall Bryant & Hamill",	"Polaris Capital",	"Grayscale Investments",	"GW&K Investment",	"CornerStone Partners",	"Westbrook Partners",	"Bahl & Gaynor",	"Sage Advisory Services",	"Yousif Capital",	"Commerce Trust",	"Zacks Investment",	"Stone Harbor Investment",	"Brown Capital",	"Cooke & Bieler",	"L&B Realty",	"Jensen Investment",	"Burgundy Asset Mgmt.",	"Pugh Capital",	"Mill Creek Residential",	"Global Endowment Mgmt.",	"Ullico Investment",	"London Co.",	"GAMCO Investors",	"Matthews Asia",	"Capital Fund Mgmt.",	"Waterfall Asset Mgmt.",	"Westwood Global",	"Frontier Capital",	"Christian Brothers",	"Manning & Napier",	"Ariel Investments",	"Washington Capital",	"TimesSquare Capital",	"Ramirez Asset Mgmt.",	"Altrinsic Global Advisors",	"National Real Estate",	"Advent Capital",	"CS McKee",	"WEDGE Capital",	"Newfleet Asset Mgmt.",	"National Investment",	"TT International",	"Prima Capital Advisors",	"Alger",	"Silver Creek Capital",	"River Road Asset Mgmt.",	"Agincourt Capital",	"Diamond Hill Capital",	"AGF Investments",	"Sentinel Real Estate",	"Ceredex Value Advisors",	"CoreCommodity",	"LCM Partners",	"Madison Realty",	"Silvercrest Asset Mgmt.",	"White Oak Global Advisors",	"Luther King Capital",	"Equus Capital",	"Hardman 
Johnston Global",	"AFL-CIO Housing Trust",	"Corbin Capital",	"City of London",	"Spider Mgmt.",	"M3 Capital",	"Davis Advisors",	"Torchlight Investors",	"Stephens Inv. Mgmt. Group",	"Great Lakes Advisors",	"Congress Asset Mgmt.",	"Parnassus Investments",	"Dana Investment",	"Martingale Asset Mgmt.",	"Madison Investments",	"Richmond Capital",	"Camden Asset Mgmt.",	"400 Capital Mgmt.",	"Glenmede Investment",	"Lyrical Asset Mgmt.",	"Gramercy",	"D.F. Dent",	"Resource Mgmt.",	"DePrince, Race & Zollo",	"Fiduciary Mgmt./Milwaukee",	"Duff & Phelps",	"AFL-CIO Building Trust",	"Johnson Asset Mgmt.",	"LM Capital Group",	"Conestoga Capital",	"Sierra Investment",	"Baird Equity Asset Mgmt.",	"Forest Investment",	"Carmel Partners",	"Atalanta Sosnoff Capital",	"Jackson Square Partners",	"Peregrine Capital",	"Todd Asset Mgmt.",	"Hoisington Investment",	"GlobeFlex Capital",	"Kornitzer Capital",	"Patron Capital",	"Emerald Advisers",	"Mar Vista Investment",	"Stacey Braun Associates",	"Leading Edge Investment",	"Kennedy Capital",	"Security Capital Research",	"Riverbridge Partners",	"Cardinal Capital",	"Granahan Investment",	"Dolan McEniry",	"Angel Oak Capital",	"Global Forest Partners",	"Channing Capital",	"ClariVest Asset Mgmt.",	"Twin Bridge Capital",	"Evanston Capital",	"Aristotle Capital Boston",	"Systematic Financial",	"Palisade Capital",	"Algert Global",	"Hillsdale Investment",	"Prime Group",	"3650 REIT",	"GTIS Partners",	"Bivium Capital",	"Molpus Woodlands Group",	"Winthrop Capital",	"Zevenbergen Capital",	"Dalton Investments",	"Hood River Capital",	"Trillium Asset Mgmt.",	"Bowen, Hanes",	"EAM Investors",	"Verger Capital",	"GIA Partners",	"Weatherbie Capital",	"Oberweis Asset Mgmt.",	"Zazove Associates",	"Wexford Capital",	"Timberland Inv. 
Resources",	"Sawgrass Asset Mgmt.",	"SSI Investment",	"NewSouth Capital",	"Foundry Partners",	"Sound Shore Mgmt.",	"KBS",	"HS Management",	"Silvant Capital",	"Millstreet Capital",	"Redwood Investments",	"Genter Capital",	"Smith Group Asset Mgmt.",	"Sarofim Realty",	"Edgar Lomax",	"Covenant Capital Group",	"Heartland Advisors",	"Adelante Capital",	"Rice Hall James",	"Cramer Rosenthal McGlynn",	"Quest Investment",	"Penn Capital",	"Ranger Investments",	"Associated Capital Group",	"Cornerstone Investment",	"Smith Affiliated Capital",	"Logan Capital",	"Wilbanks, Smith & Thomas",	"Orleans Capital",	"Thornburg Investment",	"Karpus Investment",	"StoneRidge PMG Advisors",	"Tributary Capital",	"Mairs & Power",	"Bridgeway Capital",	"Granite Investment",	"Ativo Capital Mgmt.",	"Nicholas Investment",	"Sasco Capital",	"CS Capital",	"TWIN Capital",	"CI Inverness",	"Miller/Howard Investments",	"Belle Haven Investments",	"Montag & Caldwell",	"Anchor Capital",	"Wedgewood Partners",	"Wright Investors' Service",	"Phocas Financial",	"TSCG Investors",	"Pier Capital",	"GLOBALT",	"Van Hulzen Asset Mgmt.",	"SKBA Capital Mgmt.",	"Domain Timber Advisors",	"Speece Thorson Capital",	"Redstone Advisors",	"Aristotle Credit Partners",	"TerraCotta Group",	"Farr, Miller & Washington",	"SouthernSun Asset Mgmt.",	"Gifford Fong Associates",	"Denali Advisors",	"KDP Asset Mgmt.",	"AMI Asset Mgmt.",	"Semper Capital",	"Renaissance Investment",	"ZWJ Investment Counsel",	"Campbell Newman Asset",	"Gateway Investment",	"SMH Capital Advisors",	"Argent Capital",	"Chicago Capital",	"Osborne Partners",	"Oak Associates",	"Windham Capital",	"Bridge City",	"Strategy Asset Managers",	"Kingdon Capital",	"Glovista Investments",	"Winslow Asset Mgmt.",	"Hahn Capital",	"Affinity Investment Advisors",	"Teton Advisors",	"Abner, Herrman & Brock",	"NovaPoint Capital",	"Paradigm Capital",	"Flippin, Bruce & Porter",	"Kestrel Investment",	"Tom Johnson Investment",	"Argus Investors' Counsel",	"Branson, Fowlkes/Russell",	"Robinson 
Value Mgmt.",	"Chase Investment Counsel",	"Nicholas Co.",	"Cadinha",	"Pacific West Land",]
import os, re, subprocess
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI as LOpenAI
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
load_dotenv('/workspace/repos/agentic-ai/.env')

model_name, ctx_len = "llama3.1:8b-instruct-q8_0", 128000
try:
    # Make sure the model is available locally: list what the Ollama CLI already
    # has and pull only when our tag does not appear in that listing.
    result = subprocess.run(['ollama', 'list'], capture_output=True, text=True, check=True)
    output = result.stdout
    if model_name not in output:
        subprocess.run(['ollama', 'pull', model_name], check=True)
        print(f"Model {model_name} pulled successfully.")
    else:
        print(f"Model {model_name} is already present.")
except (subprocess.CalledProcessError, FileNotFoundError) as e:
    # CalledProcessError: the CLI ran but exited with a non-zero status.
    # FileNotFoundError: the `ollama` executable is not on PATH.
    # Both are reported the same way instead of aborting the cell with a traceback.
    print(f"An error occurred: {e}")
addtion_kwargs = {"max_new_tokens": 2000}#, 'temperature': 0.8}
# system_prompt = "Your task is to extract relevant information from the provided text based on the given question. Keep your answers brief and to the point. Do not include calculations, explanations, or any information not explicitly stated in the text."
# The server address must be passed as `base_url`; the previous `url=` keyword
# is not a field of the Ollama LLM class, so the address was never applied.
# NOTE(review): `model_type` and `max_new_tokens` do not look like Ollama LLM
# fields either and are presumably ignored. If a generation cap is intended,
# Ollama's option is `num_predict` (via `additional_kwargs`) - confirm before changing.
llm = Ollama(model=model_name, base_url="http://127.0.0.1:11434", context_window=ctx_len, model_type="chat", is_function_calling_model=False, 
             request_timeout=4000.0, **addtion_kwargs) #, system_prompt=system_prompt) additional_kwargs=addtion_kwargs,
print(llm.metadata)

# openai_key = os.getenv("OPENAI_API_KEY")
# os.environ["OPENAI_API_KEY"] = openai_key
# llm = LOpenAI(model=model_name, max_tokens=8000)
# Settings.llm = llm

# retriever = YouRetriever(api_key=you_api_key, endpoint="search", num_web_results=3)
# query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm)

# One initially empty bucket per executive role. `execs` groups the four
# buckets in CEO, COO, CIO, President order.
ceos, coos, cios, presidents = [], [], [], []
execs = [ceos, coos, cios, presidents]

In [None]:
from crawl4ai import AsyncWebCrawler
import nest_asyncio
import asyncio
from fake_useragent import UserAgent
# https://pypi.org/project/googlesearch-python/
from googlesearch import search
import csv
import random

# Patch asyncio so that asyncio.run() can be used from inside the notebook,
# where an event loop is already running.
nest_asyncio.apply()

# Module-level flag forwarded to AsyncWebCrawler(...) and crawler.arun(...) in main().
verbose=False

async def main(urls):
    """Crawl every URL in `urls`, one after another, and collect the results.

    A single AsyncWebCrawler session is opened for the whole batch. Each request
    reads a fresh `UserAgent().random` value for its `user_agent` argument and
    passes `word_count_threshold=2000` through to `crawler.arun`.

    Args:
        urls: Iterable of URL strings to crawl.

    Returns:
        List of the objects returned by `crawler.arun`, in the same order as `urls`.
    """
    agent_pool = UserAgent()
    async with AsyncWebCrawler(verbose=verbose) as crawler:
        return [
            await crawler.arun(url=target, word_count_threshold=2000, user_agent=agent_pool.random, verbose=verbose)
            for target in urls
        ]

# `main` requires a list of URLs. Calling it with no argument raised TypeError,
# and because `__name__ == "__main__"` is true inside a notebook, that aborted
# this cell before the helper functions below were defined. There is nothing to
# crawl at this point, so start with an empty result list instead (this is what
# crawling zero URLs would return, without launching the crawler).
if __name__ == "__main__":
    content = []


def get_url_info(queries, num_results=3, num_words=2000, description_only=True):
    """Run web searches and return the gathered text as one newline-joined string.

    Args:
        queries: A single query string or an iterable of query strings.
        num_results: Number of results requested from `search` for each query.
        num_words: Maximum number of whitespace-separated words kept per crawled
            page. Only used when `description_only` is False.
        description_only: When True, use the `description` of each search result.
            When False, crawl each result's `url` with `main` and use the page's
            `markdown` text instead.

    Returns:
        A string in which every collected piece of text is preceded by a newline.
        Results whose description/markdown is None are skipped. Returns "" when
        nothing was collected.
    """
    query_list = [queries] if isinstance(queries, str) else queries

    # Either result descriptions or result URLs, depending on the mode.
    hits = []
    for single_query in query_list:
        found = search(single_query,
                       sleep_interval=random.randint(2, 5),
                       num_results=num_results,
                       timeout=1000,
                       advanced=True)
        for hit in found:
            hits.append(hit.description if description_only else hit.url)

    if description_only:
        pieces = [snippet for snippet in hits if snippet is not None]
    else:
        pages = asyncio.run(main(hits))
        # Keep only the first `num_words` words of each crawled page.
        pieces = [
            " ".join(page.markdown.split()[:num_words])
            for page in pages
            if page.markdown is not None
        ]

    return "".join("\n" + piece for piece in pieces)

def make_csv(results, filename, header=None, verbose=True):
    """Append rows to a CSV file, creating it first if it does not exist.

    Args:
        results: Iterable of rows; each row is an iterable of cell values.
        filename: Path of the CSV file to write.
        header: Optional list of column names. It is written as the first row
            only when the file is missing or empty, so repeated calls never
            insert a second header.
        verbose: When True, print whether the data was appended or written.

    Returns:
        None.
    """
    file_exists = os.path.isfile(filename)
    # A zero-byte file (for example one left behind by an earlier call that had
    # no rows and no header) still needs its header row.
    needs_header = header is not None and (not file_exists or os.path.getsize(filename) == 0)

    # Append when the file is already there, otherwise create it.
    with open(filename, 'a' if file_exists else 'w', newline='') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(header)
        writer.writerows(results)

    if verbose:
        print(f"Data has been {'appended to' if file_exists else 'written to'} {filename}")
    return None

In [6]:
# NOTE(review): `string1` is not referenced by any other cell visible in this
# part of the notebook. It looks like a saved model response (an outline for
# writing a graduate-school recommendation letter), presumably kept for
# comparison with `string2` - confirm it is still needed.
string1="""Here is an outline on how to write a letter of recommendation for graduate school:

**I. Introduction**

* Begin with a formal greeting and address the recommender's title (e.g. Dr./Professor, Dean)
* Clearly state your relationship with the student (e.g. academic advisor, research mentor, colleague)

**II. Overview of the Student's Qualities and Abilities**

* Summarize the student's academic achievements, research experience, or relevant skills
* Highlight their strengths, such as:
	+ Academic ability and potential for success in graduate school
	+ Research skills and contributions to projects or teams
	+ Leadership, teamwork, or communication abilities
	+ Adaptability, resilience, or other relevant personal qualities

**III. Specific Examples and Stories**

* Provide 2-3 specific examples that illustrate the student's skills, abilities, or achievements
* Use anecdotes or case studies to demonstrate the student's:
	+ Critical thinking and problem-solving abilities
	+ Ability to work independently or as part of a team
	+ Capacity for original thought and innovation
	+ Communication skills, including writing, speaking, or presenting

**IV. Why the Student is a Good Fit for Graduate School**

* Explain why you believe the student is well-suited for graduate study in their chosen field
* Highlight their interests, goals, and motivations for pursuing graduate education
* Emphasize how their research experiences, skills, or personal qualities make them a strong candidate for admission

**V. Conclusion**

* Reiterate your support for the student's application
* Close with a positive statement about the student's potential and future success in graduate school

**VI. Final Details**

* Include any additional information that may be relevant to the application (e.g. letters of recommendation from previous supervisors or colleagues)
* Proofread carefully to ensure accuracy, clarity, and professionalism

**VII. Tips for Writers**

* Be specific and provide concrete examples to support your recommendations
* Use a clear, concise writing style and avoid jargon or technical terms that may be unfamiliar to non-experts
* Keep the letter focused on the student's abilities and potential, rather than their personal characteristics or demographics

**VIII. Final Checklist**

* Ensure you have:
	+ A strong relationship with the student (ideally 1-2 years)
	+ First-hand knowledge of the student's academic and research experiences
	+ A clear understanding of the graduate program and its requirements
	+ Time to write a thorough, well-researched letter
"""


# NOTE(review): `string2` is not referenced by any other cell visible in this
# part of the notebook. It looks like a second saved model response on the same
# topic as `string1`, presumably kept for side-by-side comparison - confirm it
# is still needed.
string2 = """Here is a general outline on how to write a letter of recommendation for graduate school:

**I. Introduction**

* Begin by stating the purpose of the letter and the context in which you're writing it
* Mention your relationship with the applicant (e.g., professor, advisor, colleague)
* Provide a brief overview of the applicant's background and qualifications

**II. Academic Ability and Potential**

* Describe the applicant's academic abilities and strengths
* Discuss their potential for success in graduate school
* Highlight any notable achievements or accomplishments they've made during their undergraduate studies

**III. Relevant Skills and Experiences**

* Outline the applicant's relevant skills and experiences that make them a strong candidate for graduate study
* Discuss how these skills and experiences have prepared them for the challenges of graduate school
* Emphasize their ability to apply theoretical knowledge in practical settings

**IV. Personal Qualities and Characteristics**

* Describe the applicant's personal qualities and characteristics that make them a good fit for graduate school
* Discuss their motivation, work ethic, and commitment to their field of study
* Highlight any notable personal achievements or awards they've received

**V. Graduate School Potential**

* Explain why you think the applicant is well-suited for graduate study in their chosen field
* Describe the specific graduate program they're applying to and how it aligns with their goals and interests
* Mention any relevant research opportunities, internships, or other experiences that would enhance their education

**VI. Conclusion**

* Summarize your overall impression of the applicant's qualifications and potential
* Reiterate why you think they are a strong candidate for graduate school
* Close by expressing your confidence in the applicant's ability to succeed at the next level of academic study.

**VII. Additional Tips**

* Use specific examples and anecdotes to illustrate the applicant's skills and abilities
* Avoid generic or formulaic language, and try to convey your unique perspective on the applicant's strengths and potential
* Proofread carefully to ensure that the letter is free of errors in grammar, punctuation, and spelling.

**VIII. Formatting and Content Guidelines**

* Typically, letters of recommendation should be 1-2 pages in length
* Use a formal business or academic tone throughout the letter
* Include your contact information (name, title, department, institution) at the top of the page

Remember to tailor your letter to the specific graduate program and requirements, and to provide as much detail as possible about the applicant's qualifications and potential. Good luck!
"""



In [None]:
import pandas as pd
# Load the executives table from disk and display it. This is the same path the
# scraping loop further down writes to through `filename`.
csv_file = '/workspace/data/asset_execs_ours.csv'
data = pd.read_csv(csv_file)
data

In [None]:
# Display the (rows, columns) size of `data`.
data.shape

In [None]:
# Count the rows that are exact repeats of an earlier row.
data.duplicated().sum()

In [13]:
# Remove repeated rows from `data` itself (in place) and renumber the index from 0.
data.drop_duplicates(inplace=True, ignore_index=True)

In [None]:
# Display the (rows, columns) size of `data` at this point in the notebook.
data.shape

In [24]:
# Overwrite the CSV at `csv_file` with the current contents of `data`, without the index column.
data.to_csv(csv_file, index=False)

In [16]:
# Companies from asset_list that do not have a row in `data` yet. The result
# follows asset_list order with each name listed once. A plain set difference
# returns the names in an order that can change between kernel sessions, which
# makes the processing order of the scraping loop unpredictable.
_done_names = set(data['Company Name'].values.tolist())
leftovers = [name for name in dict.fromkeys(asset_list) if name not in _done_names]

In [None]:
# True only when the 'Company Name' column equals asset_list element for element, in the same order.
asset_list == data['Company Name'].values.tolist()

In [None]:
# Display the current contents of `data`.
data

In [21]:
import pandas as pd

# Reorder the rows of `data` to follow asset_list while leaving the
# 'Company Name' values untouched. Casting the column to a Categorical limited
# to asset_list turned every name that is missing from asset_list into NaN, and
# raised an error if asset_list contained a repeated name.
_asset_pos = {}
for _pos, _name in enumerate(asset_list):
    _asset_pos.setdefault(_name, _pos)  # first occurrence wins if a name repeats
data = (
    data.assign(_asset_order=data['Company Name'].map(_asset_pos))
        # mergesort is stable, so rows sharing a company keep their relative order;
        # names that are not in asset_list map to NaN and therefore sort last.
        .sort_values('_asset_order', kind='mergesort')
        .drop(columns='_asset_order')
)

In [None]:
# Display the current contents of `data`.
data

In [None]:
# True only when the 'Company Name' column equals asset_list element for element, in the same order.
data['Company Name'].values.tolist()==asset_list

In [None]:
# Position of this company in asset_list; list.index raises ValueError if the name is absent.
asset_list.index("Todd Asset Mgmt.")

In [None]:
import time

# For every company in `leftovers`, search the web for eight facts (four
# executive roles, 10-year performance, key funds, AUM, primary address), ask
# the LLM to extract each answer from the search text, and append the rows to
# the CSV in batches.
exec_strs = ["Chief Executive Officer (CEO)", "Chief Operations Officer (COO)", "Chief Investment Officer (CIO)", "President", "10-Year Performance", "Key Funds", "Assets Under Management (AUM)", "Primary Address"]
filename = '/workspace/data/asset_execs_ours.csv'
header = ["Company Name"]+exec_strs

start_date = "2024-01-01"
# end_date = "2024-10-08"
query_qualifier = f"after:{start_date}"
# query_qualifier = f"after:{start_date} before:{end_date}"
save_every = 14          # flush to disk (and report progress) every this many companies
total = len(leftovers)   # the loop runs over `leftovers`, so progress is measured against it
results = []             # one row per company: name followed by the eight answers
context_results = []     # per company: the raw search text behind each answer
intermittent_save = []   # rows collected since the last flush to disk
start_time = time.time()
for i,a in enumerate(leftovers):
    contexts = []
    company_execs=[a]
    a_qual = f"the asset management company {a}"
    # The first four entries of exec_strs are the executive roles.
    for exec_str in exec_strs[:4]:

        # Construct the custom search query with date range
        query = f"""From {a_qual}'s executive team, who is the {exec_str} at {a_qual}?"""
        custom_query = f"{query} {query_qualifier}"
        context = get_url_info(custom_query, num_results=3)
        contexts.append(a + " " + exec_str + "\n" + context)
        # response = llm.complete(f"""Given the following text, extract the name of the {exec_str} at {a_qual}.
        response = llm.complete(f"""Given the following text, what is the name of the {exec_str} at {a_qual}?
                                    Only output the name.\n\nText:\n{context}""")

        company_execs.append(response.text)
    
    num_results=5
    num_words=2000
    return_prompt=f"""What is the 10-year return for {a_qual}?"""
    return_query = f"{return_prompt} {query_qualifier}"
    context = get_url_info(return_query, num_results=num_results, num_words=num_words)
    contexts.append(a + " " + return_query + "\n" + context)
    # answer_prompt = f"""Given the following text, extract the 10-year return for {a_qual}.
    answer_prompt = f"""Given the following text, what is the 10-year return for {a_qual}?
                        Only output the return.\n\nText:\n{context}"""
    company_execs.append(llm.complete(answer_prompt).text)

    # Key funds is the only question answered from crawled page text
    # (description_only=False) rather than from search-result descriptions.
    funds_prompt = f"""What are the key funds for {a_qual}?"""
    funds_query = f"{funds_prompt} {query_qualifier}"
    context = get_url_info(funds_query, num_results=3, num_words=num_words, description_only=False)
    contexts.append(a + " " + funds_query + "\n" + context)
    # answer_prompt = f"""Given the following text, extract the names of the funds for {a_qual}.
    answer_prompt = f"""Given the following text, what are the names of the funds for {a_qual}?
                        Only output the names.\n\nText:\n{context}"""
    company_execs.append(llm.complete(answer_prompt).text)

    aum_prompt = f"""What is the total Assets Under Management (AUM) in USD for {a_qual}?"""
    aum_query = f"{aum_prompt} {query_qualifier}"
    context = get_url_info(aum_query, num_results=num_results, num_words=1000)
    contexts.append(a + " " + aum_query + "\n" + context)
    # answer_prompt = f"""Given the following text, extract the total Assets Under Management (AUM) for {a_qual}.
    answer_prompt = f"""Given the following text, what is the current total Assets Under Management (AUM) at {a_qual}?
                        Only output the total AUM.\n\nText:\n{context}"""
    company_execs.append(llm.complete(answer_prompt).text)

    address_prompt = f"""What is the primary address for {a_qual}?"""
    address_query = f"{address_prompt} {query_qualifier}"
    context = get_url_info(address_query, num_results=num_results, num_words=500)
    contexts.append(a + " " + address_query + "\n" + context)
    # answer_prompt = f"""Given the following text, extract the primary address for {a_qual}.
    answer_prompt = f"""Given the following text, what is the primary address for {a_qual}?
                        Only output the address.\n\nText:\n{context}"""
    company_execs.append(llm.complete(answer_prompt).text)

    # Record the finished row before touching the disk, so the in-memory lists
    # stay complete even if the write below fails.
    intermittent_save.append(company_execs)
    context_results.append(contexts)
    results.append(company_execs)
    
    # Flush and report at regular intervals and after the last company
    if i % save_every == 0 or i == total - 1:
        # Periodically save results. make_csv writes `header` only when it has
        # to create the file, so passing it on every call is safe.
        make_csv(intermittent_save, filename, header=header, verbose=True)
        intermittent_save = []
        # Record the time after processing the current item
        item_end_time = time.time()
        # Calculate the average processing time per item
        average_processing_time = (item_end_time - start_time) / (i + 1)
        # Estimate the remaining time (in minutes)
        remaining_items = total - (i + 1)
        estimated_remaining_time = (remaining_items * average_processing_time)/60

        print(f"Processed {i+1}/{total} companies...")
        if estimated_remaining_time > 60:
            print(f"Estimated remaining time: {estimated_remaining_time/60:.2f} hours")
        else:
            print(f"Estimated remaining time: {estimated_remaining_time:.2f} minutes")


In [None]:
# Quick manual check that the search helper returns results for a known query.
# The keyword names match the ones get_url_info passes to the same `search`
# function (`sleep_interval`, `num_results`); `pause=` and `stop=` belong to a
# different googlesearch package and are rejected by googlesearch-python.
query = "What is the name of the Chief Operating Officer of BNY Mellon?"
urls = []
for url in search(query, sleep_interval=random.randint(2, 5), num_results=2):
    urls.append(url)
print(len(urls))

In [None]:
# Display the rows collected in `results`.
results

In [None]:
# Write only the rows that have not reached the CSV yet. `results` also holds
# every row that the processing loop already appended during its periodic
# saves, so appending all of `results` here duplicated those rows in the file.
# `intermittent_save` holds exactly the rows collected since the last save.
make_csv(intermittent_save, filename, header=None, verbose=True)
# Clear the buffer so re-running this cell cannot append the same rows twice.
intermittent_save = []