In [2]:
import instructor
import openai
from dotenv import load_dotenv
from rich import print

# Load variables from a .env file (python-dotenv) before any API client is built.
load_dotenv()
# Wrap the OpenAI client with instructor; the extraction functions below call it
# with a `response_model` argument.
client = instructor.from_openai(openai.OpenAI())

# Load climate innovation open letter

In [3]:
# Read the whole open letter. The encoding is pinned so the result does not depend
# on the platform's default locale encoding.
# NOTE(review): assumes the data file is UTF-8 encoded - confirm.
with open("../data/LCAW Climate Innovation Open Letter - Public.txt", encoding="utf-8") as f:
    source_text = f.read()

# Preview only the last 10 lines as a quick check that the file loaded.
source_text_subset = source_text.split("\n")[-10:]
print(source_text_subset)

# Extract signatories (person, role, organisation)

In [4]:
from typing import List
from pydantic import BaseModel


# One extracted signatory record (person, role, organisation).
# All three fields are required strings.
class Signatory(BaseModel):
    person: str  # name of the signatory
    role: str  # role/title given for the signatory
    organisation: str  # organisation listed for the signatory

# Top-level schema passed as `response_model` in `extract_signatories`.
class ExtractedSignatories(BaseModel):
    signatories: List[Signatory]  # one entry per extracted signatory

In [5]:
from llm_company_scrape.cache import instructor_cache

@instructor_cache
def extract_signatories(source_text: str) -> ExtractedSignatories:
    """Ask the chat model to extract signatory records from ``source_text``.

    Args:
        source_text: Text sent unchanged as the user message.

    Returns:
        Whatever ``client.chat.completions.create`` returns for
        ``response_model=ExtractedSignatories``.
    """
    system_message = {
        "role": "system",
        "content": "Extract records from the below text. Ensure the outputs are valid according to the provided schema.",
    }
    user_message = {"role": "user", "content": source_text}

    return client.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=0.0,
        response_model=ExtractedSignatories,
        stream=False,
        messages=[system_message, user_message],
    )

In [6]:
# Extract signatories from the full letter text (the call goes through the
# `instructor_cache` decorator on `extract_signatories`).
result = extract_signatories(source_text)

In [7]:
import pandas as pd

# Convert every Signatory to a plain dict so pandas builds one row per record.
df = pd.DataFrame(list(map(dict, result.signatories)))
df

Unnamed: 0,person,role,organisation
0,Stephen Murphy,Founder,ClimateImpact
1,Charlie Mercer,Deputy Policy Director,Startup Coalition
2,Alyssa Gilbert,Director of Innovation,Grantham Institute
3,Sarah Mackintosh,Director,Cleantech for UK
4,Sammy Fry,Head of Climate,Tech Nation
...,...,...,...
201,Veronica Chou,President,Novel Fashion Holdings
202,Vivian Bertseka,Co-Founder,BlueLayer
203,Wei Ng,Senior Advisor,Barka Fund
204,Will Milligan,Founder & CEO,Extracellular


# Search organisations with Tavily

In [8]:
from tavily import TavilyClient
import os

# os.environ[...] (rather than os.getenv) makes a missing TAVILY_API_KEY fail here
# with a KeyError instead of at the first search call.
tavily_client = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])

In [9]:
from typing import Optional

# One entry of the `results` list in a Tavily search response.
class TavilyResult(BaseModel):
    title: str
    url: str
    score: float
    # Defaulted to None: without a default, pydantic v2 treats Optional[str] as
    # required-but-nullable, so a result that omits the key would fail validation.
    raw_content: Optional[str] = None
    
# Validated shape of the response returned by `tavily_client.search(...)`.
class TavilySearchResponse(BaseModel):
    query: str
    answer: str  # read by get_tavily_answer
    results: list[TavilyResult]  # read by get_source_texts
    response_time: float

In [10]:
@instructor_cache
def tavily_search_company(query: str) -> TavilySearchResponse:
    """Run a Tavily search for ``query`` and validate the response.

    Args:
        query: Search question passed to ``tavily_client.search``.

    Returns:
        The response parsed into a ``TavilySearchResponse``.

    Raises:
        Exception: Whatever ``model_validate`` raises when the response does not
            match the schema. The raw response is printed before re-raising.
    """
    response = tavily_client.search(query=query, include_answer=True)
    try:
        return TavilySearchResponse.model_validate(response)
    except Exception:
        # `except e:` referenced an undefined name and raised NameError instead of
        # handling the failure. Print the offending payload, then re-raise the
        # original error with its traceback intact.
        print(response)
        raise

In [11]:
# Keyed by organisation name, so repeated organisations collapse into one query.
queries = dict(
    (org, f"What does {org} in the UK do?") for org in df["organisation"].to_list()
)
queries

{'ClimateImpact': 'What does ClimateImpact in the UK do?',
 'Startup Coalition': 'What does Startup Coalition in the UK do?',
 'Grantham Institute': 'What does Grantham Institute in the UK do?',
 'Cleantech for UK': 'What does Cleantech for UK in the UK do?',
 'Tech Nation': 'What does Tech Nation in the UK do?',
 'Tech Zero': 'What does Tech Zero in the UK do?',
 'techUK': 'What does techUK in the UK do?',
 'Compare Ethics': 'What does Compare Ethics in the UK do?',
 'Vuala': 'What does Vuala in the UK do?',
 'Switchee Limited': 'What does Switchee Limited in the UK do?',
 'ADT - Adventure Driven Threads': 'What does ADT - Adventure Driven Threads in the UK do?',
 'Airex': 'What does Airex in the UK do?',
 'ClimateAligned': 'What does ClimateAligned in the UK do?',
 'Fuse Energy': 'What does Fuse Energy in the UK do?',
 'Sourceful': 'What does Sourceful in the UK do?',
 'Zayndu': 'What does Zayndu in the UK do?',
 'Top Tier Impact': 'What does Top Tier Impact in the UK do?',
 'Future 

In [12]:
from tqdm import tqdm

In [13]:
# Run one search per organisation and store each response under the organisation name.
company_results = {}
for org, query in tqdm(queries.items()):
    company_results[org] = tavily_search_company(query=query)

  0%|          | 0/198 [00:00<?, ?it/s]

100%|██████████| 198/198 [08:01<00:00,  2.43s/it]


In [14]:
def get_tavily_answer(org: str):
    """Return the ``answer`` attribute of the search response stored for ``org``.

    Raises ``KeyError`` when ``org`` has no entry in ``company_results``.
    """
    search_response = company_results[org]
    return search_response.answer

In [15]:
def get_source_texts(org: str):
    """Render the search hits stored for ``org`` as one space-separated string.

    Each hit is formatted as ``[i] title (url)``, numbered from 0 in result order.
    """
    citations = []
    for position, hit in enumerate(company_results[org].results):
        citations.append(f"[{position}] {hit.title} ({hit.url})")
    return " ".join(citations)

In [16]:
# Spot-check the rendered source list for a single organisation.
print(get_source_texts("ClimateImpact"))

In [17]:
# Look up the Tavily answer and the rendered source list for every row, then join
# both into the single text column later passed to the structured extraction.
for target_column, lookup in (
    ("company_summary", get_tavily_answer),
    ("source_text", get_source_texts),
):
    df[target_column] = df["organisation"].apply(lookup)
df["full_text"] = df["company_summary"] + " " + df["source_text"]
df

Unnamed: 0,person,role,organisation,company_summary,source_text,full_text
0,Stephen Murphy,Founder,ClimateImpact,ClimateImpact in the UK focuses on monitoring ...,[0] Climate change in the UK - Met Office (htt...,ClimateImpact in the UK focuses on monitoring ...
1,Charlie Mercer,Deputy Policy Director,Startup Coalition,The Startup Coalition in the UK works to make ...,[0] Startup Coalition (https://startupcoalitio...,The Startup Coalition in the UK works to make ...
2,Alyssa Gilbert,Director of Innovation,Grantham Institute,The Grantham Institute in the UK is a multidis...,[0] Grantham Institute - Climate Change and th...,The Grantham Institute in the UK is a multidis...
3,Sarah Mackintosh,Director,Cleantech for UK,Cleantech for UK is an initiative that aims to...,[0] Cleantech for UK (https://www.cleantechfor...,Cleantech for UK is an initiative that aims to...
4,Sammy Fry,Head of Climate,Tech Nation,Tech Nation in the UK is a leading growth plat...,[0] Global Talent Visa | Tech Nation (https://...,Tech Nation in the UK is a leading growth plat...
...,...,...,...,...,...,...
201,Veronica Chou,President,Novel Fashion Holdings,"Novel Fashion Holdings, led by Veronica S. Cho...",[0] Veronica S. Chou of Novel Fashion Holdings...,"Novel Fashion Holdings, led by Veronica S. Cho..."
202,Vivian Bertseka,Co-Founder,BlueLayer,BlueLayer in the UK acts as the software back ...,[0] BlueLayer is building the operating system...,BlueLayer in the UK acts as the software back ...
203,Wei Ng,Senior Advisor,Barka Fund,Barka Fund in the UK is a non-for-profit organ...,[0] Our mission - Barka UK charity (https://ba...,Barka Fund in the UK is a non-for-profit organ...
204,Will Milligan,Founder & CEO,Extracellular,Extracellular matrix in the UK is a dynamic 3-...,[0] Cell Structure and Function | British Soci...,Extracellular matrix in the UK is a dynamic 3-...


# Extracting more structured fields

In [18]:
from pydantic import Field


# Schema for the structured fields extracted per organisation; used as
# `response_model` in `extract_structured_company_summary`.
# NOTE(review): the `description` strings are presumably surfaced to the model as
# part of the schema, so their wording matters - confirm against instructor's docs.
class StructuredCompanySummary(BaseModel):
    organisation_type: str = Field(
        ...,
        description="One sentence summary of the type of organisation. "
        "For example, startup, research institute, venture capital, consultancy.",
    )
    industry_domain: str = Field(
        ...,
        description="One sentence summary of the industry the organisation is applied to. "
        "For example, carbon accounting, meat alternatives, clean energy, petrochemicals",
    )
    functional_expertise: str = Field(
        ...,
        # The first literal ends with a space so the implicitly concatenated
        # sentence reads "based on, that would" rather than "based on,that would".
        description="One sentence summary of the technical domain or foundational technology the organisation is based on, "
        "that would give an idea of the likely work. For example, synthetic biology, geospatial analytics, financing.",
    )

In [19]:
from llm_company_scrape.cache import instructor_cache


@instructor_cache
def extract_structured_company_summary(source_text: str) -> StructuredCompanySummary:
    """Ask the chat model to fill a ``StructuredCompanySummary`` from ``source_text``.

    Args:
        source_text: Text sent unchanged as the user message.

    Returns:
        Whatever ``client.chat.completions.create`` returns for
        ``response_model=StructuredCompanySummary``.
    """
    system_message = {
        "role": "system",
        "content": "Extract relevant information using the provided summary and the source title/urls.",
    }
    user_message = {"role": "user", "content": source_text}

    return client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0.0,
        response_model=StructuredCompanySummary,
        stream=False,
        messages=[system_message, user_message],
    )



In [20]:
# One extraction per dataframe row, keyed by organisation; rows that share an
# organisation overwrite the same key.
structured_summary = {}
for record in tqdm(df[["organisation", "full_text"]].to_dict(orient="records")):
    organisation, full_text = record["organisation"], record["full_text"]
    structured_summary[organisation] = extract_structured_company_summary(
        source_text=full_text
    )

100%|██████████| 206/206 [03:27<00:00,  1.01s/it]


In [21]:
def get_organisation_type(org: str):
    """Return the ``organisation_type`` of the structured summary stored for ``org``."""
    summary = structured_summary[org]
    return summary.organisation_type


def get_industry_domain(org: str):
    """Return the ``industry_domain`` of the structured summary stored for ``org``."""
    summary = structured_summary[org]
    return summary.industry_domain


def get_functional_expertise(org: str):
    """Return the ``functional_expertise`` of the structured summary stored for ``org``."""
    summary = structured_summary[org]
    return summary.functional_expertise

In [22]:
# Add one column per structured field, each looked up by organisation name.
for field_name, getter in (
    ("organisation_type", get_organisation_type),
    ("industry_domain", get_industry_domain),
    ("functional_expertise", get_functional_expertise),
):
    df[field_name] = df["organisation"].apply(getter)
df

Unnamed: 0,person,role,organisation,company_summary,source_text,full_text,organisation_type,industry_domain,functional_expertise
0,Stephen Murphy,Founder,ClimateImpact,ClimateImpact in the UK focuses on monitoring ...,[0] Climate change in the UK - Met Office (htt...,ClimateImpact in the UK focuses on monitoring ...,non-profit organization,climate change monitoring and advocacy,climate impact assessment and public health an...
1,Charlie Mercer,Deputy Policy Director,Startup Coalition,The Startup Coalition in the UK works to make ...,[0] Startup Coalition (https://startupcoalitio...,The Startup Coalition in the UK works to make ...,advocacy organization,technology startups and scaleups,policy development and advocacy for tech innov...
2,Alyssa Gilbert,Director of Innovation,Grantham Institute,The Grantham Institute in the UK is a multidis...,[0] Grantham Institute - Climate Change and th...,The Grantham Institute in the UK is a multidis...,multidisciplinary research institute,climate change and environmental policy,policy-relevant research and education on clim...
3,Sarah Mackintosh,Director,Cleantech for UK,Cleantech for UK is an initiative that aims to...,[0] Cleantech for UK (https://www.cleantechfor...,Cleantech for UK is an initiative that aims to...,initiative,cleantech,green innovation and technology
4,Sammy Fry,Head of Climate,Tech Nation,Tech Nation in the UK is a leading growth plat...,[0] Global Talent Visa | Tech Nation (https://...,Tech Nation in the UK is a leading growth plat...,growth platform,technology,"supporting tech startups with insights, connec..."
...,...,...,...,...,...,...,...,...,...
201,Veronica Chou,President,Novel Fashion Holdings,"Novel Fashion Holdings, led by Veronica S. Cho...",[0] Veronica S. Chou of Novel Fashion Holdings...,"Novel Fashion Holdings, led by Veronica S. Cho...",sustainable fashion company,fashion,eco-friendly brand development
202,Vivian Bertseka,Co-Founder,BlueLayer,BlueLayer in the UK acts as the software back ...,[0] BlueLayer is building the operating system...,BlueLayer in the UK acts as the software back ...,software company,carbon credits management,software platform development for carbon proje...
203,Wei Ng,Senior Advisor,Barka Fund,Barka Fund in the UK is a non-for-profit organ...,[0] Our mission - Barka UK charity (https://ba...,Barka Fund in the UK is a non-for-profit organ...,non-profit organization,social services for migrants,"social integration, education, and entrepreneu..."
204,Will Milligan,Founder & CEO,Extracellular,Extracellular matrix in the UK is a dynamic 3-...,[0] Cell Structure and Function | British Soci...,Extracellular matrix in the UK is a dynamic 3-...,research institute,cell biology,extracellular matrix research and analysis


# Embedding helper

In [23]:
import requests_cache

# Cached HTTP session. POST is listed explicitly because `embed` sends its
# requests with `session.post`.
session = requests_cache.CachedSession(allowable_methods=["GET", "POST"])

In [24]:
import numpy as np
def embed(session, texts: list[str], model:str="text-embedding-3-small"):
    """POST ``texts`` to the OpenAI embeddings endpoint and return the vectors.

    Args:
        session: Object with a ``post(url, headers=..., json=...)`` method.
        texts: Strings sent as the request's ``input``.
        model: Embedding model name sent in the request body.

    Returns:
        A numpy array built from the ``embedding`` entries of the response's
        ``data`` list, in response order.

    Raises:
        Whatever ``response.raise_for_status()`` raises on an error status.
    """
    request_headers = {
        "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
        "Content-Type": "application/json",
    }
    request_body = {"input": texts, "model": model, "encoding_format": "float"}

    response = session.post(
        "https://api.openai.com/v1/embeddings",
        headers=request_headers,
        json=request_body,
    )
    response.raise_for_status()

    vectors = [np.array(item["embedding"]) for item in response.json()["data"]]
    return np.array(vectors)

In [25]:
def batched_texts(df:pd.DataFrame, column:str, chunk_size:int=64):
    """Split ``df[column]`` into consecutive lists of at most ``chunk_size`` values.

    Returns a list of lists in row order. The last list may be shorter, and a
    frame with no rows yields an empty list.
    """
    batches = []
    for start in range(0, len(df), chunk_size):
        stop = start + chunk_size
        batches.append(df[column][start:stop].tolist())
    return batches

In [26]:
# Split the company summaries into batches using the default chunk size of `batched_texts`.
company_summary_batches = batched_texts(df, "company_summary")

In [27]:
# Try the embedding call on the first batch only, before embedding whole columns.
embed(session, company_summary_batches[0])

array([[ 0.01160097,  0.0042803 ,  0.08013445, ..., -0.01944869,
         0.02348222,  0.02327506],
       [-0.01339705, -0.00353423,  0.02531439, ..., -0.0020033 ,
        -0.00867906,  0.01474017],
       [-0.03616762, -0.01020707,  0.05602296, ..., -0.01018923,
        -0.01168136,  0.02155552],
       ...,
       [-0.01694644,  0.0169712 ,  0.0272579 , ..., -0.01463163,
         0.05733815,  0.00518358],
       [-0.00509415,  0.00195856,  0.04654973, ...,  0.01270056,
        -0.01992096,  0.01208673],
       [ 0.01946929,  0.04593011,  0.04157596, ...,  0.00850304,
        -0.01332371, -0.00614247]])

In [28]:
def embed_df_col(df: pd.DataFrame, column:str):
    """Embed every value of ``df[column]`` batch by batch and stack the results.

    Batches come from ``batched_texts`` and are embedded with ``embed`` using the
    module-level ``session``. Per-batch arrays are concatenated along axis 0.
    """
    per_batch = [embed(session, batch) for batch in tqdm(batched_texts(df, column))]
    return np.concatenate(per_batch, axis=0)

In [29]:
# One embedding matrix per text column, keyed by column name.
embeddings = dict(
    (col, embed_df_col(df, col))
    for col in ("company_summary", "organisation_type", "industry_domain", "functional_expertise")
)

100%|██████████| 4/4 [00:02<00:00,  1.35it/s]
100%|██████████| 4/4 [00:02<00:00,  1.42it/s]
100%|██████████| 4/4 [00:03<00:00,  1.27it/s]
100%|██████████| 4/4 [00:03<00:00,  1.22it/s]


# Dimensionality reduction

In [30]:
import umap
# UMAP is stochastic; pin the seed so re-running the notebook reproduces the same
# projection (and therefore the same plots).
reducer = umap.UMAP(random_state=42)

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
# fit_transform is called once per column on the shared reducer, so each column
# gets its own projection, keyed by column name.
umap_emb = {col: reducer.fit_transform(emb) for col, emb in embeddings.items()}

In [32]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [42]:
# Function to wrap text
def wrap_text(text, max_length=50):
    """Greedily wrap ``text`` into lines joined by ``<br>``.

    Words come from ``text.split()``, so runs of whitespace collapse. Words are
    never broken: a word longer than ``max_length`` gets a line of its own.

    Args:
        text: String to wrap.
        max_length: Maximum rendered length of a line, counting the single
            spaces between words.

    Returns:
        The wrapped text with ``<br>`` between lines, or ``""`` when ``text``
        contains no words.
    """
    lines = []
    current_line = []
    current_length = 0  # rendered length of current_line, joining spaces included

    for word in text.split():
        # A joining space is only needed when the line already holds a word, so
        # the first line gets the same budget as every later line.
        needed = len(word) + (1 if current_line else 0)
        if current_line and current_length + needed > max_length:
            # Only non-empty lines are flushed, so an over-long first word cannot
            # produce a blank leading line (a stray "<br>").
            lines.append(' '.join(current_line))
            current_line = [word]
            current_length = len(word)
        else:
            current_line.append(word)
            current_length += needed

    if current_line:
        lines.append(' '.join(current_line))
    return '<br>'.join(lines)

# Create a new column with wrapped company summaries
df['wrapped_summary'] = df['company_summary'].apply(wrap_text)
# Create subplots
fig = make_subplots(rows=2, cols=2, subplot_titles=list(umap_emb.keys()))

# Define colors for each subplot
colors = ['blue', 'red', 'green', 'purple']

# The hover label depends only on df, not on the embedding, so build it once
# instead of once per subplot.
hover_text = [
    f"<b>{org}</b><br>Type: {org_type}<br>Industry: {industry}<br>Expertise: {expertise}<br><br>{summary}"
    for org, org_type, industry, expertise, summary in
    zip(df['organisation'], df['organisation_type'], df['industry_domain'], df['functional_expertise'], df['wrapped_summary'])
]

# Create scatter plots for each embedding
for i, (emb_name, emb) in enumerate(umap_emb.items(), start=1):
    # Fill the 2x2 grid row by row (subplot positions are 1-based).
    row = (i-1) // 2 + 1
    col = (i-1) % 2 + 1

    scatter = go.Scatter(
        x=emb[:, 0],
        y=emb[:, 1],
        mode='markers',
        marker=dict(color=colors[i-1], size=5, opacity=0.6),
        text=hover_text,
        hoverinfo='text',
        hovertemplate="%{text}",
        # The embedding name has its own variable so the grid column index `col`
        # cannot overwrite it; the trace is labelled with the embedding name.
        name=emb_name
    )

    fig.add_trace(scatter, row=row, col=col)

# Update layout
fig.update_layout(height=800, width=1000, title_text="UMAP Embeddings")
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(showticklabels=False)

# Show the plot
fig.show()


In [34]:
# Display the enriched dataframe.
df


Unnamed: 0,person,role,organisation,company_summary,source_text,full_text,organisation_type,industry_domain,functional_expertise
0,Stephen Murphy,Founder,ClimateImpact,ClimateImpact in the UK focuses on monitoring ...,[0] Climate change in the UK - Met Office (htt...,ClimateImpact in the UK focuses on monitoring ...,non-profit organization,climate change monitoring and advocacy,climate impact assessment and public health an...
1,Charlie Mercer,Deputy Policy Director,Startup Coalition,The Startup Coalition in the UK works to make ...,[0] Startup Coalition (https://startupcoalitio...,The Startup Coalition in the UK works to make ...,advocacy organization,technology startups and scaleups,policy development and advocacy for tech innov...
2,Alyssa Gilbert,Director of Innovation,Grantham Institute,The Grantham Institute in the UK is a multidis...,[0] Grantham Institute - Climate Change and th...,The Grantham Institute in the UK is a multidis...,multidisciplinary research institute,climate change and environmental policy,policy-relevant research and education on clim...
3,Sarah Mackintosh,Director,Cleantech for UK,Cleantech for UK is an initiative that aims to...,[0] Cleantech for UK (https://www.cleantechfor...,Cleantech for UK is an initiative that aims to...,initiative,cleantech,green innovation and technology
4,Sammy Fry,Head of Climate,Tech Nation,Tech Nation in the UK is a leading growth plat...,[0] Global Talent Visa | Tech Nation (https://...,Tech Nation in the UK is a leading growth plat...,growth platform,technology,"supporting tech startups with insights, connec..."
...,...,...,...,...,...,...,...,...,...
201,Veronica Chou,President,Novel Fashion Holdings,"Novel Fashion Holdings, led by Veronica S. Cho...",[0] Veronica S. Chou of Novel Fashion Holdings...,"Novel Fashion Holdings, led by Veronica S. Cho...",sustainable fashion company,fashion,eco-friendly brand development
202,Vivian Bertseka,Co-Founder,BlueLayer,BlueLayer in the UK acts as the software back ...,[0] BlueLayer is building the operating system...,BlueLayer in the UK acts as the software back ...,software company,carbon credits management,software platform development for carbon proje...
203,Wei Ng,Senior Advisor,Barka Fund,Barka Fund in the UK is a non-for-profit organ...,[0] Our mission - Barka UK charity (https://ba...,Barka Fund in the UK is a non-for-profit organ...,non-profit organization,social services for migrants,"social integration, education, and entrepreneu..."
204,Will Milligan,Founder & CEO,Extracellular,Extracellular matrix in the UK is a dynamic 3-...,[0] Cell Structure and Function | British Soci...,Extracellular matrix in the UK is a dynamic 3-...,research institute,cell biology,extracellular matrix research and analysis


# Clustering embeddings

In [35]:
# Check the shape of the company-summary embedding matrix before clustering.
embeddings["company_summary"].shape

(206, 1536)