# Enhancing, Expanding, and Analyzing ReFED's Capital Tracker

In [None]:
import numpy as np 
import pandas as pd 
import requests
import json
import time
import re
import selenium
from bs4 import BeautifulSoup

import plotly.graph_objects as go
import plotly.express as px

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

## 1. Scraping the ReFED Capital Tracker 
##### We begin by grabbing the data from [this table](https://insights-engine.refed.org/capital-tracker/list?dateFrom=2012-01-01&dateTo=2025-02-19&list.page=1&list.searchScope[]=funder_name,funder_desc,recipient_name,recipient_desc&list.sortBy=name&list.view=investments). This involves setting up a scraper with an operation to page through all 58 pages of the table; the `selenium` package works well for this.

##### Perform the scrape

In [None]:
# Initialize empty list to be populated
x = []
     
# Positions of the pagination buttons to be "clicked" in sequential order     
    # NOTE: This list will need to be manually adjusted once the number of pages exceed 58
clicks = sum([list(range(0, 9)), [7] * 42, list(range(8, 14))], [])

# Basic scraper setup 
driver = webdriver.Firefox()
raw = driver.get("https://insights-engine.refed.org/capital-tracker/list?dateFrom=2012-01-01&dateTo=2025-02-19&list.page=1&list.searchScope[]=funder_name,funder_desc,recipient_name,recipient_desc&list.sortBy=name&list.view=investments")

for i in clicks:

    # Automating selenium to click the appropriate button to go to the next page of the table 
    pages = driver.find_elements(By.CLASS_NAME, 'pagination__item')
    driver.execute_script("arguments[0].click();", pages[i])

    # Pause to allow page to load before scraping data 
    time.sleep(3)

    # Find all rows on a given page of the table
    rows = driver.find_elements(By.CLASS_NAME, "table2--row")

    # For each row...
    for row in rows:    

        rowdata = []

        # Find all cells in the row and add the data to rowdata 
        cells = row.find_elements(By.CLASS_NAME, "table2--cell")
        for cell in cells:
            text = cell.get_attribute('innerText')
            rowdata.append(text)

        # add the full row of data to the master list 
        x.append(rowdata)

driver.close()

# master list to df
df = pd.DataFrame(x)

df.shape

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


(5765, 13)

##### Clean the scraped data

In [114]:
# Rename columns using header row
df.columns = df.iloc[1]

# Exclude rows with null dates and duplicates of the header row
df = df.loc[(df['DATE'] != 'DATE') & (df['DATE'] != '')].dropna(subset = ['DATE'])

# Exclude null columns
df = df[df.columns[~df.columns.isnull()]].reset_index()

df.shape 

(5651, 14)

In [115]:
df.head()

1,index,DATE,RECIPIENT,DEAL SIZE,FUNDER(S),DEAL DESCRIPTION,COMPANY DESCRIPTION,SOLUTION TYPE(S),SOLUTION,CAPITAL TYPE,FUNDING GROUP,INVESTMENT TYPE,DEAL STAGE,APPLICABILITY
0,2,2-18-2025,Ripe.io,n.a.,n.a.,The company is no longer actively in business ...,Developer of a distributed supply chain softwa...,Prevention,Inventory Traceability,Private,Corporate Finance & Spending,Out of Business,,Direct
1,3,2-15-2025,Chomp,n.a.,Martin Energy Group,The company was acquired by Martin Energy Grou...,"Manufacturer of patented, and containerized an...",Recycling,,Private,Corporate Finance & Spending,Merger/Acquisition,,Direct
2,4,2-13-2025,Nabaco,"$1,200,000",n.a.,The company raised $1.2 million of Seed-6 fund...,Manufacturer of fruit protection system design...,General,,Private,Venture Capital,Seed Round,Seed Round,Direct
3,5,1-28-2025,Knead,"$553,171",Business Development Bank of Canada\nGrowthX C...,"The company raised CAD 800,000 of seed funding...",Operator of a food recovery technology platfor...,Rescue,Donation Coordination & Matching,Private,Venture Capital,Seed Round,Seed Round,Direct
4,6,1-8-2025,SCO2,n.a.,StartLife,The company joined StartLife as a part of it's...,Developer of a food and agricultural waste rec...,Recycling,Waste-Derived Biomaterials,Private,Venture Capital,Accelerator/Incubator,,Direct


##### Take a quick look at the Solution column

In [227]:
print(df['SOLUTION'].value_counts())

print('Number of Categories: ' + str(len(df['SOLUTION'].value_counts()) - 1))

SOLUTION
                                                    3738
Gleaning                                             222
Manufacturing Byproduct Utilization (Upcycling)      192
Meal Kits                                            129
Waste-Derived Biomaterials                            93
Insect Farming                                        90
Centralized Composting                                76
Edible Coatings                                       67
Precision Food Safety                                 66
Waste-Derived Bio-Plastics                            63
Active & Intelligent Packaging                        63
Inventory Traceability                                62
Imperfect & Surplus Produce Channels                  61
Waste-Derived Agricultural Inputs                     60
Donation Transportation                               58
Centralized Anaerobic Digestion                       52
Enhanced Demand Planning                              49
Community Composting  

## 2. Predicting Missing `Solution` Categories 

##### The table we just scraped has a useful and fairly detailed 'Solution' column which classifies investments in 46 discrete categories. However, this field is missing for approximately two-thirds of the investments. 

##### Next we'll apply `BERTopic`, the topic modeling framework based on a prominent language model (BERT), to predict the missing `Solution` values. Specifically, we will deploy BERTopic as a supervised model which we will train on `Company Description`, a field in our table that contains unstructured text describing the recipient of each investment.

In [None]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler

from sentence_transformers import SentenceTransformer

##### Define training and response data

In [614]:
len(df[['COMPANY DESCRIPTION']].loc[(df['COMPANY DESCRIPTION'] != '') & (df['COMPANY DESCRIPTION'].notna()) & (df['SOLUTION'] != '')])

1793

In [None]:
train = df[['COMPANY DESCRIPTION']].loc[(df['SOLUTION'] != '') & (df['COMPANY DESCRIPTION'].notna())]
y = df[['SOLUTION']].loc[(df['SOLUTION'] != '') & (df['COMPANY DESCRIPTION'].notna())]

# Convert the string labels in 'y' to numerical labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # Convert string labels to numerical labels

In [None]:
# Evaluate sparsity of Solution categories
y['SOLUTION'].value_counts()

SOLUTION
Manufacturing Byproduct Utilization (Upcycling)     192
Meal Kits                                           129
Gleaning                                            127
Waste-Derived Biomaterials                           93
Insect Farming                                       90
Centralized Composting                               75
Edible Coatings                                      67
Precision Food Safety                                66
Waste-Derived Bio-Plastics                           63
Active & Intelligent Packaging                       63
Inventory Traceability                               62
Imperfect & Surplus Produce Channels                 61
Waste-Derived Agricultural Inputs                    60
Centralized Anaerobic Digestion                      49
Enhanced Demand Planning                             49
Donation Transportation                              44
Temperature Monitoring (Pallet Transport)            39
Donation Coordination & Matching       

##### Oversample to address sparsity

In [None]:
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(train, y_encoded)

##### Pre-calculate embeddings

Because BERT and BERTopic rely on document-based embeddings (rather than simple word embeddings), it is not necessary to conduct most of the typical NLP pre-processing (e.g., stemming, lemmatizing, tokenizing, etc.) In fact, [BERTopic documentation warns that these steps can actually undermine the efficacy of the model](https://maartengr.github.io/BERTopic/faq.html#should-i-preprocess-the-data).

However, because determining the document embeddings is cost-intensive, and since we are going to be testing different hyperparameters iteratively, calculating the embeddings ahead of time will drastically speed things up. 

In [250]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(list(X_train))

##### Test supervised BERTopic model, iterating over various hyperparameter combinations

Hyperparameters of interest:

- N-Gram Range: The range of the number of discrete words that BERTopic will evaluate as a single token 

- Top N Words: The max number of words that will be used to construct each topic.

In [366]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state = 2525, test_size=0.2)

In [502]:
X_test_newindex = X_test.reset_index(names = 'old_index')

for n_gram_range_val in [(1, 1), (1, 2), (1, 3)]:
    for top_n_words_val in [5, 10, 15]:

        # Create a supervised BERTopic instance
        topic_model= BERTopic(
                embedding_model = embedding_model,
                umap_model = empty_dimensionality_model,
                hdbscan_model = clf,
                ctfidf_model = ctfidf_model, 
                low_memory = True,
                top_n_words = top_n_words_val,
                n_gram_range = n_gram_range_val
        )

        # Train model for each iteration 
        topics, probs = topic_model.fit_transform(X_train['COMPANY DESCRIPTION'], y = y_train)
                                                
        topic_model_out = topic_model.get_topic_info()

        # Generate and save predicted values 
        all_preds = []

        for i in range(len(X_test_newindex)):
            topic = re.sub(r'\W','',str(topic_model.transform(X_test_newindex['COMPANY DESCRIPTION'].iloc[i])[0]))
            match = (topic == str(y_test[i]))
            all_preds.append(match)

        # # Predicted categories are generated as numbers. Decode back to the actual category. 
        # topic_dict = dict(zip(list(topic_model_out['Topic']),list(df['SOLUTION'].value_counts().index[1:len(df['SOLUTION'].value_counts())])))

        # df['BERTopic_LIKELY_TOPIC_name'] = df['BERTopic_LIKELY_TOPIC'].astype(int).replace(topic_dict)

        # # Dummy indicating whether the predicted category was accurate 
        # df['MATCH'] = df['BERTopic_LIKELY_TOPIC_name'] == df['SOLUTION']

        # # Accuracy of each iteration 
        # score = df['MATCH'].loc[df['SOLUTION'] != ''].value_counts()

        print('N-Gram Range: ' + str(n_gram_range_val) + '; Top N Words ' + str(top_n_words_val))
        print(pd.DataFrame(all_preds).value_counts())

N-Gram Range: (1, 1); Top N Words 5
0    
False    25482
True      1158
Name: count, dtype: int64
N-Gram Range: (1, 1); Top N Words 10
0    
False    25482
True      1158
Name: count, dtype: int64
N-Gram Range: (1, 1); Top N Words 15
0    
False    25482
True      1158
Name: count, dtype: int64
N-Gram Range: (1, 2); Top N Words 5
0    
False    25482
True      1158
Name: count, dtype: int64
N-Gram Range: (1, 2); Top N Words 10
0    
False    25482
True      1158
Name: count, dtype: int64
N-Gram Range: (1, 2); Top N Words 15
0    
False    25482
True      1158
Name: count, dtype: int64
N-Gram Range: (1, 3); Top N Words 5
0    
False    25482
True      1158
Name: count, dtype: int64
N-Gram Range: (1, 3); Top N Words 10
0    
False    25482
True      1158
Name: count, dtype: int64
N-Gram Range: (1, 3); Top N Words 15
0    
False    25482
True      1158
Name: count, dtype: int64


##### Select and train final model
In this case, all of the models performed identically on the test set. That suggests the exact selection of hyperparameters has little effect; we will therefore revert to the defaults.

In [505]:
topic_model = BERTopic(
                umap_model = empty_dimensionality_model,
                hdbscan_model = clf,
                ctfidf_model = ctfidf_model, 
                low_memory = True
        )

In [506]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(list(X_resampled))

In [None]:
# Train model 
topics, probs = topic_model.fit_transform(X_resampled['COMPANY DESCRIPTION'], y = y_resampled)

##### Create dictionary to re-map encoded solutions to names

In [None]:
topic_output = topic_model.get_topic_info()

topic_names = []

for i in range(len(topic_output['Representative_Docs'])):
    doc_clean = re.sub(r'\[|\]','',topic_output['Representative_Docs'][i][1])
    temp_df = df[['SOLUTION']].loc[df['COMPANY DESCRIPTION'].str.contains(doc_clean)].reset_index()
    sol_name = temp_df['SOLUTION'][1]
    topic_names.append(sol_name)

topic_output['Topic Name'] = topic_names

topic_output

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,Topic Name
0,0,2834,0_millions_struggle_education_hunger,"[millions, struggle, education, hunger, 400, s...",[City Harvest is New York's first and largest ...,1 SOLUTION 1032 1086 1087...
1,1,2834,1_bioplastic_biofuel_paper_plastics,"[bioplastic, biofuel, paper, plastics, polymer...",[Developer of a natural biodegradable polymer ...,1 SOLUTION 239 Waste-D...
2,2,2834,2_portable_ultraviolet_disinfects_viruses,"[portable, ultraviolet, disinfects, viruses, c...",[Developer of a portable device designed to us...,1 SOLUTION 98 Smart Home Devi...
3,3,2834,3_gleaning_fields_aggregation_2004,"[gleaning, fields, aggregation, 2004, wasteful...",[We are committed to bridging a critical gap i...,1 SOLUTION 1051 Gleaning 1069 Gleaning 1...
4,4,2834,4_shipping_errors_delay_efficiency,"[shipping, errors, delay, efficiency, automati...",[Developer of a business intelligence and deli...,1 SOLUTION 111 Intelligent Routing
5,5,2834,5_feed_method_bioconversion_biodiesel,"[feed, method, bioconversion, biodiesel, anima...",[Developer of novel bioconversion methods inte...,1 SOLUTION 3040 Livestock Feed 4692...
6,6,2834,6_anaerobic_digestion_digester_projects,"[anaerobic, digestion, digester, projects, was...",[Blue Sphere Corp is an international clean-te...,1 SOLUTION 3105 Ce...
7,7,2834,7_temperature_email_encrypt_gateway,"[temperature, email, encrypt, gateway, frequen...",[Developer of temperature monitoring platform ...,1 SOLUTION 145...
8,8,2834,8_compost_la_soil_scrap,"[compost, la, soil, scrap, collection, lawn, s...","[Operator of commercial waste, recycling and c...",1 SOLUTION 2467 Centralized...
9,9,2834,9_wastewater_water_offsite_uptime,"[wastewater, water, offsite, uptime, bills, tr...",[Developer of wastewater management digesters ...,1 SOLUT...


##### Predict solution categories for full dataset 

In [None]:
# Subset with non-missing Company Description
df_new = df.loc[(df['COMPANY DESCRIPTION'].notna())].reset_index()

solution = df['SOLUTION'].loc[(df['COMPANY DESCRIPTION'].notna())]
sol_encoded = le.fit_transform(solution) 

# Init empty list of predictions
all_preds = []

for i in range(len(df_new)):
    topic = re.sub(r'\W','',str(topic_model.transform(df_new['COMPANY DESCRIPTION'][i])[0]))
    all_preds.append(topic)

df_new['Predicted Solution'] = all_preds

# # Predicted categories are generated as numbers. Decode back to the actual category. 
# topic_dict = dict(zip(list(topic_model_out['Topic']),list(df['SOLUTION'].value_counts().index[1:len(df['SOLUTION'].value_counts())])))

# df['BERTopic_LIKELY_TOPIC_name'] = df['BERTopic_LIKELY_TOPIC'].astype(int).replace(topic_dict)

# # Dummy indicating whether the predicted category was accurate 
# df['MATCH'] = df['BERTopic_LIKELY_TOPIC_name'] == df['SOLUTION']

# # Accuracy of each iteration 
# score = df['MATCH'].loc[df['SOLUTION'] != ''].value_counts()

# print('N-Gram Range: ' + str(n_gram_range_val) + '; Top N Words ' + str(top_n_words_val))
# print(score)

In [549]:
df_new.head()

1,level_0,index,DATE,RECIPIENT,DEAL SIZE,FUNDER(S),DEAL DESCRIPTION,COMPANY DESCRIPTION,SOLUTION TYPE(S),SOLUTION,CAPITAL TYPE,FUNDING GROUP,INVESTMENT TYPE,DEAL STAGE,APPLICABILITY,Original Solution,Predicted Solution
0,0,2,2-15-2025,Chomp,n.a.,Martin Energy Group,The company was acquired by Martin Energy Grou...,"Manufacturer of patented, and containerized an...",Recycling,,Private,Corporate Finance & Spending,Merger/Acquisition,,Direct,0,6
1,1,3,1-28-2025,Knead,"$553,171",Business Development Bank of Canada\nGrowthX C...,"The company raised CAD 800,000 of seed funding...",Operator of a food recovery technology platfor...,Rescue,Donation Coordination & Matching,Private,Venture Capital,Seed Round,Seed Round,Direct,9,24
2,2,4,1-6-2025,Dyrt,"$4,711,998",n.a.,The company raised $4.71 million through a com...,Developer of an organic waste diversion platfo...,Recycling,,Private,Venture Capital,Seed Round,Seed Round,Direct,0,37
3,3,5,1-1-2025,Earnest,"$1,200,000",Ag Ventures Alliance\nAgLaunch\nSOSV,The company raised $1.2 million of venture fun...,Developer of an agricultural system and additi...,Recycling,Waste-Derived Agricultural Inputs,Private,Venture Capital,Later Stage VC,,Direct,43,26
4,4,6,12-16-2024,Brightly,"$2,300,000",Clear Current Capital\nCollaborative Fund\nG-F...,The company raised $2.3 million of seed fundin...,Operator of a food recovery company intended t...,Rescue,,Private,Venture Capital,Seed Round,Seed Round,Direct,0,0


##### Visualize the topic embeddings in a 2D space
This shows substantial overlap among many of the topics, which is likely driving much of the mislassification we see.

In [615]:
topic_model.visualize_topics() 

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
topic_dict = dict(zip(list(topic_model_out['Topic']),list(df['SOLUTION'].value_counts().index[1:len(df['SOLUTION'].value_counts())])))

df['BERTopic_LIKELY_TOPIC_name'] = df['BERTopic_LIKELY_TOPIC'].astype(int).replace(topic_dict)

df.head()

1,index,DATE,RECIPIENT,DEAL SIZE,FUNDER(S),DEAL DESCRIPTION,COMPANY DESCRIPTION,SOLUTION TYPE(S),SOLUTION,CAPITAL TYPE,FUNDING GROUP,INVESTMENT TYPE,DEAL STAGE,APPLICABILITY,BERTopic_LIKELY_TOPIC,BERTopic_LIKELY_TOPIC_name,MATCH
0,2,2-15-2025,Chomp,n.a.,Martin Energy Group,The company was acquired by Martin Energy Grou...,"Manufacturer of patented, and containerized an...",Recycling,,Private,Corporate Finance & Spending,Merger/Acquisition,,Direct,14,Centralized Anaerobic Digestion,False
1,3,1-28-2025,Knead,"$553,171",Business Development Bank of Canada\nGrowthX C...,"The company raised CAD 800,000 of seed funding...",Operator of a food recovery technology platfor...,Rescue,Donation Coordination & Matching,Private,Venture Capital,Seed Round,Seed Round,Direct,17,Donation Coordination & Matching,True
2,4,1-6-2025,Dyrt,"$4,711,998",n.a.,The company raised $4.71 million through a com...,Developer of an organic waste diversion platfo...,Recycling,,Private,Venture Capital,Seed Round,Seed Round,Direct,16,Community Composting,False
3,5,1-1-2025,Earnest,"$1,200,000",Ag Ventures Alliance\nAgLaunch\nSOSV,The company raised $1.2 million of venture fun...,Developer of an agricultural system and additi...,Recycling,Waste-Derived Agricultural Inputs,Private,Venture Capital,Later Stage VC,,Direct,12,Waste-Derived Agricultural Inputs,True
4,6,12-16-2024,Brightly,"$2,300,000",Clear Current Capital\nCollaborative Fund\nG-F...,The company raised $2.3 million of seed fundin...,Operator of a food recovery company intended t...,Rescue,,Private,Venture Capital,Seed Round,Seed Round,Direct,17,Donation Coordination & Matching,False


### 3. Plotting deal size by `Solution` category

##### Now we will use Solution category (the real one where available, or the predicted where none was originally listed) to create a visualization plotting investment size against solution type.

In [None]:
df.to_csv('refed_full.csv')

In [None]:
# Convert deal size to integer
df['DEAL_INT'] = df['DEAL SIZE'].str.replace(',', '').str.extract(r'([0-9]+)', expand = False).astype(float)

fig = go.Figure()

fig.add_trace(px.strip(
    x=df['DEAL_INT'], 
    y=df['BERTopic_LIKELY_TOPIC_name'], 

    marker=dict(
        size=12, 
        color='#cb1dd1', 
        opacity=0.8, 
        line=dict(width=1, color='black')
    ),
))

# Log-transform the x-axis
fig.update_xaxes(typ = "log")

## 4. Searching for Similar Capital Flows

##### Finally, we will build a semantic similarity model that looks for texts that are similar ReFED's descriptions of companies and investments. We'll deploy this model against against data from APIs and RSS feeds from various news/PR sources, in order to find news stories that may describe similar investments that might be of interest to ReFED.

In [None]:
# Init empty lists
all_content = []
all_headlines = []
all_links = []

##### Define a function that calls and rolls up APIs/RSS feeds from several news/press release services
- NewsAPI: This is a news aggregator service that pulls stories and metadata from thousands of international sources and aggregates them into its API. (Note: The current code uses the free version, which limits the number of results that can be pulled via API.)
- PR Newswire: This is a newswire service that publishes press releases from companies, governments, academic institutions, nonprofits, etc. It maintains separate RSS feeds for several topic areas of potential interest -- we will need to call them all separately.

(Note that if a tool like this would be worthwhile to ReFED, the organization might consider paid subscriptions to additional feeds/APIs from organizations that specialize in food/ag science/biotech. The present APIs were included mainly to provide proof of concept.)

In [None]:
def get_news():

    # NewsAPI
    headers = {'x-api-key': 'b2b12189dc8443bebddee191ee64d95c'}
    response_newsapi = requests.get("https://newsapi.org/v2/everything?q=(agriculture OR biotech OR food OR farm) AND (rescue OR waste) AND (venture OR investment OR acquires OR acquired OR 'seed funding' OR funds OR merger OR 'angel investor' OR incubator OR accelerator VC OR buyout)&language=en&sortBy=publishedAt", 
                            headers=headers) 

    # For each article, grab content, headline, and URL
    for i in range(len(response_newsapi.json()['articles'])):

        content = response_newsapi.json()['articles'][i]['content']
        all_content.append(content)

        headline = response_newsapi.json()['articles'][i]['title']
        all_headlines.append(headline)

        url = response_newsapi.json()['articles'][i]['url']
        all_links.append(url)

    # List of PR Newswire's topic-specific RSS feeds 
    response_env = requests.get("https://www.prnewswire.com/rss/environment-latest-news/environment-latest-news-list.rss")
    response_health = requests.get("https://www.prnewswire.com/rss/health-latest-news/health-latest-news-list.rss")
    response_policy = requests.get("https://www.prnewswire.com/rss/policy-public-interest-latest-news/policy-public-interest-latest-news-list.rss")                          
    response_tech = requests.get("https://www.prnewswire.com/rss/consumer-technology-latest-news/consumer-technology-latest-news-list.rss")

    responses = [response_env, response_health, response_policy, response_tech]

    # For each press release, grab content, headline, and URL
    for r in responses:
        soup = BeautifulSoup(r.content)

        items = soup.find_all('description')

        for item in items[1:21]: # Capturing the 20 news items that appear in each feed, while ignoring the first item which is a header
            content = item.text
            all_content.append(content)

        headlines = soup.find_all('title')

        for hd in headlines[1:21]:
            headline = hd.text 
            all_headlines.append(headline)

        links = soup.find_all('guid')

        for u in links:
            url = u.text 
            all_links.append(url)

    # Aggregate all items from all news/PR feeds 
    newsfeeds_new = pd.DataFrame({'Headline':all_headlines, 'Link':all_links, 'Content':all_content})
    
    # Save to parquet 
    newsfeeds_new.to_parquet('newsfeeds' + str(datetime.datetime.now()) + '.parquet')

    return newsfeeds_new

##### Automate hourly pulls from all of the news feeds 

In [None]:
# import schedule 
# schedule.every(1).hour.do(get_news)

# while True:
#    schedule.run_pending()
#    time.sleep(1)

##### Define the transformer model that we'll use to encode 

In [None]:
from sentence_transformers import SentenceTransformer, util 
model = SentenceTransformer("all-MiniLM-L6-v2")

##### Pull and aggregate all of the files from previous API/RSS pulls 

In [616]:
import glob

filelist = []

newsfeeds = get_news()

for file in glob.glob("prnewswire_rss*.parquet|newsfeeds*.parquet"):
    temp = pd.read_parquet(file)
    newsfeeds = pd.concat([newsfeeds, temp]).drop_duplicates

##### Calculate semantic similarity for each news article/press release

In [617]:
# Create column containing concatenation of `COMPANY DESCRIPTION` and `DEAL DESCRIPTION`
df['FULL_EMBED'] = df['COMPANY DESCRIPTION'] + '' + df['DEAL DESCRIPTION']

# Encode the concatenated column and the news articles
desc_embed = model.encode(df['FULL_EMBED'])
articles_embed = model.encode(newsfeeds['Content'].to_numpy())

# Calculate semantic similarity (using cosine similarity)
similarity = util.cos_sim(articles_embed, desc_embed)

In [618]:
# Append scores to df of news/press releases 
scores = []
for i in range(newsfeeds.shape[0]):
    scores.append(abs(similarity[i]).max())

newsfeeds['Similarity_Score'] = scores

newsfeeds.shape

(7975, 4)

In [619]:
# Sort so that highest-similarity articles appear first 
newsfeeds.sort_values(['Similarity_Score'], ascending = False).drop_duplicates('Headline').head(20)


Unnamed: 0,Headline,Link,Content,Similarity_Score
806,Circular Economy-Supporting Agreements - The E...,https://www.cleanthesky.com/innovation/waste-f...,The Waste Framework Directive has been provisi...,tensor(0.6761)
7959,DXC ernennt den ehemaligen COO der Federal Res...,https://www.prnewswire.com/news-releases/dxc-e...,"ASHBURN, Va., 27. Februar 2025 /PRNewswire/ --...",tensor(0.6613)
844,ONVY Raises Over $2M to Revolutionize AI-Power...,https://www.prnewswire.com/news-releases/onvy-...,"MUNICH, Feb. 26, 2025 /PRNewswire/ -- ONVY Hea...",tensor(0.6394)
7868,Putting pets at the paw-front,https://northern.starweekly.com.au/news/puttin...,Animal welfare organisations are being encoura...,tensor(0.6162)
112,American Packaging Corporation Advances Renewa...,https://www.prnewswire.com/news-releases/ameri...,White Paper on Material Options and Considerat...,tensor(0.6011)
110,Geared for GREEN - Circular Economy Solutions,https://www.prnewswire.com/news-releases/geare...,"MIAMI, Feb. 25, 2025 /PRNewswire/ -- Geared fo...",tensor(0.5976)
73,10 Ways To Take Sustainability to the Next Level,http://meetings.skift.com/2025/02/21/10-ways-t...,The next level of sustainability extends beyon...,tensor(0.5828)
7938,The Life You Can Save and Educate Girls Partne...,https://www.prnewswire.com/news-releases/the-l...,"Matched-funding campaign aims to raise $300,00...",tensor(0.5720)
1063,Madison Logic Launches ABM Certification Program,https://www.prnewswire.com/news-releases/madis...,New interactive course aims to simplify B2B ma...,tensor(0.5700)
7803,Ontario 'grocery outlet' sells food for a lot ...,https://www.blogto.com/eat_drink/2025/02/ontar...,With 13 locations across Ontario (and more to ...,tensor(0.5687)


In [620]:
newsfeeds.sort_values(['Similarity_Score'], ascending = False).drop_duplicates('Headline').to_csv('scores.csv')