# Enhancing, Expanding, and Analyzing ReFED's Capital Tracker

In [None]:
!pip install -r requirements.txt

In [4]:
import numpy as np
import pandas as pd
import requests
import json
import glob
import datetime
import time
import re
import selenium
from bs4 import BeautifulSoup

import plotly.graph_objects as go
import plotly.express as px

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import RandomOverSampler

from sentence_transformers import SentenceTransformer

## 1. Scraping the ReFED Capital Tracker
##### We begin by grabbing the data from [this table](https://insights-engine.refed.org/capital-tracker/list?dateFrom=2012-01-01&dateTo=2025-02-19&list.page=1&list.searchScope[]=funder_name,funder_desc,recipient_name,recipient_desc&list.sortBy=name&list.view=investments). This involves setting up a scraper with an operation to page through all 58 pages of the table; the `selenium` package works well for this.

##### Perform the scrape

In [None]:
# Initialize empty list to be populated
x = []

# Positions of the pagination buttons to be "clicked" in sequential order
clicks = sum([list(range(0, 9)), [7] * 42, list(range(8, 14))], [])

# Basic scraper setup
driver = webdriver.Firefox()
raw = driver.get("https://insights-engine.refed.org/capital-tracker/list?dateFrom=2012-01-01&dateTo=2025-02-19&list.page=1&list.searchScope[]=funder_name,funder_desc,recipient_name,recipient_desc&list.sortBy=name&list.view=investments")

for i in clicks:

    # Automating selenium to click the appropriate button to go to the next page of the table
    pages = driver.find_elements(By.CLASS_NAME, 'pagination__item')
    driver.execute_script("arguments[0].click();", pages[i])

    # Pause to allow page to load before scraping data
    time.sleep(3)

    # Find all rows on a given page of the table
    rows = driver.find_elements(By.CLASS_NAME, "table2--row")

    # For each row...
    for row in rows:

        rowdata = []

        # Find all cells in the row and add the data to rowdata
        cells = row.find_elements(By.CLASS_NAME, "table2--cell")
        for cell in cells:
            text = cell.get_attribute('innerText')
            rowdata.append(text)

        # add the full row of data to the master list
        x.append(rowdata)

driver.close()

# master list to df
df = pd.DataFrame(x)

##### Clean the scraped data

In [None]:
# Rename columns using header row
df.columns = df.iloc[1]

# Exclude rows with null dates and duplicates of the header row
df = df.loc[(df['DATE'] != 'DATE') & (df['DATE'] != '')].dropna(subset = ['DATE'])

# Exclude null columns
df = df[df.columns[~df.columns.isnull()]].reset_index()

df.shape

(5651, 14)

In [None]:
df.head()

1,index,DATE,RECIPIENT,DEAL SIZE,FUNDER(S),DEAL DESCRIPTION,COMPANY DESCRIPTION,SOLUTION TYPE(S),SOLUTION,CAPITAL TYPE,FUNDING GROUP,INVESTMENT TYPE,DEAL STAGE,APPLICABILITY
0,2,2-18-2025,Ripe.io,n.a.,n.a.,The company is no longer actively in business ...,Developer of a distributed supply chain softwa...,Prevention,Inventory Traceability,Private,Corporate Finance & Spending,Out of Business,,Direct
1,3,2-15-2025,Chomp,n.a.,Martin Energy Group,The company was acquired by Martin Energy Grou...,"Manufacturer of patented, and containerized an...",Recycling,,Private,Corporate Finance & Spending,Merger/Acquisition,,Direct
2,4,2-13-2025,Nabaco,"$1,200,000",n.a.,The company raised $1.2 million of Seed-6 fund...,Manufacturer of fruit protection system design...,General,,Private,Venture Capital,Seed Round,Seed Round,Direct
3,5,1-28-2025,Knead,"$553,171",Business Development Bank of Canada\nGrowthX C...,"The company raised CAD 800,000 of seed funding...",Operator of a food recovery technology platfor...,Rescue,Donation Coordination & Matching,Private,Venture Capital,Seed Round,Seed Round,Direct
4,6,1-8-2025,SCO2,n.a.,StartLife,The company joined StartLife as a part of it's...,Developer of a food and agricultural waste rec...,Recycling,Waste-Derived Biomaterials,Private,Venture Capital,Accelerator/Incubator,,Direct


##### Take a quick look at the Solution column

In [None]:
print(df['SOLUTION'].value_counts())

print('Number of Categories: ' + str(len(df['SOLUTION'].value_counts()) - 1))

SOLUTION
                                                    3738
Gleaning                                             222
Manufacturing Byproduct Utilization (Upcycling)      192
Meal Kits                                            129
Waste-Derived Biomaterials                            93
Insect Farming                                        90
Centralized Composting                                76
Edible Coatings                                       67
Precision Food Safety                                 66
Waste-Derived Bio-Plastics                            63
Active & Intelligent Packaging                        63
Inventory Traceability                                62
Imperfect & Surplus Produce Channels                  61
Waste-Derived Agricultural Inputs                     60
Donation Transportation                               58
Centralized Anaerobic Digestion                       52
Enhanced Demand Planning                              49
Community Composting  

## 2. Predicting Missing `Solution` Categories

##### The table we just scraped has a useful and fairly detailed 'Solution' column which classifies investments in 46 discrete categories. However, this field is missing for approximately two-thirds of the investments.

##### Next we'll apply `BERTopic`, the topic modeling framework based on a prominent language model (BERT), to predict the missing `Solution` values. Specifically, we will deploy BERTopic as a supervised model which we will train on `Company Description`, a field in our table that contains unstructured text describing the recipient of each investment.

In [5]:
df = pd.read_parquet('refed.parquet')

##### Define training and response data

In [6]:
df_reduced = df[['COMPANY DESCRIPTION', 'SOLUTION']].loc[(df['SOLUTION'] != '') & (df['SOLUTION'].notna()) & (df['COMPANY DESCRIPTION'] != '') & (df['COMPANY DESCRIPTION'].notna())]

df_reduced = df_reduced.drop_duplicates()

df_reduced.shape

(429, 2)

In [7]:
train = df_reduced[['COMPANY DESCRIPTION']]
y = df_reduced[['SOLUTION']]

# Convert the string labels in 'y' to numerical labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # Convert string labels to numerical labels

##### Evaluate imbalance of Solution categories
This shows a pretty severe imbalance, with more than a dozen categories that have only 1 sample, while the largest groups have 20+. We will need to address this.

In [8]:
px.histogram(y, x = 'SOLUTION')

In [None]:
y['SOLUTION'].value_counts()

Unnamed: 0_level_0,count
SOLUTION,Unnamed: 1_level_1
Manufacturing Byproduct Utilization (Upcycling),50
Meal Kits,32
Centralized Composting,28
Centralized Anaerobic Digestion,28
Waste-Derived Biomaterials,23
Inventory Traceability,19
Insect Farming,17
Imperfect & Surplus Produce Channels,17
Precision Food Safety,16
Markdown Alert Applications,12


##### Oversample to address sparsity

To mitigate bias from the severe imbalance we found above, we apply an oversampling technique. While a method that applies a nearest-neighbors approach (e.g., SMOTE) is typically more efficient, such methods require each category to have at least 2 samples. Because that criteria is not fulfilled, we resort to a "naive" random oversampling strategy.

In [9]:
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(train, y_encoded)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state = 2525, test_size=0.2)

##### Pre-calculate embeddings

Because BERT and BERTopic rely on document-based embeddings (rather than simple word embeddings), it is not necessary to conduct most of the typical NLP pre-processing (e.g., stemming, lemmatizing, tokenizing, etc.) In fact, [BERTopic documentation warns that these steps can actually undermine the efficacy of the model](https://maartengr.github.io/BERTopic/faq.html#should-i-preprocess-the-data).

However, because determining the document embeddings is cost-intensive, and since we are going to be testing different hyperparameters iteratively, calculating the embeddings ahead of time will drastically speed things up.

In [11]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(list(X_train))

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

##### Test supervised BERTopic model, iterating over various hyperparameter combinations

Because we are using BERTopic to do supervised training on a pre-established set of Solutions, the various parameters related to number of topics (e.g., nr_topics) or samples per topic (e.g., min_topic_size) are not relevant.

These are the hyperparameters that we will experiment with:

- N-Gram Range: The range of the number of discrete words that BERTopic will evaluate as a single token

- Top N Words: The max number of words that will be used to construct each topic.

In [42]:
X_test_newindex = X_test.reset_index(names = 'old_index')

empty_dimensionality_model = BaseDimensionalityReduction()
clf = LogisticRegression()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

for n_gram_range_val in [(1, 1), (1, 2), (1, 3)]:
    for top_n_words_val in [5, 10, 15]:

        # Create a supervised BERTopic instance
        topic_model= BERTopic(
                embedding_model = embedding_model,
                umap_model = empty_dimensionality_model,
                hdbscan_model = clf,
                ctfidf_model = ctfidf_model,
                low_memory = True,
                top_n_words = top_n_words_val,
                n_gram_range = n_gram_range_val
        )

        # Train model for each iteration
        topics, probs = topic_model.fit_transform(X_train['COMPANY DESCRIPTION'], y = y_train)

        topic_model_out = topic_model.get_topic_info()

        # Generate and save predicted topic numbers
        all_preds = []

        for i in range(len(X_test_newindex)):
            topic = re.sub(r'\W','',str(topic_model.transform(X_test_newindex['COMPANY DESCRIPTION'].iloc[i])[0]))
            all_preds.append(topic)

        # Put old and new topic numbers in data frame
        z = pd.DataFrame()
        z['Predicted'] = all_preds
        z['Original'] = y_test

        # Create dictionaries to convert topic numbers back to topic names -- note that the predictions and original topics have different encodings, and therefore require separate dictionaries to map back to text
        topic_output = topic_model.get_topic_info()

        topic_names = []

        # Defining dictionary for predicted solutions
        for i in range(len(topic_output['Representative_Docs'])):
            doc_clean = re.sub(r'\[|\]','',topic_output['Representative_Docs'][i][1])[0:50]
            temp_df = df[['SOLUTION']].loc[df['COMPANY DESCRIPTION'].str.contains(doc_clean, regex = False)].reset_index()

            if temp_df['SOLUTION'][0] != '':
                sol_name = temp_df['SOLUTION'][0]
            else:
                sol_name = temp_df['SOLUTION'][1]

            topic_names.append(sol_name)

        topic_dictionary_pred = dict(zip(topic_output['Topic'], topic_names))

        z['Predicted Name'] = z['Predicted'].astype(int).map(topic_dictionary_pred)

        # Defining dictionary for original solutions
        topic_dictionary_orig = dict(zip(y_encoded, y['SOLUTION']))

        z['Original Name'] = z['Original'].astype(int).map(topic_dictionary_orig)

        z['Match'] = z['Original Name'] == z['Predicted Name']

        print('N-Gram Range: ' + str(n_gram_range_val) + '; Top N Words ' + str(top_n_words_val))

        # accuracy
        acc = z['Match'].value_counts()[0] / len(z)
        print('Accuracy Rate: ' + str(acc))

        all_recalls = []
        all_precs = []
        all_f1s = []

        for p in z['Predicted Name'].unique():

            # True positives
            tp = z['Match'].loc[(z['Match'] == True) & (z['Predicted Name'] == p)].value_counts()[0]

            # False negatives
            if len(z['Match'].loc[(z['Match'] == False) & (z['Original Name'] == p)].value_counts()) == 0:
                fn = 0
            else:
                fn = z['Match'].loc[(z['Match'] == False) & (z['Original Name'] == p)].value_counts()[0]

            if len(z['Match'].loc[(z['Match'] == False) & (z['Predicted Name'] == p)].value_counts()) == 0:
                fp = 0
            else:
                fp = z['Match'].loc[(z['Match'] == False) & (z['Predicted Name'] == p)].value_counts()[0]

            recall = tp / (tp + fn)
            precision = tp / (tp + fp)
            f1 = 2 * (precision * recall) / (precision + recall)

            all_recalls.append(recall)
            all_precs.append(precision)
            all_f1s.append(f1)
            print('F1 Score, ' + p + ' : ' + str(f1))

        print('Average Recall: ' + str(np.mean(all_recalls)))
        print('Average Precision: ' + str(np.mean(all_precs)))
        print('Average F1 Score: ' + str(np.mean(all_f1s)))
        print('')


N-Gram Range: (1, 1); Top N Words 5
Accuracy Rate: 0.9434782608695652
F1 Score, First Expired First Out : 1.0
F1 Score, Early Spoilage Detection (Hyperspectral Imaging) : 0.9565217391304348
F1 Score, Package Design : 1.0
F1 Score, Imperfect & Surplus Produce Channels : 0.8571428571428571
F1 Score, Enhanced Demand Planning : 1.0
F1 Score, Modified Atmosphere Packaging System : 1.0
F1 Score, Donation Value-Added Processing : 1.0
F1 Score, Inventory Traceability : 0.8750000000000001
F1 Score, Manufacturing Byproduct Utilization (Upcycling) : 0.8000000000000002
F1 Score, Assisted Distressed Sales : 0.9600000000000001
F1 Score, Centralized Composting : 0.7142857142857143
F1 Score, Waste-Derived Bio-Plastics : 1.0
F1 Score, Markdown Alert Applications : 0.9333333333333333
F1 Score, Community Composting : 0.823529411764706
F1 Score, Online Marketplace Platform : 0.875
F1 Score, Insect Farming : 0.9
F1 Score, K-12 Education Campaigns : 1.0
F1 Score, Intelligent Routing : 1.0
F1 Score, Gleaning

All of the models performed identically, achieving 94.3% accuracy and an F1 score of .94 out-of-sample.

##### Select and train final model
In this case, all of the models performed identically on the test set. That suggests the exact selection of hyperparameters has little effect; we will therefore revert to the defaults.

In [14]:
topic_model = BERTopic(
                umap_model = empty_dimensionality_model,
                hdbscan_model = clf,
                ctfidf_model = ctfidf_model,
                low_memory = True
        )

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(list(X_resampled))

In [15]:
# Train model
topics, probs = topic_model.fit_transform(X_resampled['COMPANY DESCRIPTION'], y = y_resampled)

##### Create dictionary to re-map encoded solutions back to names

In order to which BERTopic-determined topic numbers correspond to which actual topic names, we have to reverse engineer them.

In [16]:
topic_output = topic_model.get_topic_info()

topic_names = []

for i in range(len(topic_output['Representative_Docs'])):
    doc_clean = re.sub(r'\[|\]','',topic_output['Representative_Docs'][i][1])[0:50]
    temp_df = df[['SOLUTION']].loc[df['COMPANY DESCRIPTION'].str.contains(doc_clean, regex = False)].reset_index()

    if temp_df['SOLUTION'][0] != '':
        sol_name = temp_df['SOLUTION'][0]
    else:
        sol_name = temp_df['SOLUTION'][1]

    topic_names.append(sol_name)

topic_dictionary = dict(zip(topic_output['Topic'], topic_names))

##### Predict solution categories for full dataset

In [17]:
# Take subset with non-missing Company Description
df_new = df.loc[(df['COMPANY DESCRIPTION'].notna())].reset_index()

solution = df['SOLUTION'].loc[(df['COMPANY DESCRIPTION'].notna())]
sol_encoded = le.fit_transform(solution)

# Init empty list of predictions
all_preds = []

for i in range(len(df_new)):
    topic = re.sub(r'\W','',str(topic_model.transform(df_new['COMPANY DESCRIPTION'][i])[0]))
    all_preds.append(topic)

df_new['Predicted Solution Number'] = all_preds

In [18]:
# Map topics using dictionary
df_new['Predicted Solution'] = df_new['Predicted Solution Number'].astype(int).map(topic_dictionary)

In [19]:
df_new.head()

1,level_0,index,DATE,RECIPIENT,DEAL SIZE,FUNDER(S),DEAL DESCRIPTION,COMPANY DESCRIPTION,SOLUTION TYPE(S),SOLUTION,CAPITAL TYPE,FUNDING GROUP,INVESTMENT TYPE,DEAL STAGE,APPLICABILITY,Predicted Solution Number,Predicted Solution
0,0,2,2-15-2025,Chomp,n.a.,Martin Energy Group,The company was acquired by Martin Energy Grou...,"Manufacturer of patented, and containerized an...",Recycling,,Private,Corporate Finance & Spending,Merger/Acquisition,,Direct,6,Centralized Anaerobic Digestion
1,1,3,1-28-2025,Knead,"$553,171",Business Development Bank of Canada\nGrowthX C...,"The company raised CAD 800,000 of seed funding...",Operator of a food recovery technology platfor...,Rescue,Donation Coordination & Matching,Private,Venture Capital,Seed Round,Seed Round,Direct,0,Donation Coordination & Matching
2,2,4,1-6-2025,Dyrt,"$4,711,998",n.a.,The company raised $4.71 million through a com...,Developer of an organic waste diversion platfo...,Recycling,,Private,Venture Capital,Seed Round,Seed Round,Direct,36,Community Composting
3,3,5,1-1-2025,Earnest,"$1,200,000",Ag Ventures Alliance\nAgLaunch\nSOSV,The company raised $1.2 million of venture fun...,Developer of an agricultural system and additi...,Recycling,Waste-Derived Agricultural Inputs,Private,Venture Capital,Later Stage VC,,Direct,12,Waste-Derived Agricultural Inputs
4,4,6,12-16-2024,Brightly,"$2,300,000",Clear Current Capital\nCollaborative Fund\nG-F...,The company raised $2.3 million of seed fundin...,Operator of a food recovery company intended t...,Rescue,,Private,Venture Capital,Seed Round,Seed Round,Direct,17,Donation Transportation


##### Evaluate in-sample performance

In [29]:
df_new.columns

Index(['level_0', 'index', 'DATE', 'RECIPIENT', 'DEAL SIZE', 'FUNDER(S)',
       'DEAL DESCRIPTION', 'COMPANY DESCRIPTION', 'SOLUTION TYPE(S)',
       'SOLUTION', 'CAPITAL TYPE', 'FUNDING GROUP', 'INVESTMENT TYPE',
       'DEAL STAGE', 'APPLICABILITY', 'Predicted Solution Number',
       'Predicted Solution', 'MATCH'],
      dtype='object', name=1)

In [38]:
# Generate match dummy
df_new['MATCH'] = (df_new['SOLUTION'] == df_new['Predicted Solution'])

# Overall accuracy
# Generate match dummy
df_new['MATCH'] = (df_new['SOLUTION'] == df_new['Predicted Solution'])

# Overall accuracy
match_results = df_new['MATCH'].loc[(df_new['COMPANY DESCRIPTION'].notna()) & (df_new['COMPANY DESCRIPTION'] != '') & (df_new['SOLUTION'].notna()) & (df_new['SOLUTION'] != '')].value_counts()

print('Overall In-Sample Accuracy Rate: ' + str(match_results[0] / (match_results[0] + match_results[1])))

all_recalls = []
all_precs = []
all_f1s = []

for p in df_new['Predicted Solution'].unique():

    # True positives
    tp = df_new['MATCH'].loc[(df_new['MATCH'] == True) & (df_new['Predicted Solution'] == p)].value_counts()[0]

    # False negatives
    if len(df_new['MATCH'].loc[(df_new['MATCH'] == False) & (df_new['SOLUTION'] == p)].value_counts()) == 0:
        fn = 0
    else:
        fn = df_new['MATCH'].loc[(df_new['MATCH'] == False) & (df_new['SOLUTION'] == p)].value_counts()[0]

    if len(df_new['MATCH'].loc[(df_new['MATCH'] == False) & (df_new['Predicted Solution'] == p)].value_counts()) == 0:
        fp = 0
    else:
        fp = df_new['MATCH'].loc[(df_new['MATCH'] == False) & (df_new['Predicted Solution'] == p)].value_counts()[0]

    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    f1 = 2 * (precision * recall) / (precision + recall)

    all_recalls.append(recall)
    all_precs.append(precision)
    all_f1s.append(f1)

print('Average Recall: ' + str(np.mean(all_recalls)))
print('Average Precision: ' + str(np.mean(all_precs)))
print('Average F1 Score: ' + str(np.mean(all_f1s)))

Overall In-Sample Accuracy Rate: 0.9241494701617401
Average Recall: 0.9477764993833466
Average Precision: 0.7245536425761144
Average F1 Score: 0.787057307341232


In [None]:
df_new[['RECIPIENT', 'COMPANY DESCRIPTION', 'SOLUTION', 'Predicted Solution']].to_csv('df_new.csv')

##### Visualize the topic embeddings in a 2D space
This shows substantial overlap among many of the topics, which is likely driving much of the mislassification we see.

In [None]:
topic_model.visualize_topics()

## 3. Plotting deal size by `Solution` category

##### Now we will use Solution category (the real one where available, or the predicted where none was originally listed) to create a visualization plotting investment size against solution type.

In [None]:
# Convert deal size to integer
df_new['DEAL_INT'] = df_new['DEAL SIZE'].str.replace(',', '').str.extract(r'([0-9]+)', expand = False).astype(float)

# Coalesce to actual Solution category where available; otherwise use predicted
df_new['SOLUTION_COALESCE'] = df_new['Predicted Solution'].combine_first(df_new['SOLUTION'])

# Replace empties with 'Unknown'
df_new['SOLUTION_COALESCE'].loc[df_new['SOLUTION_COALESCE'] == ''] = 'Unknown'

In [None]:
df_new.SOLUTION_COALESCE.value_counts()

Unnamed: 0_level_0,count
SOLUTION_COALESCE,Unnamed: 1_level_1
Gleaning,2270
Donation Transportation,1094
Manufacturing Byproduct Utilization (Upcycling),298
Meal Kits,169
Donation Coordination & Matching,164
Waste-Derived Biomaterials,100
Centralized Composting,95
Temperature Monitoring (Pallet Transport),90
Insect Farming,89
Waste-Derived Agricultural Inputs,79


In [None]:
df_new.head()

1,level_0,index,DATE,RECIPIENT,DEAL SIZE,FUNDER(S),DEAL DESCRIPTION,COMPANY DESCRIPTION,SOLUTION TYPE(S),SOLUTION,...,FUNDING GROUP,INVESTMENT TYPE,DEAL STAGE,APPLICABILITY,Predicted Solution Number,Predicted Solution,MATCH,DEAL_INT,SOLUTION_COALESCE,IS_ORIG
0,0,2,2-15-2025,Chomp,n.a.,Martin Energy Group,The company was acquired by Martin Energy Grou...,"Manufacturer of patented, and containerized an...",Recycling,,...,Corporate Finance & Spending,Merger/Acquisition,,Direct,6,Centralized Anaerobic Digestion,False,,Centralized Anaerobic Digestion,Actual
1,1,3,1-28-2025,Knead,"$553,171",Business Development Bank of Canada\nGrowthX C...,"The company raised CAD 800,000 of seed funding...",Operator of a food recovery technology platfor...,Rescue,Donation Coordination & Matching,...,Venture Capital,Seed Round,Seed Round,Direct,0,Donation Coordination & Matching,True,553171.0,Donation Coordination & Matching,Actual
2,2,4,1-6-2025,Dyrt,"$4,711,998",n.a.,The company raised $4.71 million through a com...,Developer of an organic waste diversion platfo...,Recycling,,...,Venture Capital,Seed Round,Seed Round,Direct,36,Community Composting,False,4711998.0,Community Composting,Actual
3,3,5,1-1-2025,Earnest,"$1,200,000",Ag Ventures Alliance\nAgLaunch\nSOSV,The company raised $1.2 million of venture fun...,Developer of an agricultural system and additi...,Recycling,Waste-Derived Agricultural Inputs,...,Venture Capital,Later Stage VC,,Direct,12,Waste-Derived Agricultural Inputs,True,1200000.0,Waste-Derived Agricultural Inputs,Actual
4,4,6,12-16-2024,Brightly,"$2,300,000",Clear Current Capital\nCollaborative Fund\nG-F...,The company raised $2.3 million of seed fundin...,Operator of a food recovery company intended t...,Rescue,,...,Venture Capital,Seed Round,Seed Round,Direct,17,Donation Transportation,False,2300000.0,Donation Transportation,Actual


In [None]:
df_new['IS_ORIG'] = np.where(df_new['SOLUTION'] == '', 'Predicted', 'Actual')

df_new['DEAL_INT'] = df_new['DEAL SIZE'].str.replace(',', '').str.extract(r'([0-9]+)', expand = False).astype(float)

fig = px.strip(df_new,
               x='DEAL_INT',
               y='SOLUTION_COALESCE',
               log_x = True,
               stripmode='group',
               custom_data = ['RECIPIENT', 'IS_ORIG'],
               color = 'SOLUTION_COALESCE',
               color_discrete_sequence = px.colors.qualitative.Safe
               )

fig.update_traces(hovertemplate = "<br>".join(["Deal Size: %{x}",
                                               "Recipient: %{customdata[0]}",
                                               "Solution Type: %{y}",
                                               "Solution Type Source: %{customdata[1]}",
    ]),
                  jitter = 1.0,
                  marker = {'size': 10,
                            'line' : {'width': 1,
                                      'color': 'rgba(128, 128, 128, 1)'}}
                  )

fig.update_layout(
                title={
                    'text': 'Deal Size by Solution Category',
                    'x': 0.5,
                    'xanchor': 'center',
                    'y': 1,
                    'yanchor': 'top',
                    'font': {
                        'family': 'Arial',
                        'size': 24,
                        'color': 'grey'
                    }
                },
                  xaxis_title = 'Deal Size ($)',
                  yaxis_title = '',
                  paper_bgcolor='rgba(0,0,0,0)',
                  plot_bgcolor='rgba(0,0,0,0)',
                  bargap = 1,
                  showlegend = False)

fig.write_html('fig.html')

In [None]:
fig.show()

## 4. Searching for Similar Capital Flows

##### Finally, we will build a semantic similarity model that looks for texts that are similar ReFED's descriptions of companies and investments. We'll deploy this model against against data from APIs and RSS feeds from various news/PR sources, in order to find news stories that may describe similar investments that might be of interest to ReFED.

In [1]:
# Init empty lists
all_content = []
all_headlines = []
all_links = []

##### Define a function that calls and rolls up APIs/RSS feeds from several news/press release services
- NewsAPI: This is a news aggregator service that pulls stories and metadata from thousands of international sources and aggregates them into its API. (Note: The current code uses the free version, which limits the number of results that can be pulled via API.)
- PR Newswire: This is a newswire service that publishes press releases from companies, governments, academic institutions, nonprofits, etc. It maintains separate RSS feeds for several topic areas of potential interest -- we will need to call them all separately.

(Note that if a tool like this would be worthwhile to ReFED, the organization might consider paid subscriptions to additional feeds/APIs from organizations that specialize in food/ag science/biotech. The present APIs were included mainly to provide proof of concept.)

In [2]:
def get_news():

    # NewsAPI
    headers = {'x-api-key': 'b2b12189dc8443bebddee191ee64d95c'}
    response_newsapi = requests.get("https://newsapi.org/v2/everything?q=(agriculture OR biotech OR food OR farm) AND (rescue OR waste) AND (venture OR investment OR acquires OR acquired OR 'seed funding' OR funds OR merger OR 'angel investor' OR incubator OR accelerator VC OR buyout)&language=en&sortBy=publishedAt",
                            headers=headers)

    # For each article, grab content, headline, and URL
    for i in range(len(response_newsapi.json()['articles'])):

        content = response_newsapi.json()['articles'][i]['content']
        all_content.append(content)

        headline = response_newsapi.json()['articles'][i]['title']
        all_headlines.append(headline)

        url = response_newsapi.json()['articles'][i]['url']
        all_links.append(url)

    # List of PR Newswire's topic-specific RSS feeds
    response_env = requests.get("https://www.prnewswire.com/rss/environment-latest-news/environment-latest-news-list.rss")
    response_health = requests.get("https://www.prnewswire.com/rss/health-latest-news/health-latest-news-list.rss")
    response_policy = requests.get("https://www.prnewswire.com/rss/policy-public-interest-latest-news/policy-public-interest-latest-news-list.rss")
    response_tech = requests.get("https://www.prnewswire.com/rss/consumer-technology-latest-news/consumer-technology-latest-news-list.rss")

    responses = [response_env, response_health, response_policy, response_tech]

    # For each press release, grab content, headline, and URL
    for r in responses:
        soup = BeautifulSoup(r.content)

        items = soup.find_all('description')

        for item in items[1:21]: # Capturing the 20 news items that appear in each feed, while ignoring the first item which is a header
            content = item.text
            all_content.append(content)

        headlines = soup.find_all('title')

        for hd in headlines[1:21]:
            headline = hd.text
            all_headlines.append(headline)

        links = soup.find_all('guid')

        for u in links:
            url = u.text
            all_links.append(url)

    # Aggregate all items from all news/PR feeds
    newsfeeds_new = pd.DataFrame({'Headline':all_headlines, 'Link':all_links, 'Content':all_content})

    # Save to parquet
    newsfeeds_new.to_parquet('newsfeeds' + str(datetime.datetime.now()) + '.parquet')

    return newsfeeds_new

##### Automate hourly pulls from all of the news feeds

In [3]:
# import schedule
# schedule.every(1).hour.do(get_news)

# while True:
#    schedule.run_pending()
#    time.sleep(1)

##### Define the transformer model that we'll use to encode

In [4]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

##### Pull and aggregate all of the files from previous API/RSS pulls

In [None]:
 filelist = []

newsfeeds = get_news()

for file in glob.glob("prnewswire_rss*.parquet|newsfeeds*.parquet"):
    temp = pd.read_parquet(file)
    newsfeeds = pd.concat([newsfeeds, temp]).drop_duplicates

##### Calculate semantic similarity for each news article/press release

In [13]:
# Create column containing concatenation of `COMPANY DESCRIPTION` and `DEAL DESCRIPTION`
df['FULL_EMBED'] = df['COMPANY DESCRIPTION'] + '' + df['DEAL DESCRIPTION']

# Encode the concatenated column and the news articles
desc_embed = model.encode(df['FULL_EMBED'])
articles_embed = model.encode(newsfeeds['Content'].to_numpy())

# Calculate semantic similarity (using cosine similarity)
similarity = util.cos_sim(articles_embed, desc_embed)

In [14]:
# Append scores to df of news/press releases
scores = []
for i in range(newsfeeds.shape[0]):
    scores.append(abs(similarity[i]).max())

newsfeeds['Similarity_Score'] = scores

newsfeeds.shape

(896, 4)

In [15]:
# Sort so that highest-similarity articles appear first
newsfeeds.sort_values(['Similarity_Score'], ascending = False).drop_duplicates('Headline').head(20)


Unnamed: 0,Headline,Link,Content,Similarity_Score
353,"TRAiNED, Inc. Launches Investment Campaign on ...",https://www.prnewswire.com/news-releases/train...,"PITTSBURGH, March 2, 2025 /PRNewswire/ -- TRAi...",tensor(0.5861)
140,Ontario 'grocery outlet' sells food for a lot ...,https://www.blogto.com/eat_drink/2025/02/ontar...,With 13 locations across Ontario (and more to ...,tensor(0.5687)
93,"USAID Funded Pet Apps, Fashion Companies, Desi...",https://www.dailysignal.com/2025/02/26/usaid-f...,The U.S. Agency for International Development ...,tensor(0.5620)
866,Our Rescue Expands Partnership with Shakti Sam...,https://www.prnewswire.com/news-releases/our-r...,Anti-trafficking Allies and Supporters Invited...,tensor(0.5551)
207,India-EU Hydrogen Partnership Flagship Project...,https://cleantechnica.com/2025/02/28/india-eu-...,Sign up for daily news updates from CleanTechn...,tensor(0.5375)
417,Indonesia pushes use of tech to process waste ...,https://en.antaranews.com/news/346597/indonesi...,Jakarta (ANTARA) - Indonesia's Coordinating Mi...,tensor(0.5319)
265,The line in the sand,https://www.americanthinker.com/blog/2025/02/t...,"In a nation of laws, borders, and shared burde...",tensor(0.5309)
464,Alternative Energy and Conservation Pioneer Ro...,https://www.prnewswire.com/news-releases/alter...,"ST. LOUIS, Feb. 28, 2025 /PRNewswire/ -- Roesl...",tensor(0.5226)
860,The Folded Flag Foundation Scholarship Applica...,https://www.prnewswire.com/news-releases/the-f...,"LAS VEGAS, March 1, 2025 /PRNewswire/ -- The F...",tensor(0.5160)
494,Fapon Biopharma annonce l'approbation par la F...,https://www.prnewswire.com/news-releases/fapon...,"DONGGUAN, Chine, 2er mars 2025 /PRNewswire/ --...",tensor(0.5132)


In [16]:
newsfeeds.sort_values(['Similarity_Score'], ascending = False).drop_duplicates('Headline').to_csv('scores.csv')