# Data Preprocessing

# Setup

In [43]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import sys
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
colours = sns.color_palette("Set2")
from sentence_transformers import SentenceTransformer, util
import warnings
warnings.filterwarnings("ignore")

#put the parent directory on the import path so the local utils module resolves
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.insert(0, project_root)
from utils import get_table_from_supabase

#read the .env file, then pick up the Supabase credentials from the environment
#(either value is None when the variable is not set)
load_dotenv()
url = os.environ.get("SUPABASE_URL")
key = os.environ.get("SUPABASE_KEY")

----

# Retrieving Data from Supabase and Building Dataframes

In [37]:
#download the funders and grants tables into dataframes
funders_df = get_table_from_supabase(url, key, "funders")
grants_df = get_table_from_supabase(url, key, "grants")

#download recipients with the helper's recipient filter enabled, 50 rows per batch
recipients_df = get_table_from_supabase(
    url,
    key,
    "recipients",
    batch_size=50,
    filter_recipients=True,
)

In [44]:
#get checkpoint folder - create it if it is missing so saving never fails on a fresh clone
checkpoint_folder = Path("./8.1_checkpoints/")
checkpoint_folder.mkdir(parents=True, exist_ok=True)

#create checkpoint - save dfs to pickle
#off by default; set to True after re-fetching from Supabase to refresh the raw checkpoints
SAVE_RAW_CHECKPOINTS = False
if SAVE_RAW_CHECKPOINTS:
    recipients_df.to_pickle(checkpoint_folder / "recipients_df.pkl")
    funders_df.to_pickle(checkpoint_folder / "funders_df.pkl")
    grants_df.to_pickle(checkpoint_folder / "grants_df.pkl")

----

# Retrieving Data from Checkpoints

In [None]:
#reload the raw dataframes from their pickles; the .pkl files must already exist in checkpoint_folder
#NOTE: pickle files can execute code on load - only read checkpoints you created yourself
funders_df = pd.read_pickle(checkpoint_folder.joinpath("funders_df.pkl"))
grants_df = pd.read_pickle(checkpoint_folder.joinpath("grants_df.pkl"))
recipients_df = pd.read_pickle(checkpoint_folder.joinpath("recipients_df.pkl"))

-----

# Feature Engineering

I will create embeddings for the relevant text columns. Each dataframe is processed in its own cell so that these time- and compute-heavy steps can be run independently of one another.

In [45]:
#load the sentence-embedding model once; the embedding cells below all reuse this instance
embedding_model_name = "all-roberta-large-v1"
model = SentenceTransformer(embedding_model_name)

## Embedding Creation

In [46]:
#text columns of funders_df to embed; each one gains a matching "<col>_em" column
funders_cols = ["name", "activities", "objectives", "objectives_activities", "achievements_performance", "grant_policy"]

#create embeddings
for col in funders_cols:
    #replace nans with empty string - rows with missing text are embedded as "" and still get a vector
    texts = funders_df[col].fillna("").tolist()
    embeddings = model.encode(texts)

    #add to df - one vector per row, in the same order as the texts
    funders_df[f"{col}_em"] = list(embeddings)

    print(f"{col} embedded successfully {embeddings.shape}")

#plain string: there is nothing to interpolate here
print("All embeddings created for funders_df!")

name embedded successfully (996, 1024)
activities embedded successfully (996, 1024)
objectives embedded successfully (996, 1024)
objectives_activities embedded successfully (996, 1024)
achievements_performance embedded successfully (996, 1024)
grant_policy embedded successfully (996, 1024)
All embeddings created for funders_df!


In [49]:
#text columns of recipients_df to embed; each one gains a matching "<col>_em" column
recipients_cols = ["recipient_name", "recipient_activities", "recipient_objectives"]

#create embeddings
for col in recipients_cols:
    #replace nans with empty string - rows with missing text are embedded as "" and still get a vector
    texts = recipients_df[col].fillna("").tolist()
    embeddings = model.encode(texts)

    #add to df - one vector per row, in the same order as the texts
    recipients_df[f"{col}_em"] = list(embeddings)

    print(f"{col} embedded successfully {embeddings.shape}")

#plain string: there is nothing to interpolate here
print("All embeddings created for recipients_df!")

recipient_name embedded successfully (17169, 1024)
recipient_activities embedded successfully (17169, 1024)
recipient_objectives embedded successfully (17169, 1024)
All embeddings created for recipients_df!


In [51]:
#text columns of grants_df to embed; each one gains a matching "<col>_em" column
grants_cols = ["grant_title", "grant_desc"]

#create embeddings
for col in grants_cols:
    #replace nans with empty string - rows with missing text are embedded as "" and still get a vector
    texts = grants_df[col].fillna("").tolist()
    embeddings = model.encode(texts)

    #add to df - one vector per row, in the same order as the texts
    grants_df[f"{col}_em"] = list(embeddings)

    print(f"{col} embedded successfully {embeddings.shape}")

#plain string: there is nothing to interpolate here
print("All embeddings created for grants_df!")

grant_title embedded successfully (32816, 1024)
grant_desc embedded successfully (32816, 1024)
All embeddings created for grants_df!


In [52]:
#get checkpoint folder
checkpoint_folder = Path("./8.1_checkpoints/")
#create the folder first: to_pickle fails if it does not exist, which would lose the embedding run
checkpoint_folder.mkdir(parents=True, exist_ok=True)

#create checkpoint - save dfs (now including the "_em" embedding columns) to pickle
recipients_df.to_pickle(checkpoint_folder / "recipients_df_em.pkl")
funders_df.to_pickle(checkpoint_folder / "funders_df_em.pkl")
grants_df.to_pickle(checkpoint_folder / "grants_df_em.pkl")