In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Define the project root as the directory containing the notebook, or adjust as needed
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Define the path to the JSON file relative to the project root
json_file_path = os.path.join(PROJECT_ROOT, 'data', 'raw', 'combined_organizations.json')

# Load the JSON file into a pandas DataFrame
def load_json_as_dataframe(file_path):
    return pd.read_json(file_path, orient='records')

# Load the combined_organizations.json file
combined_organizations = load_json_as_dataframe(json_file_path)

# Now, combined_organizations_df contains the loaded data in a pandas DataFrame
print(combined_organizations.info())  # This will give you an overview of the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451528 entries, 0 to 451527
Data columns (total 35 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   SME                 382461 non-null  object 
 1   active              382461 non-null  object 
 2   activityType        382461 non-null  object 
 3   city                382461 non-null  object 
 4   contactForm         382461 non-null  object 
 5   contentUpdateDate   382461 non-null  object 
 6   country             382461 non-null  object 
 7   ecContribution      382461 non-null  object 
 8   endOfParticipation  382461 non-null  object 
 9   geolocation         382461 non-null  object 
 10  name                451528 non-null  object 
 11  netEcContribution   382461 non-null  object 
 12  nutsCode            382461 non-null  object 
 13  order               382461 non-null  float64
 14  organisationID      382461 non-null  object 
 15  organizationURL     382461 non-nul

In [6]:
# Filter for UK organizations and extract unique organization names
unique_uk_organizations = combined_organizations[combined_organizations['country'] == 'UK']['name'].unique()

# Check the number of unique organization names
print(f"Number of unique organizations in the UK: {len(unique_uk_organizations)}")


Number of unique organizations in the UK: 5422


In [7]:
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Initialize the model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to create and save embeddings for unique organization names
def create_and_save_embeddings(names, model, output_file='uk_unique_org_embeddings.npy'):
    tqdm.pandas(desc="Generating embeddings")  # Enable progress bar with description
    # Generate embeddings with progress bar and save them
    embeddings = [model.encode(str(name)) for name in tqdm(names, desc="Generating embeddings", unit="org")]
    embeddings = np.array(embeddings)
    np.save(output_file, embeddings)  # Save embeddings to a file
    return embeddings

# Generate and save embeddings for unique UK organization names
uk_unique_org_embeddings = create_and_save_embeddings(unique_uk_organizations, model, 'uk_unique_org_embeddings.npy')

# Check the first few embeddings
print(uk_unique_org_embeddings[:5])


Generating embeddings:   8%|▊         | 443/5422 [00:31<10:38,  7.79org/s]

In [None]:
# # Load previously saved embeddings
# uk_unique_org_embeddings = np.load('uk_unique_org_embeddings.npy')

# # Ensure that the embeddings are correctly loaded
# print(f"Loaded embeddings shape: {uk_unique_org_embeddings.shape}")
# print(uk_unique_org_embeddings[:5])
