In [1]:
import json
import pandas as pd
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
# # Download the WordNet corpus
# nltk.download('wordnet')


  from tqdm.autonotebook import tqdm, trange


In [2]:
import json
import pandas as pd
from collections import defaultdict
from tqdm import tqdm  # Progress bar for monitoring

# Define your evidence query
evidence_query = "GHG Emission Regulation"

# Define the year range to filter
start_year = 2013
end_year = 2023

# File paths
jsonl_file_path = r'C:\Users\hoath\Git\LobbyMap_ML\data\processed\combined.jsonl'
csv_file_path = r'C:\Users\hoath\Git\LobbyMap_ML\data\processed\company_sector_region.csv'

# Load the company sector and region data
company_info = pd.read_csv(csv_file_path)
company_info_dict = company_info.set_index('company_name').to_dict(orient='index')

# Initialize lists to collect text data, years, sectors, and regions
text_data = []
years = []
sectors = []
regions = []

# Read the JSONL file and extract necessary fields with a progress bar
with open(jsonl_file_path, 'r', encoding='utf-8') as file:
    for line in tqdm(file, total=14000):  # Update total based on actual size
        entry = json.loads(line)
        document_id = entry.get('document_id')
        sentences = entry.get('sentences', [])
        meta_evidences = entry.get('meta', {}).get('evidences', [])

        for evidence in meta_evidences:
            if isinstance(evidence, list):
                for sub_evidence in evidence:
                    company_name = sub_evidence.get('company_name')
                    evidence_year = sub_evidence.get('evidence_year')
                    evidence_query_field = sub_evidence.get('evidence_query')

                    # Filter by the target evidence query and year range
                    if evidence_query_field == evidence_query and start_year <= evidence_year <= end_year:
                        for sentence in sentences:
                            text = sentence['text']
                            if isinstance(text, str) and text.strip():
                                text_data.append(text)
                                years.append(evidence_year)

                                if company_name in company_info_dict:
                                    sector = company_info_dict[company_name]['sector']
                                    region = company_info_dict[company_name]['region']
                                else:
                                    sector = "Unknown"
                                    region = "Unknown"

                                sectors.append(sector)
                                regions.append(region)
            elif isinstance(evidence, dict):
                company_name = evidence.get('company_name')
                evidence_year = evidence.get('evidence_year')
                evidence_query_field = evidence.get('evidence_query')

                # Filter by the target evidence query and year range
                if evidence_query_field == evidence_query and start_year <= evidence_year <= end_year:
                    for sentence in sentences:
                        text = sentence['text']
                        if isinstance(text, str) and text.strip():
                            text_data.append(text)
                            years.append(evidence_year)

                            if company_name in company_info_dict:
                                sector = company_info_dict[company_name]['sector']
                                region = company_info_dict[company_name]['region']
                            else:
                                sector = "Unknown"
                                region = "Unknown"

                            sectors.append(sector)
                            regions.append(region)

# Ensure all elements are strings and remove any that are not
text_data = [str(text) for text in text_data if isinstance(text, str) and text.strip()]

# Check if all lists have the same length
assert len(text_data) == len(years) == len(sectors) == len(regions), "Mismatch between text data, years, sectors, and regions"

# Print the dimensions and first 5 entries of text_data for debugging
print(f"Dimensions of text_data: {len(text_data)}")
print("First 5 entries in text_data:", text_data[:5])
print("First 5 entries in years:", years[:5])
print("First 5 entries in sectors:", sectors[:5])
print("First 5 entries in regions:", regions[:5])




 76%|███████▌  | 10604/14000 [00:09<00:03, 1072.02it/s]


Dimensions of text_data: 557275
First 5 entries in text_data: ['  1', 'COMMENTS OF THE CLASS OF ’85 REGULATORY RESPONSE GROUP', 'ON THE', 'PROPOSED STANDARDS OF PERFORMANCE FOR NEW, RECONSTRUCTED, AND', 'MODIFIED SOURCES AND EMISSIONS GUIDELINES FOR EXISTING SOURCES:']
First 5 entries in years: [2022, 2022, 2022, 2022, 2022]
First 5 entries in sectors: ['Electric Utilities', 'Electric Utilities', 'Electric Utilities', 'Electric Utilities', 'Electric Utilities']
First 5 entries in regions: ['North America', 'North America', 'North America', 'North America', 'North America']


In [3]:
# Check the unique years/sectors/regions in the dataset
unique_years = sorted(set(years))
print(f"Unique years in the dataset: {unique_years}")

unique_sector = sorted(set(sectors))
print(f"Unique sectors in the dataset: {unique_sector}")

unique_region = sorted(set(regions))
print(f"Unique years in the dataset: {unique_region}")


Unique years in the dataset: [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
Unique sectors in the dataset: ['Airlines', 'Automobiles', 'Chemicals', 'Consumer Goods & Services', 'Diversified Mining', 'Electric Utilities', 'Oil & Gas', 'Oil & Gas Distribution', 'Other transportation', 'Steel', 'Unknown', 'cement', 'other industrials', 'paper', 'shipping']
Unique years in the dataset: ['Africa', 'Asia', 'Australasia', 'Europe', 'Middle East', 'North America', 'South America', 'Unknown']


In [4]:
import re
import unicodedata
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Normalize text (remove accents)
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')
    
    # Convert text to lowercase
    text = text.lower()

    # Remove underscores, commas, and replace multiple spaces with a single space
    text = re.sub(r'[_,]+', ' ', text)  # Replace underscores and multiple commas with a single space
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space

    # Remove numerical tokens
    text = re.sub(r'\b\d+\b', '', text)

    # Remove any remaining special characters, including punctuation
    text = re.sub(r'[^\w\s]', '', text)  # Remove all characters except words and spaces

    # Remove specific domain-related stopwords
    domain_stopwords = set(["reg", "fig", "data", "figure", "page", "additional"])
    text = ' '.join([word for word in text.split() if word not in domain_stopwords])

    # Lemmatize each word in the text
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    # Remove short words (less than 3 characters) and strip extra spaces
    text = ' '.join([word for word in text.split() if len(word) > 2])

    return text

# Apply preprocessing to each entry in text_data
text_data = [preprocess_text(doc) for doc in text_data]

# Remove any entries that are empty or consist only of whitespace after preprocessing
text_data = [doc for doc in text_data if doc.strip()]




In [19]:
print(f"Dimensions of text_data: {len(text_data)}")

Dimensions of text_data: 478196


In [17]:
from sentence_transformers import SentenceTransformer
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np
from tqdm import tqdm  # Import tqdm for progress bar

# Define a custom dataset class for SentenceTransformer
class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

# Load the SentenceTransformer model
embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Explicitly set the device to CPU
device = "cpu"

# Create a DataLoader for batch processing
batch_size = 16  # Reduce batch size to avoid memory issues
num_workers = 0  # Number of CPU cores to use for data loading


dataset = TextDataset(text_data)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

# Debugging: Ensure that DataLoader is working properly
print("DataLoader length:", len(dataloader))

# Generate embeddings in batches with a progress bar
embeddings = []
for batch in tqdm(dataloader, desc="Generating Embeddings", unit="batch"):
    # Ensure batch is processed on the CPU
    batch_embeddings = embedding_model.encode(batch, convert_to_tensor=True, device=device)
    embeddings.append(batch_embeddings)

    # Debugging: Print after each batch to ensure loop is running
    #print(f"Processed batch size: {len(batch_embeddings)}")

# Concatenate all the embeddings into one tensor
embeddings = torch.cat(embeddings, dim=0)

# Convert back to NumPy array if needed
embeddings = embeddings.cpu().numpy()

# Step 2: Save the embeddings to a .npy file
np.save(r'C:\Users\hoath\Git\LobbyMap_ML\Embeddings\embeddings_GHG_full.npy', embeddings)
print("Embeddings saved successfully.")


DataLoader length: 29888


Generating Embeddings: 100%|██████████| 29888/29888 [1:17:40<00:00,  6.41batch/s]


Embeddings saved successfully.


In [18]:
import numpy as np

# Load the embeddings
embeddings = np.load(r'C:\Users\hoath\Git\LobbyMap_ML\Embeddings\embeddings_GHG_full.npy')

# Check the shape of the embeddings
print("Shape of embeddings:", embeddings.shape)


Shape of embeddings: (478196, 384)


In [None]:
import numpy as np

# # Step 2: Save the embeddings to a .npy file
# np.save('C:\Users\hoath\Git\LobbyMap_ML\Embeddings\embeddings_GHG.npy', embeddings)
# print("Embeddings saved successfully.")

# # Step 3: Load the embeddings when needed
# loaded_embeddings = np.load('C:\Users\hoath\Git\LobbyMap_ML\Embeddings\embeddings_GHG.npy')
# print(f"Loaded embeddings shape: {loaded_embeddings.shape}")

In [20]:
import numpy as np
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from stopwordsiso import stopwords  # Import stopwords-iso
from sentence_transformers import SentenceTransformer

# Load the precomputed embeddings
embeddings = np.load(r'C:\Users\hoath\Git\LobbyMap_ML\Embeddings\embeddings_GHG_full.npy')

# Load the SentenceTransformer model
embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Define UMAP and HDBSCAN models for dimensionality reduction and clustering
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=30, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Define the ClassTfidfTransformer with both recommended parameters
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

# Define the Maximal Marginal Relevance (MMR) model for topic representation
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# Define the KeyBERTInspired model for further refinement
keybert_model = KeyBERTInspired()

# Chain the MMR and KeyBERTInspired models together
representation_model = [mmr_model, keybert_model]

# Initialize BERTopic model with UMAP, HDBSCAN, customized ClassTfidfTransformer, and combined representation models
topic_model = BERTopic(umap_model=umap_model,
                       hdbscan_model=hdbscan_model,
                       embedding_model=embedding_model,
                       ctfidf_model=ctfidf_model,
                       representation_model=representation_model)

# Ensure text_data is a list of strings
if not isinstance(text_data, list):
    text_data = list(text_data)

# Fit the BERTopic model on the precomputed embeddings and text data
topics, probs = topic_model.fit_transform(text_data, embeddings=embeddings)

# Check if representative documents are being generated
topic_info = topic_model.get_topic_info()
print(topic_info.head(20))

# Save the updated topic model for future use
# topic_model.save(r"C:\Users\hoath\Git\LobbyMap_ML\Models\topic_model_GHG_4",
#                  serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)

# Save the model with default pickle serialization
topic_model.save(r"C:\Users\hoath\Git\LobbyMap_ML\Models\topic_model_GHG_full.pkl",
                 save_ctfidf=True, save_embedding_model=embedding_model)









    Topic   Count                                               Name  \
0      -1  196966    -1_hydrocarbon_sectoral_economywide_enforceable   
1       0    3282  0_emissionsintensive_emissionsintensity_emissi...   
2       1    2981  1_methanevoc_methaneemitting_methanecra_methan...   
3       2    2735      2_epag_epadesignated_epaadministered_approves   
4       3    2501                   3_bmwgroup_bmws_bmwblog_munichde   
5       4    2201                     4_airlinei_boeing_flight_gojet   
6       5    1773                   5_gaslog_gasbydesign_barnett_bpu   
7       6    1754   6_regulacion_regulados_operaciones_disposiciones   
8       7    1414                        7_der_deutschland_ein_neuen   
9       8    1339        8_energetiques_ressources_etre_biodiversite   
10      9    1199       9_charger_chargepoint_chargepoints_chargenow   
11     10    1029                      10_ngcc_ngccs_ngcts_ngcchours   
12     11    1026   11_recycling_recycled_wastetoenergy_recyclin

In [21]:
# To fine-tune further here XXXX

import numpy as np
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from stopwordsiso import stopwords  # Import stopwords-iso
from sentence_transformers import SentenceTransformer

# Load the topic model
# Load the model
topic_model = BERTopic.load(r"C:\Users\hoath\Git\LobbyMap_ML\Models\topic_model_GHG_full.pkl",
                            embedding_model=embedding_model)

# Check if representative documents are being generated
topic_info = topic_model.get_topic_info()
print(topic_info.head(20))

# Define languages for which you want to include stopwords (those with at least 10 documents in column 2)
languages = [
    'af', 'ca', 'cy', 'da', 'de', 'en', 'et', 'es', 'fi', 'fr', 'hr', 
    'hu', 'id', 'it', 'nl', 'no', 'ro', 'pt', 'pl', 'tl', 'vi', 'sv', 
    'sl', 'so', 'sw', 'sq', 'lt', 'tr', 'sk'
]

# Create a combined stopwords list from the selected languages
multilingual_stopwords = set()
for lang in languages:
    multilingual_stopwords.update(stopwords(lang))

# Define additional stopwords based on your analysis
additional_stopwords = {
    "14", "50", "70", "142", "report", "combined", "statements", "corporate", 
    "financial", "results", "services", "emissions", "climate", "action", 
    "agreement", "protection", "____", " ", "",  
    "activities",  "individual", "units", "source", "vehicle", 
    "22", "23", "26", "2022", "30", "40", "25", "33", "12", "13", "15", "U.S.C.", 
    "Reg.", "_", "__", "page", "additional", "dow", "edf", "developpement", "bmw"
}

# Add the additional stopwords to the multilingual stopwords set
multilingual_stopwords.update(additional_stopwords)

# Convert multilingual_stopwords set back to a list for use in CountVectorizer
multilingual_stopwords = list(multilingual_stopwords)

# Fine-tune topic representations with a custom vectorizer
vectorizer_model = CountVectorizer(stop_words=multilingual_stopwords, ngram_range=(1, 3), min_df=20)
topic_model.update_topics(text_data, vectorizer_model=vectorizer_model)



    Topic   Count                                               Name  \
0      -1  196966    -1_hydrocarbon_sectoral_economywide_enforceable   
1       0    3282  0_emissionsintensive_emissionsintensity_emissi...   
2       1    2981  1_methanevoc_methaneemitting_methanecra_methan...   
3       2    2735      2_epag_epadesignated_epaadministered_approves   
4       3    2501                   3_bmwgroup_bmws_bmwblog_munichde   
5       4    2201                     4_airlinei_boeing_flight_gojet   
6       5    1773                   5_gaslog_gasbydesign_barnett_bpu   
7       6    1754   6_regulacion_regulados_operaciones_disposiciones   
8       7    1414                        7_der_deutschland_ein_neuen   
9       8    1339        8_energetiques_ressources_etre_biodiversite   
10      9    1199       9_charger_chargepoint_chargepoints_chargenow   
11     10    1029                      10_ngcc_ngccs_ngcts_ngcchours   
12     11    1026   11_recycling_recycled_wastetoenergy_recyclin

In [22]:
# Check if representative documents are being generated
topic_info = topic_model.get_topic_info()
print(topic_info.head(20))

    Topic   Count                                               Name  \
0      -1  196966                 -1_fuel_electricity_coal_renewable   
1       0    3282  0_emission rate_emission reduction_reduction e...   
2       1    2981          1_methane_methane emission_detection_leak   
3       2    2735            2_epa epa_epa_federal plan_epa proposes   
4       3    2501                       3_bmw_mini_series_automotive   
5       4    2201                 4_airport_airline_carrier_aircraft   
6       5    1773          5_natural gas_pipeline_natural_gas supply   
7       6    1754                      6_integral_mexico_base_sector   
8       7    1414                           7_fur_basf_euro_deutsche   
9       8    1339                      8_engie_durable_salary_client   
10      9    1199              9_charging_charge_infrastructure_fast   
11     10    1029     10_capacity factor_redispatch_utilization_unit   
12     11    1026                11_waste_recycling_recycled_pla