In [3]:
import json
import pandas as pd
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
# # Download the WordNet corpus
# nltk.download('wordnet')


In [2]:
# Combine data 
import json
import pandas as pd
from collections import defaultdict
from tqdm import tqdm  # Progress bar for monitoring

# Define your evidence query
evidence_query = "GHG Emission Regulation"

# Define the year range to filter
start_year = 2013
end_year = 2023

# File paths
jsonl_file_path = r'C:\Users\hoath\Git\LobbyMap_ML\data\processed\combined.jsonl'
csv_file_path = r'C:\Users\hoath\Git\LobbyMap_ML\data\processed\company_sector_region.csv'

# Load the company sector and region data
company_info = pd.read_csv(csv_file_path)
company_info_dict = company_info.set_index('company_name').to_dict(orient='index')

# Initialize lists to collect text data, years, sectors, and regions
text_data = []
years = []
sectors = []
regions = []

# Read the JSONL file and extract necessary fields with a progress bar
with open(jsonl_file_path, 'r', encoding='utf-8') as file:
    for line in tqdm(file, total=14000):  # Update total based on actual size
        entry = json.loads(line)
        document_id = entry.get('document_id')
        sentences = entry.get('sentences', [])
        meta_evidences = entry.get('meta', {}).get('evidences', [])

        for evidence in meta_evidences:
            if isinstance(evidence, list):
                for sub_evidence in evidence:
                    company_name = sub_evidence.get('company_name')
                    evidence_year = sub_evidence.get('evidence_year')
                    evidence_query_field = sub_evidence.get('evidence_query')

                    # Filter by the target evidence query and year range
                    if evidence_query_field == evidence_query and start_year <= evidence_year <= end_year:
                        for sentence in sentences:
                            text = sentence['text']
                            if isinstance(text, str) and text.strip():
                                text_data.append(text)
                                years.append(evidence_year)

                                if company_name in company_info_dict:
                                    sector = company_info_dict[company_name]['sector']
                                    region = company_info_dict[company_name]['region']
                                else:
                                    sector = "Unknown"
                                    region = "Unknown"

                                sectors.append(sector)
                                regions.append(region)
            elif isinstance(evidence, dict):
                company_name = evidence.get('company_name')
                evidence_year = evidence.get('evidence_year')
                evidence_query_field = evidence.get('evidence_query')

                # Filter by the target evidence query and year range
                if evidence_query_field == evidence_query and start_year <= evidence_year <= end_year:
                    for sentence in sentences:
                        text = sentence['text']
                        if isinstance(text, str) and text.strip():
                            text_data.append(text)
                            years.append(evidence_year)

                            if company_name in company_info_dict:
                                sector = company_info_dict[company_name]['sector']
                                region = company_info_dict[company_name]['region']
                            else:
                                sector = "Unknown"
                                region = "Unknown"

                            sectors.append(sector)
                            regions.append(region)

# Ensure all elements are strings and remove any that are not
text_data = [str(text) for text in text_data if isinstance(text, str) and text.strip()]

# Check if all lists have the same length
assert len(text_data) == len(years) == len(sectors) == len(regions), "Mismatch between text data, years, sectors, and regions"

# Print the dimensions and first 5 entries of text_data for debugging
print(f"Dimensions of text_data: {len(text_data)}")
print("First 5 entries in text_data:", text_data[:5])
print("First 5 entries in years:", years[:5])
print("First 5 entries in sectors:", sectors[:5])
print("First 5 entries in regions:", regions[:5])




 76%|███████▌  | 10604/14000 [00:05<00:01, 2050.92it/s]

Dimensions of text_data: 557275
First 5 entries in text_data: ['  1', 'COMMENTS OF THE CLASS OF ’85 REGULATORY RESPONSE GROUP', 'ON THE', 'PROPOSED STANDARDS OF PERFORMANCE FOR NEW, RECONSTRUCTED, AND', 'MODIFIED SOURCES AND EMISSIONS GUIDELINES FOR EXISTING SOURCES:']
First 5 entries in years: [2022, 2022, 2022, 2022, 2022]
First 5 entries in sectors: ['Electric Utilities', 'Electric Utilities', 'Electric Utilities', 'Electric Utilities', 'Electric Utilities']
First 5 entries in regions: ['North America', 'North America', 'North America', 'North America', 'North America']





In [3]:
# Check the unique years/sectors/regions in the dataset
unique_years = sorted(set(years))
print(f"Unique years in the dataset: {unique_years}")

unique_sector = sorted(set(sectors))
print(f"Unique sectors in the dataset: {unique_sector}")

unique_region = sorted(set(regions))
print(f"Unique years in the dataset: {unique_region}")


Unique years in the dataset: [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
Unique sectors in the dataset: ['Airlines', 'Automobiles', 'Chemicals', 'Consumer Goods & Services', 'Diversified Mining', 'Electric Utilities', 'Oil & Gas', 'Oil & Gas Distribution', 'Other transportation', 'Steel', 'Unknown', 'cement', 'other industrials', 'paper', 'shipping']
Unique years in the dataset: ['Africa', 'Asia', 'Australasia', 'Europe', 'Middle East', 'North America', 'South America', 'Unknown']


In [5]:
# Preprocessing without region, year, and sector information
import spacy
import re
import unicodedata
import pickle
import os
import numpy as np
from nltk.stem import WordNetLemmatizer
from stopwordsiso import stopwords  # Import stopwords-iso

# Load spaCy model
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger'])
lemmatizer = WordNetLemmatizer()

# Define languages for which you want to include stopwords (those with at least 10 documents)
languages = [
    'af', 'ca', 'cy', 'da', 'de', 'en', 'et', 'es', 'fi', 'fr', 'hr',
    'hu', 'id', 'it', 'nl', 'no', 'ro', 'pt', 'pl', 'tl', 'vi', 'sv',
    'sl', 'so', 'sw', 'sq', 'lt', 'tr', 'sk'
]

# Create a combined stopwords list from the selected languages
multilingual_stopwords = set()
for lang in languages:
    multilingual_stopwords.update(stopwords(lang))

# Define additional stopwords based on your analysis
additional_stopwords = {
    "14", "50", "70", "142", "report", "combined", "statements", "corporate",
    "financial", "results", "services", "emissions", "climate", "action",
    "agreement", "protection", "____", "activities", "individual", "units", "source", 
    "vehicle", "22", "23", "26", "2022", "30", "40", "25", "33", "12", "13", "15", 
    "U.S.C.", "Reg.", "_", "__", "page", "additional", "dow", "edf", "developpement", 
    "bmw", "reg", "fig", "data", "figure", "introduction", "company", "policy", 
    "login", "file", "user", "download", "read", "january", "february", "march", 
    "april", "may", "june", "july", "august", "september", "october", "november", 
    "december", "fy2017", "fy2018", "fy2019", "fy2020", "contact", "phone", "fax", 
    "email", "address", "http", "https", "www", "website", "association", "trade", 
    "management", "european", "europe", "gas", "content", "search", "chapter", "mixed", 
    "segment", "image", "vision", "bser", "fpls", "fpl", "caap", "statute", "guidance", 
    "chief", "officer", "stakeholder", "sincerely", "manager", "associate", "ceo", 
    "executive", "chairman", "director", "president", "and", "for", "the", "you", 
    "section", "compliance", "docket", "fy2021", "fy2022", "fy2023", "fyXX", "telephone", 
    "call", "reach", "connect", "line", "extension", "web", "url", "site", "link", "click", 
    "net", "com", "org", "gov", "domain", "facebook", "linkedin", "twitter", "instagram", 
    "social", "media", "blog", "subscribe", "pdf", "epa", "industrial", "act", "regulation", 
    "comment", "public", "administrator", "judgment", "category", "regulatory", "health", 
    "air", "pollution", "cause", "significantly", "endanger", "due", "potential"
}

# Add the additional stopwords to the multilingual stopwords set
multilingual_stopwords.update(additional_stopwords)

# Convert the multilingual stopwords set back to a list for use in preprocessing
multilingual_stopwords = list(multilingual_stopwords)

# Function to precompute named entities and store them
def precompute_named_entities(text_data, output_file="ner_precomputed.pkl"):
    # Check if the precomputed file already exists
    if os.path.exists(output_file):
        print(f"Precomputed NER data found at {output_file}. Skipping re-computation.")
        return load_precomputed_named_entities(output_file)
    
    print(f"Precomputing NER and saving to {output_file}")
    docs = list(nlp.pipe(text_data, batch_size=50))
    named_entities = []

    # Extract and remove PERSON and GPE entities from the documents
    for doc in docs:
        cleaned_text = ' '.join([token.text for token in doc if token.ent_type_ not in ['PERSON', 'GPE']])
        named_entities.append(cleaned_text)

    # Save the precomputed named entities to a file
    with open(output_file, 'wb') as f:
        pickle.dump(named_entities, f)

    return named_entities

# Function to load precomputed named entities
def load_precomputed_named_entities(file="ner_precomputed.pkl"):
    with open(file, 'rb') as f:
        precomputed_ner_data = pickle.load(f)
    return precomputed_ner_data

# Preprocessing function with NER precomputation integration
def preprocess_text(text):
    # Check if text is not None or empty
    if not text or not isinstance(text, str):
        return None

    # Normalize text (remove accents)
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')

    # Convert text to lowercase
    text = text.lower()

   # Step 1: Remove legal references and section numbers (handling variations of references)
    text = re.sub(r'§*\s*\d+[\(\d*a-zA-Z\)]*[\w]*[\d\(\)]*', '', text)  # Handles cases like "§ 7687(d)(9)" and similar section references
    text = re.sub(r'\bsection\s*\d+[a-zA-Z\(\)]*\b', '', text, flags=re.IGNORECASE)  # Handles "Section 111(b)" and similar
    text = re.sub(r'\barticle\s*\d+[a-zA-Z\(\)]*\b', '', text, flags=re.IGNORECASE)  # Handles "Article 9(b)" and similar
    text = re.sub(r'\b\d+u\.s\.c\.\b', '', text)  # Handles U.S.C. sections (e.g., "§ 706(2)")
    text = re.sub(r'\b\d+\s*(cfr|usc|u\.s\.c\.|u\.s\.)\b', '', text)  # Handles CFR/USC references
    text = re.sub(r'\b[a-zA-Z]*\d+[a-zA-Z]*\b', '', text)  # Removes alphanumeric tokens (e.g., "EAP-HQ-2021")
    text = re.sub(r'\b(?:scf|only|initial)\b', '', text, flags=re.IGNORECASE)  # Specifically removes "SCF", "only", and "initial"
    text = re.sub(r'\b\d+[a-zA-Z\(\)\d]+\b', '', text)  # Handles remaining alphanumeric tokens (e.g., "706(2)")


    # Step 2: Remove standalone numbers and dates
    text = re.sub(r'\b\d{1,4}\b', '', text)  # Removes standalone 1-4 digit numbers (e.g., "85", "2020", "21")
    text = re.sub(r'\b\d+[a-z]*\b', '', text)  # Removes alphanumeric tokens (e.g., "50th", "85")
    text = re.sub(r'\b\d+[-/]\d+[-/]\d+\b', '', text)  # Removes date formats (e.g., "12/12/2020" or "12-12-2020")
    text = re.sub(r'\b(?:\d{1,2}[\/\-\.]\d{1,2}[\/\-\.]\d{2,4})\b', '', text)  # Additional date format handling (e.g., "12/12/2020")
    text = re.sub(r'\b\w*\d+\w*\b', '', text)  # Removes alphanumeric tokens (both letters and numbers, e.g., "EAP-HQ-2021")


    # Consolidated Preprocessing Code for URLs and Special Characters
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Removes all forms of http/https and www URLs
    text = re.sub(r'[^\w\s]', '', text)  # Removes any remaining special characters


    # Step 4: Remove underscores, commas, and special characters, then replace multiple spaces
    text = re.sub(r'[_,\s]+', ' ', text)  # This handles underscores, commas, and multiple spaces
 
    
    # Step 5: Remove any remaining special characters
    text = re.sub(r'[^\w\s]', '', text)  # Remove all non-alphanumeric characters except spaces
    

    # Remove month abbreviations (case-insensitive)
    months_abbr = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
    months_pattern = re.compile(r'\b(' + '|'.join(months_abbr) + r')\b', flags=re.IGNORECASE)
    text = months_pattern.sub('', text)

    # Pattern to match "fy" followed by either two or four digits (e.g., fy23 or fy2023)
    fy_pattern = re.compile(r'\bfy\d{2,4}\b', flags=re.IGNORECASE)
    text = fy_pattern.sub('', text)

    # Improved PDF Removal: This will capture patterns with 'pdf', whether standalone or within URLs
    text = re.sub(r'\S+\.pdf\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(pdf\w*|print\w*)\b', '', text, flags=re.IGNORECASE)  # Removes any word starting with 'pdf' or 'print'

    text = re.sub(r'\b(?:epa|us|gpe|nasa|inc|co)\b', '', text, flags=re.IGNORECASE)  # Add any specific abbreviations here

    text = re.sub(r'\b(?:epa|policy|section)\s*\d+[a-zA-Z\(\)]*\b', '', text)  # Handle text like "EPA Policy Section 111"



    # Remove multilingual stopwords
    text = ' '.join([word for word in text.split() if word not in multilingual_stopwords])


    # Lemmatize each word in the text
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])


    # Remove short words (less than 3 characters) and strip extra spaces
    text = ' '.join([word for word in text.split() if len(word) > 2])

    # Additional step: Ensure no multiple spaces after cleaning
    text = re.sub(r'\s+', ' ', text).strip()

    text = text.strip()
   
    return text if len(text.split()) > 2 else None

# Check if NER is precomputed, otherwise precompute and load
ner_output_file = 'C:/Users/hoath/Git/LobbyMap_ML/Embeddings/ner_GHG.pkl'

precomputed_ner_data = precompute_named_entities(text_data, output_file=ner_output_file)

# Apply the rest of the preprocessing after named entities are removed
cleaned_data = [preprocess_text(doc) for doc in precomputed_ner_data]

# Remove any entries that are empty or consist only of whitespace after preprocessing
cleaned_data = [doc for doc in cleaned_data if doc and doc.strip()]

# save file: 
with open(r'C:\Users\hoath\Git\LobbyMap_ML\Embeddings\cleaned_data.pkl', 'wb') as f:
    pickle.dump(cleaned_data, f)


# Print the number of documents before and after cleaning
print(f"Number of documents after cleaning: {len(cleaned_data)}")


Precomputed NER data found at C:/Users/hoath/Git/LobbyMap_ML/Embeddings/ner_GHG.pkl. Skipping re-computation.
Number of documents after cleaning: 318839


In [4]:
# Preprocessing with region, sector and year information
import spacy
import re
import unicodedata
import pickle
import os
import numpy as np
from nltk.stem import WordNetLemmatizer
from stopwordsiso import stopwords  # Import stopwords-iso

# Load spaCy model
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger'])
lemmatizer = WordNetLemmatizer()

# Define languages for which you want to include stopwords (those with at least 10 documents)
languages = [
    'af', 'ca', 'cy', 'da', 'de', 'en', 'et', 'es', 'fi', 'fr', 'hr',
    'hu', 'id', 'it', 'nl', 'no', 'ro', 'pt', 'pl', 'tl', 'vi', 'sv',
    'sl', 'so', 'sw', 'sq', 'lt', 'tr', 'sk'
]

# Create a combined stopwords list from the selected languages
multilingual_stopwords = set()
for lang in languages:
    multilingual_stopwords.update(stopwords(lang))

# Define additional stopwords based on your analysis
additional_stopwords = {
    "14", "50", "70", "142", "report", "combined", "statements", "corporate",
    "financial", "results", "services", "emissions", "climate", "action",
    "agreement", "protection", "____", "activities", "individual", "units", "source", 
    "vehicle", "22", "23", "26", "2022", "30", "40", "25", "33", "12", "13", "15", 
    "U.S.C.", "Reg.", "_", "__", "page", "additional", "dow", "edf", "developpement", 
    "bmw", "reg", "fig", "data", "figure", "introduction", "company", "policy", 
    "login", "file", "user", "download", "read", "january", "february", "march", 
    "april", "may", "june", "july", "august", "september", "october", "november", 
    "december", "fy2017", "fy2018", "fy2019", "fy2020", "contact", "phone", "fax", 
    "email", "address", "http", "https", "www", "website", "association", "trade", 
    "management", "european", "europe", "gas", "content", "search", "chapter", "mixed", 
    "segment", "image", "vision", "bser", "fpls", "fpl", "caap", "statute", "guidance", 
    "chief", "officer", "stakeholder", "sincerely", "manager", "associate", "ceo", 
    "executive", "chairman", "director", "president", "and", "for", "the", "you", 
    "section", "compliance", "docket", "fy2021", "fy2022", "fy2023", "fyXX", "telephone", 
    "call", "reach", "connect", "line", "extension", "web", "url", "site", "link", "click", 
    "net", "com", "org", "gov", "domain", "facebook", "linkedin", "twitter", "instagram", 
    "social", "media", "blog", "subscribe", "pdf", "epa", "industrial", "act", "regulation", 
    "comment", "public", "administrator", "judgment", "category", "regulatory", "health", 
    "air", "pollution", "cause", "significantly", "endanger", "due", "potential"
}

# Add the additional stopwords to the multilingual stopwords set
multilingual_stopwords.update(additional_stopwords)

# Convert the multilingual stopwords set back to a list for use in preprocessing
multilingual_stopwords = list(multilingual_stopwords)

# Function to precompute named entities and store them
def precompute_named_entities(text_data, output_file="ner_precomputed.pkl"):
    # Check if the precomputed file already exists
    if os.path.exists(output_file):
        print(f"Precomputed NER data found at {output_file}. Skipping re-computation.")
        return load_precomputed_named_entities(output_file)
    
    print(f"Precomputing NER and saving to {output_file}")
    docs = list(nlp.pipe(text_data, batch_size=50))
    named_entities = []

    # Extract and remove PERSON and GPE entities from the documents
    for doc in docs:
        cleaned_text = ' '.join([token.text for token in doc if token.ent_type_ not in ['PERSON', 'GPE']])
        named_entities.append(cleaned_text)

    # Save the precomputed named entities to a file
    with open(output_file, 'wb') as f:
        pickle.dump(named_entities, f)

    return named_entities

# Function to load precomputed named entities
def load_precomputed_named_entities(file="ner_precomputed.pkl"):
    with open(file, 'rb') as f:
        precomputed_ner_data = pickle.load(f)
    return precomputed_ner_data

# Preprocessing function with NER precomputation integration
def preprocess_text(text):
    # Check if text is not None or empty
    if not text or not isinstance(text, str):
        return None

    # Normalize text (remove accents)
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')

    # Convert text to lowercase
    text = text.lower()

   # Step 1: Remove legal references and section numbers (handling variations of references)
    text = re.sub(r'§*\s*\d+[\(\d*a-zA-Z\)]*[\w]*[\d\(\)]*', '', text)  # Handles cases like "§ 7687(d)(9)" and similar section references
    text = re.sub(r'\bsection\s*\d+[a-zA-Z\(\)]*\b', '', text, flags=re.IGNORECASE)  # Handles "Section 111(b)" and similar
    text = re.sub(r'\barticle\s*\d+[a-zA-Z\(\)]*\b', '', text, flags=re.IGNORECASE)  # Handles "Article 9(b)" and similar
    text = re.sub(r'\b\d+u\.s\.c\.\b', '', text)  # Handles U.S.C. sections (e.g., "§ 706(2)")
    text = re.sub(r'\b\d+\s*(cfr|usc|u\.s\.c\.|u\.s\.)\b', '', text)  # Handles CFR/USC references
    text = re.sub(r'\b[a-zA-Z]*\d+[a-zA-Z]*\b', '', text)  # Removes alphanumeric tokens (e.g., "EAP-HQ-2021")
    text = re.sub(r'\b(?:scf|only|initial)\b', '', text, flags=re.IGNORECASE)  # Specifically removes "SCF", "only", and "initial"
    text = re.sub(r'\b\d+[a-zA-Z\(\)\d]+\b', '', text)  # Handles remaining alphanumeric tokens (e.g., "706(2)")


    # Step 2: Remove standalone numbers and dates
    text = re.sub(r'\b\d{1,4}\b', '', text)  # Removes standalone 1-4 digit numbers (e.g., "85", "2020", "21")
    text = re.sub(r'\b\d+[a-z]*\b', '', text)  # Removes alphanumeric tokens (e.g., "50th", "85")
    text = re.sub(r'\b\d+[-/]\d+[-/]\d+\b', '', text)  # Removes date formats (e.g., "12/12/2020" or "12-12-2020")
    text = re.sub(r'\b(?:\d{1,2}[\/\-\.]\d{1,2}[\/\-\.]\d{2,4})\b', '', text)  # Additional date format handling (e.g., "12/12/2020")
    text = re.sub(r'\b\w*\d+\w*\b', '', text)  # Removes alphanumeric tokens (both letters and numbers, e.g., "EAP-HQ-2021")


    # Consolidated Preprocessing Code for URLs and Special Characters
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Removes all forms of http/https and www URLs
    text = re.sub(r'[^\w\s]', '', text)  # Removes any remaining special characters


    # Step 4: Remove underscores, commas, and special characters, then replace multiple spaces
    text = re.sub(r'[_,\s]+', ' ', text)  # This handles underscores, commas, and multiple spaces
 
    
    # Step 5: Remove any remaining special characters
    text = re.sub(r'[^\w\s]', '', text)  # Remove all non-alphanumeric characters except spaces
    

    # Remove month abbreviations (case-insensitive)
    months_abbr = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
    months_pattern = re.compile(r'\b(' + '|'.join(months_abbr) + r')\b', flags=re.IGNORECASE)
    text = months_pattern.sub('', text)

    # Pattern to match "fy" followed by either two or four digits (e.g., fy23 or fy2023)
    fy_pattern = re.compile(r'\bfy\d{2,4}\b', flags=re.IGNORECASE)
    text = fy_pattern.sub('', text)

    # Improved PDF Removal: This will capture patterns with 'pdf', whether standalone or within URLs
    text = re.sub(r'\S+\.pdf\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(pdf\w*|print\w*)\b', '', text, flags=re.IGNORECASE)  # Removes any word starting with 'pdf' or 'print'

    text = re.sub(r'\b(?:epa|us|gpe|nasa|inc|co)\b', '', text, flags=re.IGNORECASE)  # Add any specific abbreviations here

    text = re.sub(r'\b(?:epa|policy|section)\s*\d+[a-zA-Z\(\)]*\b', '', text)  # Handle text like "EPA Policy Section 111"



    # Remove multilingual stopwords
    text = ' '.join([word for word in text.split() if word not in multilingual_stopwords])


    # Lemmatize each word in the text
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])


    # Remove short words (less than 3 characters) and strip extra spaces
    text = ' '.join([word for word in text.split() if len(word) > 2])

    # Additional step: Ensure no multiple spaces after cleaning
    text = re.sub(r'\s+', ' ', text).strip()

    text = text.strip()
   
    return text if len(text.split()) > 2 else None



# Initialize a list to store cleaned data along with metadata
cleaned_data_with_metadata = []

# Ensure NER is precomputed, otherwise precompute and load
ner_output_file = 'C:/Users/hoath/Git/LobbyMap_ML/Embeddings/ner_GHG.pkl'
precomputed_ner_data = precompute_named_entities(text_data, output_file=ner_output_file)


# Apply the rest of the preprocessing after named entities are removed
for i, doc in enumerate(precomputed_ner_data):
    cleaned_text = preprocess_text(doc)  # Clean the text
    if cleaned_text and cleaned_text.strip():
        # Append a dictionary that includes the cleaned text along with its metadata
        cleaned_data_with_metadata.append({
            'text': cleaned_text,
            'year': years[i],
            'region': regions[i],
            'sector': sectors[i]
        })

# Save cleaned data with metadata to a file
with open(r'C:\Users\hoath\Git\LobbyMap_ML\Embeddings\cleaned_data_with_metadata.pkl', 'wb') as f:
    pickle.dump(cleaned_data_with_metadata, f)

# Print the number of documents before and after cleaning
print(f"Number of documents after cleaning: {len(cleaned_data_with_metadata)}")



Precomputed NER data found at C:/Users/hoath/Git/LobbyMap_ML/Embeddings/ner_GHG.pkl. Skipping re-computation.
Number of documents after cleaning: 318839


In [6]:
# Load the cleaned data with metadata
with open(r'C:\Users\hoath\Git\LobbyMap_ML\Embeddings\cleaned_data_with_metadata.pkl', 'rb') as f:
    cleaned_data_with_metadata = pickle.load(f)

# Example: Access the first entry's cleaned text, year, region, and sector
first_entry = cleaned_data_with_metadata[0]
print(f"Cleaned Text: {first_entry['text']}")
print(f"Year: {first_entry['year']}")
print(f"Region: {first_entry['region']}")
print(f"Sector: {first_entry['sector']}")


Cleaned Text: comment class response
Year: 2022
Region: North America
Sector: Electric Utilities


In [46]:
print(f"Dimensions of text_data: {len(cleaned_data)}")

Dimensions of text_data: 318839


In [35]:
print(f"Number of documents in text_data before preprocessing: {len(text_data)}")


Number of documents in text_data before preprocessing: 557275


In [36]:
# Print a few examples of the preprocessed text
print(cleaned_data[:5])


['comment class response group', 'proposed standard performance new reconstructed', 'modified source guideline existing source', 'oil natural sector review', 'united state environmental agency']


In [37]:
# Check some examples of text before preprocessing
print("Sample before preprocessing:", text_data[:5])

# Apply the preprocessing step
preprocessed_text_data = [preprocess_text(doc) for doc in text_data]

# Check some examples after preprocessing
print("Sample after preprocessing:", preprocessed_text_data[:5])


Sample before preprocessing: ['  1', 'COMMENTS OF THE CLASS OF ’85 REGULATORY RESPONSE GROUP', 'ON THE', 'PROPOSED STANDARDS OF PERFORMANCE FOR NEW, RECONSTRUCTED, AND', 'MODIFIED SOURCES AND EMISSIONS GUIDELINES FOR EXISTING SOURCES:']
Sample after preprocessing: [None, 'comment class response group', None, 'proposed standard performance new reconstructed', 'modified source guideline existing source']


In [38]:
# Identify and remove empty or very short documents
cleaned_data = [doc for doc in text_data if len(doc.strip()) > 3]

# Check if any documents were too short or empty
empty_docs = len(text_data) - len(cleaned_data)
print(f"Number of empty or too short documents removed: {empty_docs}")


Number of empty or too short documents removed: 58948


In [None]:
import re

# Function to check if specific patterns are present
def check_for_patterns(text_data):
    url_pattern = re.compile(r'(http[s]?://\S+|www\.\S+)')
    month_abbr = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
    pdf_pattern = re.compile(r'\b(pdf\w*|print\w*)\b', re.IGNORECASE)
    
    for i, doc in enumerate(text_data):
        if url_pattern.search(doc) or any(month in doc for month in month_abbr) or pdf_pattern.search(doc):
            print(f"Potential issues found in document {i}:")
            print(doc)

# Run the check on your preprocessed text data
check_for_patterns(cleaned_data)


In [47]:
# Embedding 
from sentence_transformers import SentenceTransformer
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np
from tqdm import tqdm  # Import tqdm for progress bar

# Define a custom dataset class for SentenceTransformer
class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

# Load the SentenceTransformer model
embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Explicitly set the device to CPU
device = "cpu"

# Create a DataLoader for batch processing
batch_size = 16  # Reduce batch size to avoid memory issues
num_workers = 0  # Number of CPU cores to use for data loading


dataset = TextDataset(cleaned_data)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

# Debugging: Ensure that DataLoader is working properly
print("DataLoader length:", len(dataloader))

# Generate embeddings in batches with a progress bar
embeddings = []
for batch in tqdm(dataloader, desc="Generating Embeddings", unit="batch"):
    # Ensure batch is processed on the CPU
    batch_embeddings = embedding_model.encode(batch, convert_to_tensor=True, device=device)
    embeddings.append(batch_embeddings)

    # Debugging: Print after each batch to ensure loop is running
    #print(f"Processed batch size: {len(batch_embeddings)}")

# Concatenate all the embeddings into one tensor
embeddings = torch.cat(embeddings, dim=0)

# Convert back to NumPy array if needed
embeddings = embeddings.cpu().numpy()

# Step 2: Save the embeddings to a .npy file
np.save(r'C:\Users\hoath\Git\LobbyMap_ML\Embeddings\embeddings_GHG_clean.npy', embeddings)
print("Embeddings saved successfully.")


DataLoader length: 19928


Generating Embeddings: 100%|██████████| 19928/19928 [32:50<00:00, 10.12batch/s] 


Embeddings saved successfully.


In [48]:
import numpy as np

# Load the embeddings
embeddings = np.load(r'C:\Users\hoath\Git\LobbyMap_ML\Embeddings\embeddings_GHG_clean.npy')

# Check the shape of the embeddings
print("Shape of embeddings:", embeddings.shape)


Shape of embeddings: (318839, 384)


In [9]:
# Initial search for optimal number of topics
# Necessary Imports
import numpy as np
from sklearn.metrics import silhouette_score
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic

# Define UMAP and HDBSCAN models for topic modeling
umap_model = UMAP(random_state=42)
hdbscan_model = HDBSCAN(min_samples=5, min_cluster_size=10)

# Load data: 
with open(r'C:\Users\hoath\Git\LobbyMap_ML\Embeddings\cleaned_data.pkl', 'rb') as f:
    cleaned_data = pickle.load(f)

# Load the embeddings
embeddings = np.load(r'C:\Users\hoath\Git\LobbyMap_ML\Embeddings\embeddings_GHG_clean.npy')


# Function to find the best number of topics using silhouette score
def find_best_num_topics(cleaned_data, precomputed_embeddings, topic_numbers=[2, 5, 10, 15, 20]):
    best_score = -1
    best_num_topics = None
    best_model = None

    # Loop over the specified topic numbers
    for num_topics in topic_numbers:
        print(f"Testing {num_topics} topics...")

        # Create BERTopic model with the specified number of topics
        topic_model = BERTopic(nr_topics=num_topics, embedding_model=None, umap_model=umap_model, hdbscan_model=hdbscan_model)

        # Fit the model on the text (cleaned_data) and embeddings (precomputed_embeddings)
        topics, probabilities = topic_model.fit_transform(cleaned_data, precomputed_embeddings)

        # Calculate the silhouette score
        if len(set(topics)) > 1:  # Silhouette score requires at least 2 clusters
            score = silhouette_score(precomputed_embeddings, topics)
            print(f"Silhouette score for {num_topics} topics: {score}")

            # Update the best model if this one is better
            if score > best_score:
                best_score = score
                best_num_topics = num_topics
                best_model = topic_model

    print(f"Best number of topics: {best_num_topics} with silhouette score: {best_score}")
    return best_model, best_num_topics

# Example usage
# cleaned_data: Preprocessed text data (your text input)
# precomputed_embeddings: Embeddings generated for cleaned_data

# Find the best number of topics for 2, 5, 10, 15, and 20 topics
best_model, best_num_topics = find_best_num_topics(cleaned_data, embeddings)


Testing 2 topics...
Silhouette score for 2 topics: -0.002788221463561058
Testing 5 topics...
Silhouette score for 5 topics: -0.008580033667385578
Testing 10 topics...
Silhouette score for 10 topics: -0.008722503669559956
Testing 15 topics...
Silhouette score for 15 topics: -0.014485998079180717
Testing 20 topics...
Silhouette score for 20 topics: -0.018786627799272537
Best number of topics: 2 with silhouette score: -0.002788221463561058


In [13]:
# Further search for optimal number of topics
import numpy as np
from sklearn.metrics import silhouette_score
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer

# Load the precomputed embeddings and cleaned data
with open(r'C:\Users\hoath\Git\LobbyMap_ML\Embeddings\cleaned_data.pkl', 'rb') as f:
    cleaned_data = pickle.load(f)

embeddings = np.load(r'C:\Users\hoath\Git\LobbyMap_ML\Embeddings\embeddings_GHG_clean.npy')

# Load the SentenceTransformer model
embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Define UMAP and HDBSCAN models for dimensionality reduction and clustering
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=30, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Define the ClassTfidfTransformer with both recommended parameters
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

# Define the Maximal Marginal Relevance (MMR) model for topic representation
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# Define the KeyBERTInspired model for further refinement
keybert_model = KeyBERTInspired()

# Chain the MMR and KeyBERTInspired models together
representation_model = [mmr_model, keybert_model]


# Function to find the best number of topics using silhouette score and topic coherence
def find_best_num_topics(cleaned_data, precomputed_embeddings, topic_numbers=[2, 3, 4]):
    best_silhouette_score = -1
    best_num_topics = None
    best_model_silhouette = None


    # Loop over the specified topic numbers
    for num_topics in topic_numbers:
        print(f"Testing {num_topics} topics...")

        # Initialize BERTopic with custom settings
        topic_model = BERTopic(nr_topics=num_topics,
                               embedding_model=embedding_model,
                               umap_model=umap_model,
                               hdbscan_model=hdbscan_model,
                               ctfidf_model=ctfidf_model,
                               representation_model=representation_model)
        

        # Fit the model on the text (cleaned_data) and embeddings (precomputed_embeddings)
        topics, probs = topic_model.fit_transform(cleaned_data, embeddings=embeddings)

        
        # Calculate the silhouette score
        silhouette = -1  # Initialize to handle errors
        if len(set(topics)) > 1:  # Silhouette score requires at least 2 clusters
            try:
                silhouette = silhouette_score(embeddings, topics)
                print(f"Silhouette score for {num_topics} topics: {silhouette}")
            except ValueError as e:
                print(f"Error calculating silhouette score for {num_topics} topics: {e}")
            
            # Update the best model for silhouette score
            if silhouette > best_silhouette_score:
                best_silhouette_score = silhouette
                best_num_topics = num_topics
                best_model_silhouette = topic_model


    # Print final best results
    print(f"\nBest model based on Silhouette Score: {best_num_topics} with silhouette score: {best_silhouette_score}")
   

    return best_model_silhouette

# Search for best number of topics based on silhouette score and topic coherence
best_model_silhouette = find_best_num_topics(cleaned_data, embeddings)


Testing 2 topics...
Silhouette score for 2 topics: -0.0005553557421080768
Testing 3 topics...
Silhouette score for 3 topics: -0.00076147576328367
Testing 4 topics...
Silhouette score for 4 topics: -0.002105161314830184

Best model based on Silhouette Score: 2 with silhouette score: -0.0005553557421080768


In [14]:
#  Search for MMR with fixed number of topics (2)
import numpy as np
from sklearn.metrics import silhouette_score
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer

# Load the precomputed embeddings and cleaned data
with open(r'C:\Users\hoath\Git\LobbyMap_ML\Embeddings\cleaned_data.pkl', 'rb') as f:
    cleaned_data = pickle.load(f)

embeddings = np.load(r'C:\Users\hoath\Git\LobbyMap_ML\Embeddings\embeddings_GHG_clean.npy')

# Load the SentenceTransformer model
embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Define UMAP and HDBSCAN models for dimensionality reduction and clustering
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=30, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Define the ClassTfidfTransformer with both recommended parameters
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

# Define the KeyBERTInspired model for further refinement
keybert_model = KeyBERTInspired()


# Function to tune MMR for fixed number of topics (2)
def tune_mmr(cleaned_data, precomputed_embeddings, mmr_values=[0.3, 0.5, 0.7]):
    best_silhouette_score = -1
    best_mmr = None
    best_model_silhouette = None

    # Loop over the specified MMR values
    for mmr_value in mmr_values:
        print(f"Testing MMR with diversity={mmr_value}...")

        # Define the Maximal Marginal Relevance (MMR) model with the current diversity value
        mmr_model = MaximalMarginalRelevance(diversity=mmr_value)

        # Chain the MMR and KeyBERTInspired models together
        representation_model = [mmr_model, keybert_model]

        # Initialize BERTopic with fixed number of topics (2) and custom MMR
        topic_model = BERTopic(nr_topics=2,
                               embedding_model=embedding_model,
                               umap_model=umap_model,
                               hdbscan_model=hdbscan_model,
                               ctfidf_model=ctfidf_model,
                               representation_model=representation_model)

        # Fit the model on the text (cleaned_data) and embeddings (precomputed_embeddings)
        topics, probs = topic_model.fit_transform(cleaned_data, embeddings=embeddings)

        # Calculate the silhouette score
        silhouette = -1  # Initialize to handle errors
        if len(set(topics)) > 1:  # Silhouette score requires at least 2 clusters
            try:
                silhouette = silhouette_score(embeddings, topics)
                print(f"Silhouette score for MMR diversity={mmr_value}: {silhouette}")
            except ValueError as e:
                print(f"Error calculating silhouette score for MMR diversity={mmr_value}: {e}")

            # Update the best model for silhouette score
            if silhouette > best_silhouette_score:
                best_silhouette_score = silhouette
                best_mmr = mmr_value
                best_model_silhouette = topic_model

    # Print final best results
    print(f"\nBest model based on Silhouette Score: MMR diversity={best_mmr} with silhouette score: {best_silhouette_score}")

    return best_model_silhouette

# Tune MMR for best diversity value
best_model_silhouette = tune_mmr(cleaned_data, embeddings)



Testing MMR with diversity=0.3...
Silhouette score for MMR diversity=0.3: -0.0008365029934793711
Testing MMR with diversity=0.5...
Silhouette score for MMR diversity=0.5: -0.0006165122613310814
Testing MMR with diversity=0.7...
Silhouette score for MMR diversity=0.7: -0.0008051433251239359

Best model based on Silhouette Score: MMR diversity=0.5 with silhouette score: -0.0006165122613310814


In [15]:
# FINAL MODEL with fixed number of topics and MMR
import numpy as np
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer

# Load data
with open(r'C:\Users\hoath\Git\LobbyMap_ML\Embeddings\cleaned_data.pkl', 'rb') as f:
    cleaned_data = pickle.load(f)

# Load the precomputed embeddings
embeddings = np.load(r'C:\Users\hoath\Git\LobbyMap_ML\Embeddings\embeddings_GHG_clean.npy')

# Load the SentenceTransformer model
embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Define UMAP and HDBSCAN models for dimensionality reduction and clustering
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=30, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Define the ClassTfidfTransformer with both recommended parameters
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

# Set Maximal Marginal Relevance (MMR) model for topic representation with diversity = 0.5
mmr_model = MaximalMarginalRelevance(diversity=0.5)

# Define the KeyBERTInspired model for further refinement
keybert_model = KeyBERTInspired()

# Chain the MMR and KeyBERTInspired models together
representation_model = [mmr_model, keybert_model]

# Initialize BERTopic model with UMAP, HDBSCAN, customized ClassTfidfTransformer, and combined representation models
topic_model = BERTopic(nr_topics=2,  # Fixed number of topics to 2
                       umap_model=umap_model,
                       hdbscan_model=hdbscan_model,
                       embedding_model=embedding_model,
                       ctfidf_model=ctfidf_model,
                       representation_model=representation_model)

# Ensure cleaned_data is a list of strings
if not isinstance(cleaned_data, list):
    cleaned_data = list(cleaned_data)

# Fit the BERTopic model on the precomputed embeddings and text data
topics, probs = topic_model.fit_transform(cleaned_data, embeddings=embeddings)

# Check if representative documents are being generated
topic_info = topic_model.get_topic_info()
print(topic_info.head(20))

# Save the updated topic model for future use
topic_model.save(r"C:\Users\hoath\Git\LobbyMap_ML\Models\topic_model_GHG_clean.pkl",
                 save_ctfidf=True, save_embedding_model=embedding_model)






   Topic   Count                                       Name  \
0     -1  138121    -1_sector_renewable_utility_sustainable   
1      0  180718  0_efficiency_fuel_strategy_sustainability   

                                      Representation  \
0  [sector, renewable, utility, sustainable, stra...   
1  [efficiency, fuel, strategy, sustainability, r...   

                                 Representative_Docs  
0  [reduce ghg avoiding disruptive economic impac...  
1  [energy conservation efficiency energy strateg...  


  self._set_arrayXarray(i, j, x)


In [19]:
# Check if representative documents are being generated
topic_info = topic_model.get_topic_info()
print(topic_info.head(10))

   Topic   Count                                       Name  \
0     -1  138121    -1_sector_renewable_utility_sustainable   
1      0  180718  0_efficiency_fuel_strategy_sustainability   

                                      Representation  \
0  [sector, renewable, utility, sustainable, stra...   
1  [efficiency, fuel, strategy, sustainability, r...   

                                 Representative_Docs  
0  [reduce ghg avoiding disruptive economic impac...  
1  [energy conservation efficiency energy strateg...  
