In [1]:
from dotenv import load_dotenv
import os
from pinecone import Pinecone
# Load variables from the project's .env file into the process environment.
# Presumably this is where the PINECONE_* values read via os.environ later in
# the notebook come from — TODO confirm.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# finds the file on the original author's machine.
load_dotenv(dotenv_path=r"C:\Users\Admin\PycharmProjects\public-comps\.env")

True

In [2]:
# Build the Pinecone client from the API key held in the environment, then
# open a handle to the index served at the configured host.
pinecone_client = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
pinecone_index = pinecone_client.Index(host=os.environ.get("PINECONE_INDEX_HOST"))


In [1]:
import sqlite3
import json

def fetch_entries_with_embeddings(
    db_path=r'C:\Users\Admin\PycharmProjects\public-comps\backend\scraping\us_company_info.db',
):
    """Return every row of the ``companies`` table whose embedding is set.

    Args:
        db_path: Path of the SQLite database file. Defaults to the location
            this notebook was originally written against, so existing
            zero-argument calls behave exactly as before.

    Returns:
        list[tuple]: One tuple per matching row, with columns in table order
        (the query is ``SELECT *``).

    Raises:
        sqlite3.Error: If the database cannot be opened or the query fails
            (for example when the ``companies`` table does not exist).
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute('''
              SELECT * FROM companies
              WHERE embedding IS NOT NULL
              ''')
        return cursor.fetchall()
    finally:
        # Close the connection even when the query raises, so a failed cell
        # run does not leave the database handle open in the kernel.
        conn.close()

# Raw rows (tuples in table column order) for every company that has an embedding.
entries = fetch_entries_with_embeddings()

In [2]:
import json
import re
from collections import defaultdict

def normalize_hyphens(text):
    """Return ``text`` with every dash rewritten as a space-padded hyphen.

    The Unicode dash variants matched by the first pattern are mapped to an
    ASCII hyphen; each hyphen is then re-emitted as ' - ', absorbing any
    whitespace that surrounded it.
    """
    dash_variants = re.compile(r'[–—−]')
    padded_hyphen = re.compile(r'\s*-\s*')

    with_ascii_hyphens = dash_variants.sub('-', text)
    return padded_hyphen.sub(' - ', with_ascii_hyphens)

def prepare_and_deduplicate_data(entries):
    """Turn raw ``companies`` rows into upsert tuples, one per ticker.

    Each row is read positionally: index 0 is the vector id, indices 1-7
    populate the metadata (SEC CIK, name, ticker, exchange, country,
    industry, sector) and index 9 holds the JSON-encoded embedding. Index 8
    is not read. ``None`` metadata values are replaced by ``'N/A'``, and the
    industry and sector strings are passed through ``normalize_hyphens``.

    Only the first row seen for a given ticker is kept; later rows with the
    same ticker are dropped. Rows without a ticker are always kept, because
    a missing ticker cannot identify a duplicate.

    Args:
        entries: Iterable of indexable rows with at least ten positions.

    Returns:
        list[tuple]: ``(company_id, embedding, metadata)`` tuples, where
        ``company_id`` is a string, ``embedding`` is the decoded JSON value
        and ``metadata`` is a dict.

    Raises:
        json.JSONDecodeError: If an embedding is not valid JSON.
        IndexError: If a row has fewer positions than are read.
    """
    seen_company_tickers = set()
    upsert_data = []

    for entry in entries:
        company_id = str(entry[0])
        company_sec_cik = entry[1] if entry[1] is not None else 'N/A'
        company_name = entry[2] if entry[2] is not None else 'N/A'
        company_ticker = entry[3] if entry[3] is not None else 'N/A'
        company_exchange = entry[4] if entry[4] is not None else 'N/A'
        country = entry[5] if entry[5] is not None else 'N/A'
        industry = entry[6] if entry[6] is not None else 'N/A'
        sector = entry[7] if entry[7] is not None else 'N/A'

        # The 'N/A' placeholder is shared by every ticker-less row, so it is
        # excluded from the duplicate check rather than collapsing them all.
        if entry[3] is not None:
            if company_ticker in seen_company_tickers:
                continue
            seen_company_tickers.add(company_ticker)

        # Normalize hyphens in the industry and sector fields
        industry = normalize_hyphens(industry)
        sector = normalize_hyphens(sector)

        metadata = {
            'company_sec_cik': company_sec_cik,
            'company_name': company_name,
            'company_ticker': company_ticker,
            'company_exchange': company_exchange,
            'country': country,
            'industry': industry,
            'sector': sector
        }

        embedding = json.loads(entry[9])
        upsert_data.append((company_id, embedding, metadata))

    return upsert_data

In [3]:
# NOTE(review): this rebinds `entries` from raw database rows to prepared
# (company_id, embedding, metadata) tuples, so the cell is not idempotent.
# Running it a second time feeds 3-element tuples back into a function that
# reads index 9; re-run the fetch cell first.
entries = prepare_and_deduplicate_data(entries)

In [4]:
def get_unique_entries(prepared_data):
    """Collect the distinct country, industry and sector values.

    Args:
        prepared_data: Iterable of records whose element at index 2 is a
            metadata dict containing 'country', 'industry' and 'sector'.

    Returns:
        tuple[list, list, list]: Distinct countries, industries and sectors,
        in that order. The lists are built from sets, so element order is
        not guaranteed.
    """
    # Materialise the metadata once so one-shot iterables are only consumed
    # a single time.
    all_metadata = [record[2] for record in prepared_data]

    countries = {meta['country'] for meta in all_metadata}
    industries = {meta['industry'] for meta in all_metadata}
    sectors = {meta['sector'] for meta in all_metadata}

    return list(countries), list(industries), list(sectors)


# Gather the distinct metadata values, then print each list under its label.
unique_countries, unique_industries, unique_sectors = get_unique_entries(entries)

for label, values in (
    ("Unique Countries:", unique_countries),
    ("Unique Industries:", unique_industries),
    ("Unique Sectors:", unique_sectors),
):
    print(label, values)

Unique Countries: ['Finland', 'Brazil', 'Malta', 'Argentina', 'Singapore', 'Turkey', 'N/A', 'Jersey', 'Mexico', 'Malaysia', 'Italy', 'Cyprus', 'Costa Rica', 'Macau', 'Gibraltar', 'Norway', 'Luxembourg', 'Kazakhstan', 'Netherlands', 'United States', 'Bermuda', 'New Zealand', 'Colombia', 'Hungary', 'British Virgin Islands', 'China', 'Austria', 'Thailand', 'Poland', 'Jordan', 'Japan', 'Monaco', 'Isle of Man', 'Hong Kong', 'United Kingdom', 'Bahamas', 'Israel', 'Ireland', 'Greece', 'Belgium', 'Canada', 'Indonesia', 'Cayman Islands', 'Sweden', 'Peru', 'Uruguay', 'Guernsey', 'Denmark', 'Philippines', 'Kenya', 'Vietnam', 'Panama', 'India', 'France', 'Switzerland', 'United Arab Emirates', 'Australia', 'South Africa', 'Germany', 'South Korea', 'Portugal', 'Albania', 'Taiwan', 'Chile', 'Spain']
Unique Industries: ['Engineering & Construction', 'Resorts & Casinos', 'Beverages - Wineries & Distilleries', 'Auto Parts', 'Solar', 'Utilities - Regulated Gas', 'Security & Protection Services', 'Thermal

In [None]:
def batch_upsert(upsert_data, batch_size=100, index=None):
    """Upsert vectors into an index in batches of at most ``batch_size``.

    Args:
        upsert_data: Iterable of indexable records. Elements 0, 1 and 2 of
            each record are sent together as one tuple.
        batch_size: Maximum number of vectors per upsert request. Must be at
            least 1.
        index: Object exposing ``upsert(vectors=...)``. When omitted, the
            notebook-level ``pinecone_index`` is used, which matches the
            previous behaviour.

    Returns:
        None. The response of every upsert request is printed.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Such a value would
            otherwise never trigger a flush and send everything in a single
            request.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    target_index = pinecone_index if index is None else index

    batch = []
    for element in upsert_data:
        batch.append((element[0], element[1], element[2]))

        if len(batch) == batch_size:
            upsert_response = target_index.upsert(vectors=batch)
            print(f"Upsert response for batch: {upsert_response}")
            # Start a fresh list rather than clearing in place: the list just
            # sent may still be referenced by the client.
            batch = []

    # Upsert any remaining vectors that didn't fill up a full batch
    if batch:
        upsert_response = target_index.upsert(vectors=batch)
        print(f"Upsert response for remaining batch: {upsert_response}")

# Send every prepared record to the index using the default batch size.
batch_upsert(entries)