In [1]:
from dotenv import load_dotenv
import os
from pinecone import Pinecone
load_dotenv(dotenv_path=r"C:\Users\Admin\PycharmProjects\public-comps\.env")

True

In [2]:
# Instantiating pinecone index via the pinecone client
pinecone_index = Pinecone(
    api_key = os.environ.get("PINECONE_API_KEY")
).Index(host=os.environ.get("PINECONE_INDEX_HOST"))


In [3]:
import sqlite3
import json

def fetch_entries_with_embeddings():
    conn = sqlite3.connect(r'C:\Users\Admin\PycharmProjects\public-comps\backend\merging\merged_company_deduplicated.db')
    c = conn.cursor()
    c.execute('''
              SELECT * FROM companies
              WHERE embedding IS NOT NULL
              ''')
    entries = c.fetchall()
    conn.close()
    return entries

entries = fetch_entries_with_embeddings()

In [7]:
import json
import re
import sqlite3
from collections import defaultdict

def normalize_hyphens(text):
    # Replace different types of dashes with a standard hyphen
    text = re.sub(r'[–—−]', '-', text)  # Normalize different hyphens/dashes to a standard hyphen
    return re.sub(r'\s*-\s*', ' - ', text)  # Ensure spaces around hyphens

def prepare_and_deduplicate_data(entries):
    seen_company_tickers = set()
    upsert_data = []

    for entry in entries:
        
        company_id = str(entry[0])
        company_sec_cik = entry[1] if entry[1] is not None else 'N/A'
        company_name = entry[2] if entry[2] is not None else 'N/A'
        company_isin = entry[3] if entry[3] is not None else 'N/A'
        company_ticker = entry[4] if entry[4] is not None else 'N/A'
        company_exchange = entry[5] if entry[5] is not None else 'N/A'
        country = entry[6] if entry[6] is not None else 'N/A'
        industry = entry[7] if entry[7] is not None else 'N/A'
        sector = entry[8] if entry[8] is not None else 'N/A'
        long_business_summary = entry[9] if entry[9] is not None else 'N/A'

        # Normalize hyphens in the industry and sector fields
        industry = normalize_hyphens(industry)
        sector = normalize_hyphens(sector)
        
        seen_company_tickers.add(company_ticker)
        
        metadata = {
            'company_sec_cik': company_sec_cik,
            'company_name': company_name,
            'company_isin': company_isin,
            'company_ticker': company_ticker,
            'company_exchange': company_exchange,
            'country': country,
            'industry': industry,
            'sector': sector,
            'long_business_summary': long_business_summary
        }

        embedding = json.loads(entry[10])
        upsert_data.append((company_id, embedding, metadata))
    
    return upsert_data

In [8]:
entries = prepare_and_deduplicate_data(entries)

In [9]:
def get_unique_entries(prepared_data):
    unique_countries = set()
    unique_industries = set()
    unique_sectors = set()
    
    for entry in prepared_data:
        metadata = entry[2]
        
        unique_countries.add(metadata['country'])
        unique_industries.add(metadata['industry'])
        unique_sectors.add(metadata['sector'])
    
    return list(unique_countries), list(unique_industries), list(unique_sectors)


unique_countries, unique_industries, unique_sectors = get_unique_entries(entries)

print("Unique Countries:", unique_countries)
print("Unique Industries:", unique_industries)
print("Unique Sectors:", unique_sectors)

Unique Countries: ['France', 'Switzerland', 'Austria', 'Japan', 'Guernsey', 'Argentina', 'Mexico', 'Finland', 'Singapore', 'Thailand', 'Lithuania', 'Turkey', 'Greece', 'Colombia', 'South Africa', 'Italy', 'Poland', 'Zambia', 'Isle of Man', 'Netherlands', 'United Arab Emirates', 'India', 'Spain', 'Mauritius', 'Gibraltar', 'Kenya', 'Azerbaijan', 'New Zealand', 'United States', 'Sweden', 'Costa Rica', 'United Kingdom', 'Norway', 'Kazakhstan', 'Philippines', 'Germany', 'Denmark', 'Malaysia', 'Cyprus', 'Kyrgyzstan', 'Nigeria', 'Georgia', 'China', 'Israel', 'Uruguay', 'Taiwan', 'Vietnam', 'Monaco', 'Bermuda', 'Brazil', 'South Korea', 'Ireland', 'Macau', 'British Virgin Islands', 'Belgium', 'Bahamas', 'Liberia', 'Peru', 'Australia', 'Indonesia', 'Hong Kong', 'Jordan', 'Portugal', 'Chile', 'Mozambique', 'Cayman Islands', 'Jersey', 'Canada', 'Luxembourg']
Unique Industries: ['Apparel Retail', 'Real Estate - Diversified', 'Food Distribution', 'Copper', 'Lodging', 'Financial Conglomerates', 'Comp

In [11]:
print(len(entries))
print(entries[0])

6198
('1', [0.02330457977950573, 0.046704281121492386, -0.023320432752370834, -0.0015635470626875758, 0.011588876135647297, 0.024239933118224144, 0.007173688616603613, 0.03360932692885399, 0.004823413677513599, 0.005326760932803154, -0.016107110306620598, -0.011699850670993328, -0.01280166581273079, -0.047655489295721054, -0.02404969185590744, 0.012357768602669239, -0.034719068557024, -0.03354591131210327, 0.0004483555385377258, 0.004839267116039991, 0.023336287587881088, -0.02847280539572239, -0.01278581190854311, 0.0101858451962471, -0.023066777735948563, -0.018358301371335983, 0.01269861776381731, 0.01991194114089012, -0.006559367291629314, 0.006333455443382263, 0.01667783595621586, -0.008489525876939297, 0.019039999693632126, -0.0368434302508831, 0.002932889387011528, -0.01413335558027029, 0.016051623970270157, 0.016836369410157204, 0.013396169990301132, 0.00672979187220335, -0.0032816652674227953, 0.009210857562720776, -0.05285542085766792, 0.047940850257873535, 0.0006767443846911

In [None]:
def batch_upsert(upsert_data, batch_size=100):
    batch = []
    for element in upsert_data:
        batch.append((element[0], element[1], element[2]))
        
        if len(batch) == batch_size:
            upsert_response = pinecone_index.upsert(vectors=batch)
            print(f"Upsert response for batch: {upsert_response}")
            batch.clear()  # Clear the batch after upserting

    # Upsert any remaining vectors that didn't fill up a full batch
    if batch:
        upsert_response = pinecone_index.upsert(vectors=batch)
        print(f"Upsert response for remaining batch: {upsert_response}")

batch_upsert(entries)