In [None]:
from dotenv import load_dotenv
import os
from pinecone import Pinecone
# Load environment variables from the project's .env file; later cells read
# PINECONE_API_KEY and PINECONE_INDEX_HOST from os.environ.
# NOTE(review): this is an absolute, machine-specific path, so it will not resolve on another machine.
load_dotenv(dotenv_path=r"C:\Users\Admin\PycharmProjects\public-comps\.env")

In [None]:
# Build the Pinecone client from the API key held in the environment, then
# open a handle to the target index addressed by its host.
pinecone_client = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
pinecone_index = pinecone_client.Index(host=os.environ.get("PINECONE_INDEX_HOST"))


In [None]:
import sqlite3
import json

def fetch_entries_with_embeddings(
    db_path=r'C:\Users\Admin\PycharmProjects\public-comps\backend\merging\merged_company_deduplicated.db',
):
    """Load every row of the `companies` table whose `embedding` is not NULL.

    Args:
        db_path: Path of the SQLite database file to read. The default is the
            location this notebook originally hard-coded, so existing
            zero-argument calls behave as before.

    Returns:
        A list of row tuples. Columns come back in table order because the
        query uses `SELECT *`; callers that index rows by position depend on
        that order.

    Raises:
        sqlite3.Error: If the query fails (for example, the table is missing).
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute('''
              SELECT * FROM companies
              WHERE embedding IS NOT NULL
              ''')
        return cursor.fetchall()
    finally:
        # Always release the connection, even when the query raises.
        conn.close()

# Raw database rows (tuples in table-column order) for every company that has an embedding.
entries = fetch_entries_with_embeddings()

In [None]:
import json
import re
import sqlite3
from collections import defaultdict

def normalize_hyphens(text):
    """Return `text` with dash variants unified and hyphens padded by spaces.

    En dashes, em dashes and minus signs are first turned into a plain
    hyphen. Each hyphen, together with any whitespace touching it, is then
    rewritten as ' - '.
    """
    dash_variants = re.compile(r'[–—−]')
    hyphen_with_padding = re.compile(r'\s*-\s*')
    return hyphen_with_padding.sub(' - ', dash_variants.sub('-', text))

def prepare_and_deduplicate_data(entries):
    """Convert raw `companies` rows into (id, embedding, metadata) tuples.

    Despite the name, no rows are dropped: every input row produces exactly
    one output tuple. An earlier version collected tickers in a set that was
    never consulted; that dead bookkeeping has been removed.
    NOTE(review): deduplication presumably happens upstream of this
    notebook; confirm before relying on unique tickers.

    Args:
        entries: Iterable of indexable rows. Position 0 is used as the id,
            positions 1-9 are read into the metadata fields listed in
            `metadata_fields` (in that order), and position 10 is parsed as a
            JSON-encoded embedding.

    Returns:
        A list of `(company_id, embedding, metadata)` tuples, where
        `company_id` is `str(row[0])`, `embedding` is the decoded JSON value
        and `metadata` is a dict of the nine metadata fields.

    Raises:
        IndexError: If a row has fewer than 11 positions.
        TypeError / json.JSONDecodeError: If position 10 is not valid JSON text.
    """
    # Metadata keys in row order: the key at index i is read from row position i + 1.
    metadata_fields = (
        'company_sec_cik',
        'company_name',
        'company_isin',
        'company_ticker',
        'company_exchange',
        'country',
        'industry',
        'sector',
        'long_business_summary',
    )
    upsert_data = []

    for entry in entries:
        metadata = {}
        for position, field in enumerate(metadata_fields, start=1):
            value = entry[position]
            # Missing values become the placeholder 'N/A'.
            # NOTE(review): presumably because the vector store's metadata
            # cannot hold nulls; confirm.
            metadata[field] = value if value is not None else 'N/A'

        # Normalize hyphens in the industry and sector fields
        for field in ('industry', 'sector'):
            metadata[field] = normalize_hyphens(metadata[field])

        company_id = str(entry[0])
        embedding = json.loads(entry[10])
        upsert_data.append((company_id, embedding, metadata))

    return upsert_data

In [None]:
# Turn the raw database rows into (id, embedding, metadata) tuples.
# NOTE: this rebinds `entries` from raw rows to prepared tuples, so running
# this cell a second time fails; re-run the fetch cell first.
entries = prepare_and_deduplicate_data(entries)

In [None]:
def get_unique_entries(prepared_data):
    """Collect the distinct country, industry and sector values.

    Each element of `prepared_data` is expected to carry its metadata dict at
    index 2. The result is a 3-tuple of lists (countries, industries,
    sectors); the order of items inside each list is unspecified because the
    values are gathered in sets.
    """
    fields = ('country', 'industry', 'sector')
    distinct = {field: set() for field in fields}

    for record in prepared_data:
        record_metadata = record[2]
        for field in fields:
            distinct[field].add(record_metadata[field])

    return tuple(list(distinct[field]) for field in fields)


# Summarize the distinct country / industry / sector values in the prepared metadata.
# The lists are built from sets, so their item order is not meaningful.
unique_countries, unique_industries, unique_sectors = get_unique_entries(entries)

print("Unique Countries:", unique_countries)
print("Unique Industries:", unique_industries)
print("Unique Sectors:", unique_sectors)

In [None]:
# Sanity check: number of prepared records, then the first record.
# NOTE: the first record includes its embedding, so this output can be long.
print(len(entries))
print(entries[0])

In [None]:
def batch_upsert(upsert_data, batch_size=100, index=None):
    """Upsert records to a vector index in fixed-size batches.

    Args:
        upsert_data: Iterable of records; positions 0, 1 and 2 of each record
            are sent as one `(id, embedding, metadata)` tuple.
        batch_size: Maximum number of records per upsert request. Must be at
            least 1.
        index: Object exposing `upsert(vectors=...)`. When omitted, the
            notebook-level `pinecone_index` is used, which matches the
            original behaviour.

    Raises:
        ValueError: If `batch_size` is smaller than 1. Previously such a value
            silently sent every record in a single request.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    target_index = pinecone_index if index is None else index

    batch = []
    for element in upsert_data:
        batch.append((element[0], element[1], element[2]))

        if len(batch) >= batch_size:
            upsert_response = target_index.upsert(vectors=batch)
            print(f"Upsert response for batch: {upsert_response}")
            # Start a fresh list rather than clearing in place, so a list
            # already handed to the client is never mutated afterwards.
            batch = []

    # Upsert any remaining vectors that didn't fill up a full batch
    if batch:
        upsert_response = target_index.upsert(vectors=batch)
        print(f"Upsert response for remaining batch: {upsert_response}")

# Send all prepared records to the Pinecone index using the default batch size of 100.
batch_upsert(entries)