In [1]:
import sqlite3
import pandas as pd

In [10]:
# Define the path to the primary database
# NOTE(review): absolute, machine-specific path; consider making it configurable.
primary_db_path = r'C:\Users\Admin\PycharmProjects\public-comps\backend\scraping\USA\us_company_info.db'

# Connect to the primary database and load every company that has a business
# summary. The try/finally guarantees the connection is released even when the
# query raises (e.g. the `companies` table is missing).
conn = sqlite3.connect(primary_db_path)
try:
    primary_df = pd.read_sql_query("SELECT * FROM companies WHERE longBusinessSummary IS NOT NULL", conn)
finally:
    conn.close()

print(len(primary_df))

11837


In [11]:
# Load the German (DE) source; close the connection even if the query fails.
conn = sqlite3.connect(r'C:\Users\Admin\PycharmProjects\public-comps\backend\scraping\DE\de_company_info.db')
try:
    deutsche_boerse_df = pd.read_sql_query("SELECT * FROM companies WHERE longBusinessSummary IS NOT NULL", conn)
    print(len(deutsche_boerse_df))
finally:
    conn.close()

# Load the UK source with the same query and the same cleanup guarantee.
conn = sqlite3.connect(r'C:\Users\Admin\PycharmProjects\public-comps\backend\scraping\UK\uk_company_info.db')
try:
    lse_df = pd.read_sql_query("SELECT * FROM companies WHERE longBusinessSummary IS NOT NULL", conn)
    print(len(lse_df))
finally:
    conn.close()


3553
1803


In [2]:
def create_merged_database(db_path):
    """Create the `companies` table in the SQLite database at `db_path`.

    The statement uses CREATE TABLE IF NOT EXISTS, so calling this on a
    database that already has the table is a no-op.

    Args:
        db_path: Path of the SQLite database file to open (sqlite3 creates
            the file if it does not exist).
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS companies (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                company_sec_cik TEXT,
                company_name TEXT,
                company_isin TEXT,
                company_ticker TEXT,
                company_exchange TEXT,
                country TEXT,
                industry TEXT,
                sector TEXT,
                longBusinessSummary TEXT,
                embedding BLOB,
                UNIQUE(company_name, company_ticker, company_isin)
            )
        ''')
        conn.commit()
    finally:
        # Release the connection even if the DDL or the commit raised.
        conn.close()

#create_merged_database('merged_company_info.db')

In [3]:
def insert_data(df, conn):
    """Insert every row of `df` into the `companies` table as one atomic batch.

    Uses INSERT OR IGNORE, so a row that would violate a constraint on the
    table is skipped rather than raising. The batch runs inside the
    connection's transaction context: it is committed when all rows were
    processed and rolled back if any row fails, so a failure never leaves a
    half-inserted batch pending on the connection.

    Args:
        df: DataFrame with the columns company_sec_cik, company_name,
            company_isin, company_ticker, company_exchange, country,
            industry, sector and longBusinessSummary. An `embedding` column
            is optional; when it is absent NULL is stored.
        conn: Open sqlite3 connection to a database containing `companies`.

    Raises:
        KeyError: If a required column is missing from a row.
        sqlite3.Error: If SQLite rejects a value; nothing from the batch is kept.
    """
    required_columns = (
        'company_sec_cik', 'company_name', 'company_isin', 'company_ticker',
        'company_exchange', 'country', 'industry', 'sector', 'longBusinessSummary',
    )
    insert_sql = '''
        INSERT OR IGNORE INTO companies (
            company_sec_cik, company_name, company_isin, company_ticker, company_exchange,
            country, industry, sector, longBusinessSummary, embedding
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''
    # Lazily build one parameter tuple per row; `row.get` yields None when the
    # frame has no `embedding` column.
    parameter_rows = (
        tuple(row[column] for column in required_columns) + (row.get('embedding'),)
        for _, row in df.iterrows()
    )
    # `with conn` commits on success and rolls back if executemany raises.
    with conn:
        conn.executemany(insert_sql, parameter_rows)

In [10]:
def load_data(db_path):
    """Load all `companies` rows that have a business summary.

    Args:
        db_path: Path of the SQLite database file to read.

    Returns:
        DataFrame with every column of `companies`, restricted to rows whose
        longBusinessSummary is not NULL.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query("SELECT * FROM companies WHERE longBusinessSummary IS NOT NULL", conn)
    finally:
        # Close the connection even when the query raises.
        conn.close()

In [12]:
# Insert data into merged database
# Make sure the target table exists first: the call is idempotent
# (CREATE TABLE IF NOT EXISTS), and without it a fresh run fails with
# "no such table: companies".
create_merged_database("merged_company_info.db")

conn = sqlite3.connect("merged_company_info.db")
try:
    insert_data(primary_df, conn)
    insert_data(deutsche_boerse_df, conn)
    insert_data(lse_df, conn)
finally:
    # Release the connection even if one of the inserts raises.
    conn.close()

In [13]:
# Sanity check: count the rows of the merged database that have a summary.
# Reuses load_data (same query) instead of repeating the connect/query/close
# sequence inline.
df = load_data("merged_company_info.db")
print(len(df))

17191


In [5]:
# Helpers and pipeline for building the filtered, deduplicated merged database
import sqlite3
import pandas as pd

def remove_duplicates(df):
    """Keep one row per company_name, preferring rows not listed on 'OTC'.

    The input frame is left untouched: the temporary `is_otc` sort key is
    added to a copy (via `assign`) rather than written into `df`, which also
    avoids SettingWithCopyWarning when `df` is a slice of another frame.

    Args:
        df: DataFrame with `company_name` and `company_exchange` columns.

    Returns:
        New DataFrame sorted by company_name with duplicate names removed;
        among rows sharing a name, one whose exchange is not exactly 'OTC'
        is kept when available.
    """
    # 0 for regular exchanges, 1 for OTC, so non-OTC rows sort first per name.
    otc_flag = (df['company_exchange'] == 'OTC').astype(int)
    return (
        df.assign(is_otc=otc_flag)
        .sort_values(by=['company_name', 'is_otc'])
        .drop_duplicates(subset=['company_name'], keep='first')
        .drop(columns=['is_otc'])
    )

def insert_data(df, conn):
    """Insert every row of `df` into the `companies` table as one atomic batch.

    Uses INSERT OR IGNORE, so a row that would violate a constraint on the
    table is skipped rather than raising. The batch runs inside the
    connection's transaction context: it is committed when all rows were
    processed and rolled back if any row fails, so a failure never leaves a
    half-inserted batch pending on the connection.

    Args:
        df: DataFrame with the columns company_sec_cik, company_name,
            company_isin, company_ticker, company_exchange, country,
            industry, sector and longBusinessSummary. An `embedding` column
            is optional; when it is absent NULL is stored.
        conn: Open sqlite3 connection to a database containing `companies`.

    Raises:
        KeyError: If a required column is missing from a row.
        sqlite3.Error: If SQLite rejects a value; nothing from the batch is kept.
    """
    required_columns = (
        'company_sec_cik', 'company_name', 'company_isin', 'company_ticker',
        'company_exchange', 'country', 'industry', 'sector', 'longBusinessSummary',
    )
    insert_sql = '''
        INSERT OR IGNORE INTO companies (
            company_sec_cik, company_name, company_isin, company_ticker, company_exchange,
            country, industry, sector, longBusinessSummary, embedding
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''
    # Lazily build one parameter tuple per row; `row.get` yields None when the
    # frame has no `embedding` column.
    parameter_rows = (
        tuple(row[column] for column in required_columns) + (row.get('embedding'),)
        for _, row in df.iterrows()
    )
    # `with conn` commits on success and rolls back if executemany raises.
    with conn:
        conn.executemany(insert_sql, parameter_rows)

def load_data(db_path):
    """Load all `companies` rows that have a business summary.

    Args:
        db_path: Path of the SQLite database file to read.

    Returns:
        DataFrame with every column of `companies`, restricted to rows whose
        longBusinessSummary is not NULL.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query("SELECT * FROM companies WHERE longBusinessSummary IS NOT NULL", conn)
    finally:
        # Close the connection even when the query raises.
        conn.close()

def filter_invalid_names(df):
    """Drop rows whose company_name looks malformed.

    A name is rejected when it contains a run of three or more whitespace
    characters, or any of the characters ",", "-" or "/".

    Rows with a missing company_name match neither pattern and are kept;
    `na=False` makes the match explicit for them, where negating the raw
    `.str.contains` result would raise TypeError on a missing value.

    Args:
        df: DataFrame with a string-valued `company_name` column.

    Returns:
        The rows of `df` whose name passes both checks, in original order.
    """
    names = df['company_name']
    has_wide_gap = names.str.contains(r'\s{3,}', na=False)
    has_separator = names.str.contains(r'[,\-\/]', na=False)
    return df[~(has_wide_gap | has_separator)]

# Load data from various sources with the specified conditions
conditions = """
    longBusinessSummary != 'N/A' AND
    country != 'N/A' AND
    industry != 'N/A' AND
    sector != 'N/A'
"""

filtered_query = f"SELECT * FROM companies WHERE longBusinessSummary IS NOT NULL AND {conditions}"


def _load_filtered(db_path):
    """Run the filtered query against one source database and close its connection."""
    source_conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(filtered_query, source_conn)
    finally:
        # Passing sqlite3.connect(...) inline would leave the connection open.
        source_conn.close()


# NOTE(review): absolute, machine-specific paths; consider making them configurable.
primary_df = _load_filtered(r'C:\Users\Admin\PycharmProjects\public-comps\backend\scraping\USA\us_company_info.db')
deutsche_boerse_df = _load_filtered(r'C:\Users\Admin\PycharmProjects\public-comps\backend\scraping\DE\de_company_info.db')
lse_df = _load_filtered(r'C:\Users\Admin\PycharmProjects\public-comps\backend\scraping\UK\uk_company_info.db')

# Combine all dataframes; ignore_index gives the combined frame a fresh
# 0..n-1 index instead of repeating each source frame's row labels.
combined_df = pd.concat([primary_df, deutsche_boerse_df, lse_df], ignore_index=True)

print(f"Total records before filtering: {len(combined_df)}")

# Filter out invalid company names
combined_df = filter_invalid_names(combined_df)

print(f"Total records after filtering invalid names: {len(combined_df)}")

# Remove duplicates within the combined dataframe, prioritizing non-OTC exchanges
combined_df = remove_duplicates(combined_df)

print(f"Total records after deduplication: {len(combined_df)}")

# Creating empty merged database 
create_merged_database("merged_company_deduplicated.db")

# Insert data into merged database
conn = sqlite3.connect("merged_company_deduplicated.db")
try:
    insert_data(combined_df, conn)
finally:
    # Release the connection even if the insert raises.
    conn.close()

Total records before filtering: 12391
Total records after filtering invalid names: 8264
Total records after deduplication: 6198
