In [1]:
import os
import sys
from tqdm import tqdm
import pandas as pd

from models import Full_Record
from db_conn import engine, session_scope, ping_db, Base

# Put the parent of the current working directory (the main repo path) at the
# front of sys.path so that repo-level packages can be imported from this notebook.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(parent_dir) == 0:
    # Only insert once, so re-running the cell does not pile up duplicates.
    sys.path.insert(0, parent_dir)

# Now you can import from utils
from utils.data_utils import get_entire_df

In [2]:
# Connectivity check: the value returned by ping_db is displayed as this cell's
# output, so a failed connection is visible before any table is touched.
ping_db(engine)

True

In [3]:
# Load the complete dataset through the project helper; `df` is the frame that
# the preprocessing and upload cells below operate on.
df = get_entire_df()

Created dataframe with shape: (3172, 26)


In [4]:
# Some preprocessing is necessary for proper commits to the database.
# TODO: this should be moved to our actual preprocessing.

# Columns grouped by the Python type they must carry before being handed to the ORM.
TEXT_COLUMNS = [
    'Transaction Name',
    'Transaction Name URL',
    'Organization Industries',
    'Lead Investors',
    'Investor Names',
    'Money Raised Currency',
    'Funding Type',
    'Pre-Money Valuation Currency',
    'Organization Name',
    'Organization Name URL',
    'Organization Location',
    'Organization Description',
    'Organization Website',
    'Funding Stage',
    'Total Funding Amount Currency',
    'Equity Only Funding',
    'Funding Status',
]
NUMERIC_COLUMNS = [
    'Money Raised',
    'Money Raised (in USD)',
    'Pre-Money Valuation',
    'Pre-Money Valuation (in USD)',
    'Number of Funding Rounds',
    'Total Funding Amount',
    'Total Funding Amount (in USD)',
    'Number of Investors',
]
DATETIME_COLUMNS = ['Announced Date']


def _to_text_or_none(value):
    """Return ``str(value)``, or ``None`` when ``value`` is a missing scalar.

    A plain ``.astype(str)`` would turn NaN/None into the literal strings
    ``'nan'``/``'None'``, which would then be stored in the database as if they
    were real text.
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return str(value)


def prepare_for_db(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``frame`` with column types normalised for the upload.

    * Text columns become ``str``; missing entries become ``None``.
    * Numeric columns go through ``pd.to_numeric``; unparsable entries become NaN.
    * Date columns go through ``pd.to_datetime``; unparsable entries become NaT.

    The input frame is not modified, and applying the function to its own
    output yields the same result, so the cell can safely be re-run.

    NOTE(review): missing text is now passed on as ``None`` instead of the
    string ``'nan'`` -- confirm the text columns of the model are nullable.
    """
    prepared = frame.copy()

    for column in TEXT_COLUMNS:
        # dtype=object keeps the None entries from being converted back to NaN.
        prepared[column] = pd.Series(
            [_to_text_or_none(value) for value in prepared[column]],
            index=prepared.index,
            dtype=object,
        )

    for column in NUMERIC_COLUMNS:
        prepared[column] = pd.to_numeric(prepared[column], errors='coerce')

    for column in DATETIME_COLUMNS:
        prepared[column] = pd.to_datetime(prepared[column], errors='coerce')

    return prepared


df = prepare_for_db(df)

In [5]:
# Reset the schema: drop the tables registered on Base.metadata, then recreate
# them according to the currently defined model(s).
# WARNING: destructive -- every row already stored in those tables is lost, and
# drop_all is not limited to a single table. Run this only when a full
# re-import (see the upload cell below) is intended.
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)

In [6]:
# Maps each DataFrame column to the Full_Record keyword argument it populates.
_COLUMN_TO_FIELD = {
    'Transaction Name': 'transaction_name',
    'Transaction Name URL': 'transaction_name_url',
    'Organization Industries': 'organization_industries',
    'Lead Investors': 'lead_investors',
    'Investor Names': 'investor_names',
    'Money Raised': 'money_raised',
    'Money Raised Currency': 'money_raised_currency',
    'Money Raised (in USD)': 'money_raised_in_usd',
    'Funding Type': 'funding_type',
    'Announced Date': 'announced_date',
    'Pre-Money Valuation': 'pre_money_valuation',
    'Pre-Money Valuation Currency': 'pre_money_valuation_currency',
    'Pre-Money Valuation (in USD)': 'pre_money_valuation_in_usd',
    'Organization Name': 'organization_name',
    'Organization Name URL': 'organization_name_url',
    'Organization Location': 'organization_location',
    'Organization Description': 'organization_description',
    'Organization Website': 'organization_website',
    'Funding Stage': 'funding_stage',
    'Number of Funding Rounds': 'number_of_funding_rounds',
    'Total Funding Amount': 'total_funding_amount',
    'Total Funding Amount Currency': 'total_funding_amount_currency',
    'Total Funding Amount (in USD)': 'total_funding_amount_in_usd',
    'Equity Only Funding': 'equity_only_funding',
    'Funding Status': 'funding_status',
    'Number of Investors': 'number_of_investors',
}

# Columns whose values are passed through pd.to_datetime before insertion.
_DATETIME_COLUMNS = frozenset({'Announced Date'})


def _db_value(column, value):
    """Convert one cell into the value handed to the ORM.

    Date columns are parsed with ``pd.to_datetime``; any missing scalar
    (NaN, NaT, pd.NA, None) becomes ``None`` so that it is sent to the
    database as NULL rather than as a NaN/NaT object.
    """
    if column in _DATETIME_COLUMNS:
        value = pd.to_datetime(value)
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value


def upload_all_to_db(df: pd.DataFrame):
    """Insert every row of ``df`` into the database as a ``Full_Record``.

    Args:
        df: frame containing all columns listed in ``_COLUMN_TO_FIELD``.

    Returns:
        None. The outcome is reported on stdout.

    Behaviour:
        * If ``ping_db(engine)`` is falsy, nothing is imported and a message
          says so.
        * All rows are added to one session and committed together. On any
          exception the session is rolled back and the error is printed
          together with the last row that was being processed; the exception
          is not re-raised (best-effort behaviour kept from the original).
    """
    if not ping_db(engine):
        print("Database is not reachable; no records were imported.")
        return

    with session_scope() as session:
        # In case existing records need to be scrapped before a new import,
        # uncomment the line below:
        # session.query(Full_Record).delete()

        # Bound before the loop so the error handler can always refer to it,
        # even when the failure happens before the first row is read.
        current_row = None
        try:
            for _, current_row in tqdm(df.iterrows(), total=df.shape[0], desc='Importing records to database...'):
                fields = {
                    field: _db_value(column, current_row[column])
                    for column, field in _COLUMN_TO_FIELD.items()
                }
                session.add(Full_Record(**fields))
            session.commit()
            print(f"Successfully commited {df.shape[0]} records to the database.")

        except Exception as e:
            session.rollback()
            # A failure raised by commit() is not tied to a single row, so the
            # row is reported as "last processed" rather than as the culprit.
            last_row = current_row if current_row is not None else '<no row was processed>'
            print(f"""
                An error occurred: {e};
                Exception type: {type(e).__name__}
                Last row processed before the failure:
                {last_row}
                """)

In [7]:
# Run the import for the preprocessed frame; the function prints a progress bar
# and either a success message or the error details.
upload_all_to_db(df)

Importing records to database...: 100%|██████████| 3172/3172 [00:00<00:00, 15572.15it/s]


Successfully commited 3172 records to the database.


In [9]:
# Example query: fetch the record with the lowest id.
with session_scope() as session:
    # Without an explicit ORDER BY the database may return any row for first().
    first_entry = session.query(Full_Record).order_by(Full_Record.id).first()

    # first() returns None when the table is empty, so guard before attribute access.
    if first_entry is None:
        print("No records found in the database.")
    else:
        # we can now use this object notation for further processing of our requests
        print(first_entry.id)
        print(first_entry.transaction_name)

1
Seed Round - Flagright
