In [129]:
# Import necessary libraries
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Put the parent of the current working directory (the main repo path) at the
# front of sys.path so that project modules can be imported from this notebook.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(parent_dir) == 0:
    # Only insert once so re-running the cell does not keep growing sys.path.
    sys.path.insert(0, parent_dir)

from utils.data_utils import get_entire_df

# Load the data
data = get_entire_df()

# Display general information about the dataset
def dataset_overview(df):
    """Print a structural and statistical overview of ``df``.

    Three sections are written to standard output, in this order: the
    ``DataFrame.info()`` listing, the ``DataFrame.describe()`` summary
    statistics, and the number of missing values per column.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. The function only prints.
    """
    print("\nDataset Overview:")
    # info() writes its report itself and returns None; wrapping it in print()
    # would append a stray "None" line after the listing.
    df.info()
    print("\nSummary Statistics:")
    print(df.describe())
    print("\nMissing Values:")
    print(df.isnull().sum())

dataset_overview(data)


Created dataframe with shape: (3172, 26)

Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3172 entries, 0 to 3171
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Transaction Name               3172 non-null   object 
 1   Transaction Name URL           3172 non-null   object 
 2   Organization Industries        3128 non-null   object 
 3   Lead Investors                 1557 non-null   object 
 4   Investor Names                 2603 non-null   object 
 5   Money Raised                   2208 non-null   float64
 6   Money Raised Currency          2219 non-null   object 
 7   Money Raised (in USD)          2208 non-null   float64
 8   Funding Type                   3172 non-null   object 
 9   Announced Date                 3172 non-null   object 
 10  Pre-Money Valuation            230 non-null    float64
 11  Pre-Money Valuation Currency   230 non-null    o

In [130]:
#Check for duplicates
def check_duplicates(df):
    """Print the number of rows that ``DataFrame.duplicated()`` flags in ``df``.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        None. The count is printed, not returned.
    """
    duplicate_flags = df.duplicated()
    duplicate_total = duplicate_flags.sum()
    print("\nNumber of Duplicates:", duplicate_total)

check_duplicates(data)


Number of Duplicates: 0


In [None]:
# Preview the first rows of the loaded DataFrame.
# NOTE(review): the output currently saved with this cell is a
# "bound method NDFrame.head" repr, which suggests it was last executed as
# print(data.head) without the call parentheses; re-run the cell to refresh it.
print(data.head())

<bound method NDFrame.head of                      Transaction Name  \
0              Seed Round - Flagright   
1                Seed Round - aboutuz   
2             Seed Round - Kubermatic   
3             Seed Round - MYNE Homes   
4     Pre Seed Round - Emulate Energy   
...                               ...   
3167         Seed Round - Sunmaxx PVT   
3168         Pre Seed Round - Dema.ai   
3169         Pre Seed Round - &Hamlet   
3170            Seed Round - eleQtron   
3171         Seed Round - AirForestry   

                                                            Transaction Name URL  \
0              https://www.crunchbase.com/funding_round/flagright-seed--82849d85   
1                https://www.crunchbase.com/funding_round/aboutuz-seed--9c881e5a   
2             https://www.crunchbase.com/funding_round/kubermatic-seed--286b6112   
3             https://www.crunchbase.com/funding_round/myne-homes-seed--3bf4b676   
4     https://www.crunchbase.com/funding_round/emulate-en

## After a review of the data we can see the following:
1. "Organization Location" consists of City, Region, Country, Continent in the same cell. This has to be split up into separate columns.
2. "Organization Industries" and "Investor Names" include many different values in each cell. These have to be split up and put in separate tables.
3. The following columns contain both numeric values and NaN values: "Money Raised", "Money Raised (in USD)", "Pre-Money Valuation", "Pre-Money Valuation (in USD)", "Total Funding Amount", "Total Funding Amount (in USD)".

In [133]:
# 1. Split "Organization Location" into "City", "Region", "Country", "Continent"
def split_location_columns(df):
    """Split 'Organization Location' into City, Region, Country and Continent.

    The new columns are added to ``df`` in place. If the
    'Organization Location' column is absent, only a message is printed and
    ``df`` is left unchanged.

    The split is anchored on the right: the last three comma-separated parts
    become Region, Country and Continent, and any additional commas stay in
    City. Values with fewer than four parts fill the columns from the left and
    leave the remaining ones missing. Whitespace around every part is removed.

    Args:
        df: pandas DataFrame, modified in place.

    Returns:
        None.
    """
    location_columns = ['City', 'Region', 'Country', 'Continent']
    if 'Organization Location' in df.columns:
        parts = (
            df['Organization Location']
            .str.rsplit(',', n=len(location_columns) - 1, expand=True)
            # Guarantee exactly four positional columns even when no row
            # contains four parts; assigning a narrower frame would raise.
            .reindex(columns=range(len(location_columns)))
        )
        for position, column_name in enumerate(location_columns):
            # Splitting on ',' leaves the blank that follows each comma, so
            # strip it. map() is used instead of .str so that columns which
            # are entirely missing (non-string dtype) pass through untouched.
            df[column_name] = parts[position].map(
                lambda value: value.strip() if isinstance(value, str) else value
            )
        print("\nLocation columns added (City, Region, Country, Continent).")
    else:
        print("\n'Organization Location' column not found.")

split_location_columns(data)

# Drop the original "Organization Location" column now that it has been split.
# errors='ignore' keeps this cell re-runnable and consistent with
# split_location_columns(), which only prints a message (instead of raising)
# when the column is absent.
data.drop(columns='Organization Location', inplace=True, errors='ignore')



Location columns added (City, Region, Country, Continent).


In [135]:
# 2. Creating separate tables for "Organization Industries" and "Investor Names"

def _explode_multivalue_column(df, source_column, exploded_column, key_column='Transaction Name'):
    """Return a long-format table with one row per value found in ``source_column``.

    Each cell of ``source_column`` is split on ', ' and the resulting list is
    exploded so that every value gets its own row next to ``key_column``.
    The original row index is kept, so it repeats for multi-value cells.

    Args:
        df: source DataFrame; it is not modified.
        source_column: name of the column holding ', '-separated values.
        exploded_column: name of the single-value column in the result.
        key_column: column carried along to identify the originating row.

    Returns:
        DataFrame with the columns ``key_column`` and ``exploded_column``.
    """
    pairs = df[[key_column, source_column]].copy()
    pairs[exploded_column] = pairs[source_column].str.split(', ')
    # NOTE(review): a missing source cell stays a single row with a missing
    # value after explode() - confirm whether such rows should be kept.
    return pairs.explode(exploded_column).drop(columns=[source_column])


# One row per (transaction, industry) pair
organization_industries_table = _explode_multivalue_column(
    data, 'Organization Industries', 'Organization_Industry'
)

# One row per (transaction, investor) pair
investor_names_table = _explode_multivalue_column(
    data, 'Investor Names', 'Investor_Name'
)

# Show a sample of both normalized tables
print("\nOrganization Industries Table:")
print(organization_industries_table.head())
print("\nInvestor Names Table:")
print(investor_names_table.head())



Organization Industries Table:
         Transaction Name   Organization_Industry
0  Seed Round - Flagright              Compliance
0  Seed Round - Flagright      Financial Services
0  Seed Round - Flagright                 FinTech
0  Seed Round - Flagright         Fraud Detection
0  Seed Round - Flagright  Information Technology

Investor Names Table:
         Transaction Name        Investor_Name
0  Seed Round - Flagright   Charles Delingpole
0  Seed Round - Flagright     Donald Bringmann
0  Seed Round - Flagright     Erik Muttersbach
0  Seed Round - Flagright  Four Cities Capital
0  Seed Round - Flagright    Fredrik Thomassen


In [136]:
# After creating the new tables, we can drop the original columns from the main DataFrame

def drop_original_columns(df):
    """Remove the multi-value source columns from ``df`` in place.

    Drops 'Organization Industries' and 'Investor Names' and prints a
    confirmation. A KeyError from pandas propagates if either column is
    missing.

    Args:
        df: pandas DataFrame, modified in place.

    Returns:
        None.
    """
    obsolete_columns = ['Organization Industries', 'Investor Names']
    df.drop(obsolete_columns, axis=1, inplace=True)
    print("\nOriginal columns dropped.")

drop_original_columns(data)


Original columns dropped.


In [137]:
# 3. Change from NaN to 0 for the numeric columns: "Money Raised", "Money Raised (in USD)", "Pre-Money Valuation", "Pre-Money Valuation (in USD)", "Total Funding Amount", "Total Funding Amount (in USD)"

def fill_na_with_zero(df, numeric_columns=None):
    """Replace missing values with 0 in the given columns of ``df``, in place.

    Args:
        df: pandas DataFrame, modified in place.
        numeric_columns: optional iterable of column names to fill. When
            omitted, the six funding/valuation amount columns listed below
            are used. Every named column must exist in ``df``; otherwise
            pandas raises a KeyError.

    Returns:
        None. A confirmation message is printed.
    """
    if numeric_columns is None:
        numeric_columns = [
            'Money Raised',
            'Money Raised (in USD)',
            'Pre-Money Valuation',
            'Pre-Money Valuation (in USD)',
            'Total Funding Amount',
            'Total Funding Amount (in USD)',
        ]
    # Materialise as a list so any iterable (tuple, Index, generator) can be
    # used for column selection.
    numeric_columns = list(numeric_columns)
    # NOTE(review): after this step a value that was missing can no longer be
    # told apart from a genuine 0 - confirm this is acceptable for aggregates.
    df[numeric_columns] = df[numeric_columns].fillna(0)
    print("\nMissing values filled with 0 for numeric columns.")

fill_na_with_zero(data)

#Display the first few rows of the cleaned data focus on the changed columns
print("\nCleaned Data:")
print(data.head())


Missing values filled with 0 for numeric columns.

Cleaned Data:
                  Transaction Name  \
0           Seed Round - Flagright   
1             Seed Round - aboutuz   
2          Seed Round - Kubermatic   
3          Seed Round - MYNE Homes   
4  Pre Seed Round - Emulate Energy   

                                                         Transaction Name URL  \
0           https://www.crunchbase.com/funding_round/flagright-seed--82849d85   
1             https://www.crunchbase.com/funding_round/aboutuz-seed--9c881e5a   
2          https://www.crunchbase.com/funding_round/kubermatic-seed--286b6112   
3          https://www.crunchbase.com/funding_round/myne-homes-seed--3bf4b676   
4  https://www.crunchbase.com/funding_round/emulate-energy-pre-seed--71a33333   

        Lead Investors  Money Raised Money Raised Currency  \
0    Moonfire Ventures     2800000.0                   USD   
1        FasterCapital      632000.0                   USD   
2  NetApp Excellerator          

## Suggestion on Star Schema

### FactFunding
| Column                     | Description                        |
|----------------------------|------------------------------------|
| Transaction ID (PK)         | Unique identifier for the transaction |
| Organization ID (FK)        | Foreign key referencing DimensionOrganization |
| Lead Investor ID (FK)       | Foreign key referencing DimensionInvestor |
| Money Raised                | Amount of money raised in local currency |
| Money Raised (in USD)       | Amount of money raised in USD |
| Funding Type                | Type of funding (e.g., Seed, Pre-Seed) |
| Announced Date              | Date when the funding was announced |
| Funding Stage               | Stage of funding (e.g., Seed, Series A) |
| Number of Funding Rounds    | Number of funding rounds for the organization |
| Total Funding Amount        | Total amount of funding raised in local currency |
| Total Funding Amount (in USD) | Total amount of funding raised in USD |
| Equity Only                 | Whether it was equity-only funding (Yes/No) |

---

### DimensionOrganization
| Column                     | Description                        |
|----------------------------|------------------------------------|
| Organization ID (PK)        | Unique identifier for the organization |
| Organization Name           | Name of the organization           |
| City                        | City where the organization is located |
| Region                      | Region where the organization is located |
| Country                     | Country where the organization is located |
| Continent                   | Continent where the organization is located |
| Organization URL            | URL of the organization's Crunchbase profile |
| Organization Description     | Brief description of the organization |
| Organization Website        | Website of the organization        |

---

### DimensionInvestor
| Column                     | Description                        |
|----------------------------|------------------------------------|
| Investor ID (PK)            | Unique identifier for the investor |
| Investor Name               | Name of the investor               |
| Lead Investor               | Whether the investor is a lead investor (Yes/No) |
| Transaction ID (FK)         | Foreign key referencing FactFunding |

---

### DimensionIndustry
| Column                     | Description                        |
|----------------------------|------------------------------------|
| Industry ID (PK)            | Unique identifier for the industry |
| Industry Name               | Name of the industry               |
| Transaction ID (FK)         | Foreign key referencing FactFunding |
