# Merging Orbis Miscellaneous Datasets

## This notebook merges the Orbis miscellaneous datasets: the industry, legal, and location data are combined into one dataset, and the ownership data is combined into a second, separate dataset.

## Technical Notes:

- This notebook uses `dask` instead of `pandas` to handle large datasets that do not fit into memory. 

- Ownership data is merged separately to avoid memory issues, as it can be quite large.

- The `orbis-info-merged.csv` file consists of the following files: "Industries&Activities_Final_Dataset.csv", "legal_combined_new.csv", "Location_data.csv"
 
- The `orbis-ownership-merged.csv` file consists of the "Ownership_combined_Part*.csv" files.

- Some columns contain mixed types, so every column is read as a string (`dtype=str`).

## How to use:

- Make sure you have the required libraries installed. You can install them using pip:
```bash
pip install "dask[complete]"
```

- Change the `dataset_dir` variable to point to the directory where your Orbis datasets are stored.

- Run the notebook. The merged datasets will be saved as two CSV files (`orbis-info-merged.csv` and `orbis-ownership-merged.csv`) in the directory given by the `out_dir` variable.




In [3]:
import dask.dataframe as dd
import pandas as pd
import os
import gc


In [4]:
# Input and output folders, both built from the current working directory.
base_dir = os.getcwd()
dataset_dir = os.path.join(base_dir, "../unmerged-datasets/orbis-other")
out_dir = os.path.join(base_dir, "../merged-datasets")

In [None]:
# Build `merged_info_df` by outer-joining the info CSV files on the two ID columns.
index_cols = ["Orbis ID number", "BvD ID number"]

merged_info_df = None

info_files = [
    "Industries&Activities_Final_Dataset.csv",
    "legal_combined_new.csv",
    "Location_data.csv"
]

for filename in info_files:
    # Every column is read as a string.
    df = dd.read_csv(os.path.join(dataset_dir, filename), dtype=str)
    # This seems to be a redundant column that appears in some files
    # but not others, so we drop it if it exists.
    if "Unnamed: 0" in df.columns:
        df = df.drop("Unnamed: 0", axis=1)
    # Upper-case the BvD ID so the join key is case-insensitive across files.
    df["BvD ID number"] = df["BvD ID number"].str.upper()
    if merged_info_df is None:
        merged_info_df = df
    else:
        # Merge the new dataframe with the existing merged dataframe.
        # Non-key columns present on both sides get a '_dup' suffix on the right.
        merged_info_df = dd.merge(merged_info_df, df, on=index_cols, how='outer', suffixes=('', '_dup'))
        dup_cols = [col for col in merged_info_df.columns if col.endswith('_dup')]
        # With an outer join, rows that only exist in the right-hand file have
        # nulls in the left-hand copy of a shared column. Fill those gaps from
        # the '_dup' copy before dropping it, otherwise that data is lost.
        for dup_col in dup_cols:
            base_col = dup_col[:-len('_dup')]
            if base_col in merged_info_df.columns:
                merged_info_df[base_col] = merged_info_df[base_col].fillna(merged_info_df[dup_col])
        if dup_cols:
            merged_info_df = merged_info_df.drop(columns=dup_cols)

        # Clean up memory
        merged_info_df = merged_info_df.repartition(partition_size="100MB")
        del df
        gc.collect()

if merged_info_df is not None:
    print(merged_info_df.columns)
    print("Number of columns in merged info dataframe:", len(merged_info_df.columns))
    print(f"Number of rows in merged info dataframe: {merged_info_df.shape[0].compute()}")
else:
    print("No info files found in the dataset directory.")

Index(['Company name Latin alphabet', 'BvD ID number', 'Orbis ID number',
       'Ticker symbol', 'Primary code(s) in national industry classification',
       'US SIC, core code (3 digits)', 'BvD sectors', 'Peer Group Name',
       'Peer Group Size', 'Main customers', 'Status', 'Status date',
       'Status updated date', 'Date of incorporation', 'Address Line 1',
       'Latitude', 'Longitude', 'City', 'Country ISO code', 'Full Address'],
      dtype='object')
Number of columns in merged info dataframe: 20
Number of rows in merged info dataframe: 1272454


In [30]:
# Write the merged info data to one CSV file (single_file=True), without the index.
info_csv_path = os.path.join(out_dir, "orbis-info-merged.csv")
merged_info_df.to_csv(info_csv_path, single_file=True, index=False)

['/Users/koacow/repos/glob-s-RA/firms-and-disaster/merged-datasets/orbis-info-merged.csv']

In [None]:
# Build `merged_ownership_df` by outer-joining every "Ownership_*.csv" file
# in `dataset_dir` on the two ID columns.
merged_ownership_df = None

index_cols = ["BvD ID number", "Orbis ID number"]
# NOTE: this mapping is not passed to read_csv below, which already reads every
# column as a string via dtype=str. It is kept so the cell defines the same names.
dtypes = {
    "Orbis ID number": "str",
    "BvD ID number": "str",
    "GUO - Orbis ID number": "str",
    "GUO - BvD ID number": "str",
    "ISH - Orbis ID number": "str",
    "ISH - BvD ID number": "str",
    'GUO - Operating revenue (Turnover)\nm USD': 'str',
    'GUO - Total %': 'str',
    'GUO - Total assets\nm USD': 'str',
    'GUO - Direct %': 'str',
    'GUO - Number of employees': 'str',
    'DUO - Total assets\nm USD': 'str',
    'No of subsidiaries': 'str',
    'Number of publications': 'str',
    'DUO - Operating revenue (Turnover)\nm USD': 'str',
    'DUO - Orbis ID number': 'str',
    'DUO - Total %': 'str',
    'DUO - Direct %': 'str',
    'DUO - Number of employees': 'str',
    'HQ - HeadquartersCity': 'str',
    'HQ - HeadquartersCountry ISO code': 'str',
    'HQ - HeadquartersName': 'str'
}

# os.listdir returns names in arbitrary order; sort them so the merge order
# (and therefore the result) is the same on every run and every machine.
ownership_files = sorted(
    name for name in os.listdir(dataset_dir)
    if name.startswith("Ownership_") and name.endswith(".csv")
)

# NOTE(review): if the Ownership part files are row-wise splits with identical
# columns, dd.concat would likely be cheaper than repeated outer merges - confirm.
for filename in ownership_files:
    df = dd.read_csv(os.path.join(dataset_dir, filename), dtype=str)

    # These seem to be redundant columns that appear in some files but not others,
    # so we drop them if they exist.
    if "Unnamed: 0" in df.columns:
        df = df.drop("Unnamed: 0", axis=1)
    if "Unnamed: 0.1" in df.columns:
        df = df.drop("Unnamed: 0.1", axis=1)
    # Upper-case the BvD ID so the join key is case-insensitive across files.
    df["BvD ID number"] = df["BvD ID number"].str.upper()
    if merged_ownership_df is None:
        merged_ownership_df = df
    else:
        # Non-key columns present on both sides get a '_dup' suffix on the right.
        merged_ownership_df = dd.merge(merged_ownership_df, df, on=index_cols, how='outer', suffixes=('', '_dup'))
        dup_cols = [col for col in merged_ownership_df.columns if col.endswith('_dup')]
        # With an outer join, rows that only exist in the right-hand file have
        # nulls in the left-hand copy of a shared column. Fill those gaps from
        # the '_dup' copy before dropping it, otherwise that data is lost.
        for dup_col in dup_cols:
            base_col = dup_col[:-len('_dup')]
            if base_col in merged_ownership_df.columns:
                merged_ownership_df[base_col] = merged_ownership_df[base_col].fillna(merged_ownership_df[dup_col])
        # Remove duplicate columns that were created during the merge
        if dup_cols:
            merged_ownership_df = merged_ownership_df.drop(columns=dup_cols)

        # Clean up memory
        merged_ownership_df = merged_ownership_df.repartition(partition_size="100MB")
        del df
        gc.collect()
if merged_ownership_df is not None:
    print(merged_ownership_df.columns)
    print("Number of columns in merged ownership dataframe:", len(merged_ownership_df.columns))
    print(f"Number of rows in merged ownership dataframe: {merged_ownership_df.shape[0].compute()}")
else:
    print("No ownership files found in the dataset directory.")

Index(['Company name Latin alphabet', 'BvD ID number', 'Orbis ID number',
       'Ticker symbol', 'No of companies in corporate group', 'Entity type',
       'ISH - BvD ID number', 'ISH - Orbis ID number', 'GUO - Name',
       'GUO - BvD ID number', 'GUO - Orbis ID number', 'GUO - Ticker symbol',
       'GUO - Country ISO code', 'GUO - City', 'GUO - Type',
       'GUO - US SIC, Core code', 'GUO - Direct %', 'GUO - Total %',
       'GUO - Operating revenue (Turnover)\nm USD',
       'GUO - Total assets\nm USD', 'GUO - Number of employees', 'DUO - Name',
       'DUO - BvD ID number', 'DUO - Orbis ID number',
       'DUO - Country ISO code', 'DUO - City', 'DUO - Type',
       'DUO - US SIC, Core code', 'DUO - Direct %', 'DUO - Total %',
       'DUO - Operating revenue (Turnover)\nm USD',
       'DUO - Total assets\nm USD', 'DUO - Number of employees',
       'No of subsidiaries', 'Number of publications', 'HQ - HeadquartersName',
       'HQ - HeadquartersBvD ID number', 'HQ - Headquarters

In [13]:
# Drop rows where every column in index_cols is null (how='all'),
# then display how many rows remain.
merged_ownership_df = merged_ownership_df.dropna(how='all', subset=index_cols)
remaining_row_count = merged_ownership_df.shape[0]
remaining_row_count.compute()

49573020

In [None]:
# Write the merged ownership data to one CSV file (single_file=True), without the index.
ownership_csv_path = os.path.join(out_dir, "orbis-ownership-merged.csv")
merged_ownership_df.to_csv(ownership_csv_path, single_file=True, index=False)