# Merging Orbis Miscellaneous Datasets

## This notebook merges the Orbis miscellaneous datasets (Industry, Legal, Location data, ownership, etc.) into a single dataset.

## Technical Notes:

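- This notebook uses `dask` instead of `pandas` for the merge, with the aim of handling datasets that do not fit into memory. Each Excel file is still read with `pandas` before being converted to a `dask` DataFrame, so every individual file must fit into memory.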

## How to use:

- Make sure you have the required libraries installed. You can install them using pip (`openpyxl` is needed by `pandas` to read the `.xlsx` files):
```bash
pip install "dask[complete]" openpyxl
```

- Change the `dataset_dir` variable to point to the directory where your Orbis datasets are stored.

- Change the `out_dir` variable to point to the directory where you want to save the merged dataset.

- Run the notebook. The merged dataset will be saved as a CSV file in the specified output directory.




In [4]:
import dask.dataframe as dd
import pandas as pd
import os

In [5]:
# The input folder is resolved relative to the PARENT of the current working
# directory, i.e. "unmerged-datasets" is a sibling of the directory the
# notebook is run from.
script_dir = os.path.dirname(os.path.abspath(os.getcwd()))
dataset_dir = os.path.join(script_dir, "unmerged-datasets/orbis-other")

# One single-partition dask DataFrame per Excel workbook found in dataset_dir.
dfs = []

# os.listdir() returns entries in arbitrary order; sorting makes the load order
# (and therefore anything that later processes `dfs` in list order)
# reproducible across machines and runs.
for filename in sorted(os.listdir(dataset_dir)):
    # Skip anything that is not a workbook, plus the "~$<name>.xlsx" lock files
    # Excel creates while a workbook is open (they are not readable workbooks).
    if not filename.endswith('.xlsx') or filename.startswith('~$'):
        continue
    # Each workbook is read fully into memory by pandas, then wrapped in dask.
    df = pd.read_excel(os.path.join(dataset_dir, filename))
    df = dd.from_pandas(df, npartitions=1)
    print(filename)
    print(df.columns)
    dfs.append(df)



Location_data.xlsx
Index(['Company name Latin alphabet', 'Orbis ID number', 'Address Line 1',
       'Latitude', 'Longitude', 'BvD ID number', 'City', 'Country ISO code',
       'Full Address'],
      dtype='object')
Ownership_combined_Part_1.xlsx
Index(['Unnamed: 0', 'Company name Latin alphabet', 'BvD ID number',
       'Orbis ID number', 'Ticker symbol',
       'No of companies in corporate group', 'Entity type',
       'ISH - BvD ID number', 'ISH - Orbis ID number'],
      dtype='object')
Ownership_combined_Part_3.xlsx
Index(['Unnamed: 0', 'Company name Latin alphabet', 'BvD ID number',
       'Orbis ID number', 'GUO - US SIC, Core code', 'GUO - Direct %',
       'GUO - Total %', 'GUO - Operating revenue (Turnover)\nm USD',
       'GUO - Ticker symbol', 'GUO - Total assets\nm USD',
       'GUO - Number of employees'],
      dtype='object')
Ownership_combined_Part_2.xlsx
Index(['Unnamed: 0', 'Company name Latin alphabet', 'BvD ID number',
       'Orbis ID number', 'GUO - Name', 'GUO

In [None]:
# Columns that identify a company; every frame is outer-joined on both of them.
index_cols = ["Orbis ID number", "BvD ID number"]

merged_df = None

# Step 1: normalise the key columns (string, upper-case) so the same ID
# matches across workbooks. Frames lacking a key column cannot be joined, so
# they are reported and left out instead of failing later inside dd.merge.
# NOTE(review): astype(str) turns missing IDs into the literal "nan" and would
# render a float-typed ID as e.g. "123.0" -- confirm the ID columns are
# complete and consistently typed across workbooks.
mergeable_dfs = []
for df in dfs:
    missing_cols = [col for col in index_cols if col not in df.columns]
    if missing_cols:
        for col in missing_cols:
            print(f"Column '{col}' not found in {df.columns}")
        continue
    for col in index_cols:
        df[col] = df[col].astype(str).str.upper()
    mergeable_dfs.append(df)

# Step 2: fold the frames together with successive outer joins.
for df in mergeable_dfs:
    if merged_df is None:
        merged_df = df
        continue

    merged_df = dd.merge(merged_df, df, on=index_cols, how='outer', suffixes=('', '_y'))
    duplicate_cols = [col for col in merged_df.columns if col.endswith('_y')]

    # A non-key column present in both frames comes back as "<name>" (left) and
    # "<name>_y" (right). Rows that exist only in the right frame have NaN in
    # "<name>", so fill those gaps from "<name>_y" before discarding it;
    # otherwise the outer join silently loses the right-hand values.
    # NOTE(review): this also applies to 'Unnamed: 0', which looks like a
    # leftover row index from an earlier export -- consider dropping it.
    for dup_col in duplicate_cols:
        base_col = dup_col[:-len('_y')]
        if base_col in merged_df.columns:
            merged_df = merged_df.assign(
                **{base_col: merged_df[base_col].combine_first(merged_df[dup_col])}
            )

    merged_df = merged_df.drop(duplicate_cols, axis=1)

if merged_df is None:
    raise ValueError(
        f"Nothing to merge: no loaded dataframe contains all key columns {index_cols}"
    )

print(merged_df.columns)
print("Number of columns in merged dataframe:", len(merged_df.columns))
# Counting rows triggers the actual (lazy) dask computation of the merge.
print("Number of rows in merged dataframe:", merged_df.shape[0].compute())
