# Merge aggregated ETF and Fund ownership

In [None]:
import os
import logging
from typing import List, Optional
import pandas as pd


def aggregation_function(input_file: str, output_file: str, fund_type: str = "etf") -> None:
    """
    Sum holdings per ('stock_RIC', 'date') pair and write the totals to a CSV file.

    The columns 'stock_value_held' and 'percent_of_traded_shares' are summed within
    each group. For a recognized fund type both columns receive a type-specific
    prefix (e.g. 'ETF_stock_value_held'); otherwise they keep their original names
    and a warning is logged. Any exception is logged and swallowed, so the caller
    never sees an error and no output file is written in that case.

    Parameters:
    - input_file: Path to the input CSV file.
    - output_file: Path to the output CSV file.
    - fund_type: One of 'etf', 'mutual fund', 'index fund', 'active fund'.
    """
    value_columns = ['stock_value_held', 'percent_of_traded_shares']

    # Column-name prefix used for each supported fund type.
    column_prefixes = {
        "etf": "ETF",
        "mutual fund": "FUND",
        "index fund": "INDEX_FUND",
        "active fund": "ACTIVE_FUND",
    }

    try:
        logging.info(f"Aggregating data from {input_file} for fund type: {fund_type}")
        holdings = pd.read_csv(input_file, index_col=False)

        # One output row per (stock, date) with the summed value columns.
        per_stock_date = holdings.groupby(['stock_RIC', 'date'])[value_columns]
        totals = per_stock_date.sum().reset_index()

        prefix = column_prefixes.get(fund_type)
        if prefix is None:
            logging.warning(f"Unknown fund type '{fund_type}'. Columns will not be renamed.")
        else:
            totals = totals.rename(
                columns={name: f"{prefix}_{name}" for name in value_columns}
            )
            logging.info(f"Renamed columns for fund type: {fund_type}")

        totals.to_csv(output_file, index=False)
        logging.info(f"Aggregated data saved to {output_file}")

    except Exception as e:
        logging.error(f"Error in aggregation_function for {input_file}: {e}")

def merge_csv_files(file1: str, file2: str, output_file: str, columns_to_add: List[str]) -> None:
    """
    Left-merges two CSV files on 'date' and 'stock_RIC', adds specified columns, and writes the result to a new CSV.

    Every row of the first file is kept. Rows without a match in the second file
    get the string 'NA' in the added columns. Any exception is logged and
    swallowed, so the caller never sees an error and the output file is left
    untouched in that case.

    Parameters:
    - file1: Path to the first input CSV file.
    - file2: Path to the second input CSV file.
    - output_file: Path to the output CSV file (may be the same path as file1).
    - columns_to_add: List of column names to add from the second CSV.
    """
    try:
        logging.info(f"Merging {file1} with {file2}, adding columns: {columns_to_add}")
        df1 = pd.read_csv(file1)
        df2 = pd.read_csv(file2)

        key_columns = ['date', 'stock_RIC']

        # Parse dates on both sides so the join keys share a dtype;
        # unparseable values become NaT instead of raising.
        df1['date'] = pd.to_datetime(df1['date'], errors='coerce')
        df2['date'] = pd.to_datetime(df2['date'], errors='coerce')

        # Only the join keys and the requested columns are taken from df2.
        df2_subset = df2[key_columns + list(columns_to_add)]

        merged_df = pd.merge(df1, df2_subset, on=key_columns, how='left')

        # Assign the filled column back explicitly. The previous
        # `merged_df[col].fillna(..., inplace=True)` operated on a temporary
        # Series, which warns on recent pandas and does nothing under
        # Copy-on-Write.
        for col in columns_to_add:
            merged_df[col] = merged_df[col].fillna('NA')

        merged_df.to_csv(output_file, index=False)
        logging.info(f"Merged data saved to {output_file}")

    except Exception as e:
        logging.error(f"Error merging {file1} and {file2}: {e}")

def apply_merge_function(subset: str = "none", base_dir: Optional[str] = None) -> None:
    """
    Builds the final ownership file by merging the aggregated fund files and
    the stock-level data onto the index-constituents file.

    The merges run in a fixed order (ETF, mutual fund, index fund, active fund,
    market cap). The first one starts from the constituents file; each later one
    reads the output file written by the previous step and overwrites it.
    merge_csv_files logs and swallows its own errors, so a failed step does not
    stop the later steps.

    Parameters:
    - subset: "europe" selects the '*_europe' aggregated files and output name;
      any other value selects the unfiltered files.
    - base_dir: Root folder of the eikon data sets. Defaults to the original
      hard-coded location when omitted.
    """
    try:
        logging.info(f"Applying merge function with subset: {subset}")

        if base_dir is None:
            base_dir = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data'
        aggregated_dir = os.path.join(base_dir, 'fund_holdings_data', 'aggregated_data')
        final_dir = os.path.join(base_dir, 'fund_holdings_data')

        # The Europe subset differs only by a '_europe' marker in the file names.
        suffix = "_europe" if subset == "europe" else ""
        output_file = os.path.join(final_dir, f"formatted_final{suffix}_van.csv")

        # Paths to index members and stock level data
        formatted_index_member = os.path.join(base_dir, 'index_constituents_data', 'formated_constituents.csv')
        file_m_stock = os.path.join(base_dir, 'stock_level_data', 'm_stock_level_data.csv')

        # (file to merge in, columns taken from it), in execution order.
        merge_steps = [
            (
                os.path.join(aggregated_dir, f"etf_aggregated_data{suffix}.csv"),
                ['ETF_stock_value_held', 'ETF_percent_of_traded_shares'],
            ),
            (
                os.path.join(aggregated_dir, f"fund_aggregated_data{suffix}.csv"),
                ["FUND_stock_value_held", "FUND_percent_of_traded_shares"],
            ),
            (
                os.path.join(aggregated_dir, f"index_fund_aggregated_data{suffix}.csv"),
                ["INDEX_FUND_stock_value_held", "INDEX_FUND_percent_of_traded_shares"],
            ),
            (
                os.path.join(aggregated_dir, f"active_fund_aggregated_data{suffix}.csv"),
                ["ACTIVE_FUND_stock_value_held", "ACTIVE_FUND_percent_of_traded_shares"],
            ),
            (
                file_m_stock,
                ['market_cap'],
            ),
        ]

        # The first merge starts from the constituents file; every later merge
        # extends the output file produced by the previous step.
        left_file = formatted_index_member
        for right_file, columns in merge_steps:
            merge_csv_files(
                file1=left_file,
                file2=right_file,
                output_file=output_file,
                columns_to_add=columns
            )
            left_file = output_file

        logging.info(f"Final merged data saved to {output_file}")

    except Exception as e:
        logging.error(f"Error in apply_merge_function: {e}")

def main():
    """
    Aggregate every fund-type file (full sample and Europe subset), then build
    the final merged ownership file for the configured subset.
    """
    holdings_dir = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data'
    filtered_dir = os.path.join(holdings_dir, 'fund_type_filtered')
    aggregated_dir = os.path.join(holdings_dir, 'aggregated_data')
    merge_subset = "europe"  # Change to "none" if no subset filtering is needed

    # File-name stem used for each fund type's filtered and aggregated CSVs.
    file_stems = {
        "etf": "etf",
        "mutual fund": "fund",
        "index fund": "index_fund",
        "active fund": "active_fund",
    }

    # Every fund type is aggregated twice: unfiltered first, then Europe only.
    tasks = [
        (fund_type, subset)
        for fund_type in ("etf", "mutual fund", "index fund", "active fund")
        for subset in ("none", "europe")
    ]

    for fund_type, subset in tasks:
        stem = file_stems.get(fund_type)
        if stem is None:
            logging.warning(f"Unknown fund type: {fund_type}. Skipping aggregation.")
            continue

        if subset == "none":
            input_filename = f"{stem}_data_van.csv"
        else:
            input_filename = f"{stem}_data_europe_van.csv"
        output_suffix = "_europe" if subset == "europe" else ""
        output_filename = f"{stem}_aggregated_data{output_suffix}.csv"

        aggregation_function(
            input_file=os.path.join(filtered_dir, input_filename),
            output_file=os.path.join(aggregated_dir, output_filename),
            fund_type=fund_type,
        )

    apply_merge_function(subset=merge_subset)


if __name__ == "__main__":
    main()


## Calculate ownership percentage: value held by each fund type relative to the stock's market capitalization

In [None]:
import pandas as pd

def calculate_ownership_percentage(subset: str = "none", file_path=None) -> None:
    """
    Adds one ownership-ratio column per fund type to the final merged CSV, in place.

    For each prefix in ETF, FUND, INDEX_FUND and ACTIVE_FUND the column
    '<prefix>_ownership' is computed as
    '<prefix>_stock_value_held' * 1_000_000 / 'market_cap'.
    The file is read, extended and written back to the same path, and the
    resulting DataFrame is shown (via IPython's display when available,
    otherwise print).

    Parameters:
    - subset: "europe" selects the Europe output file; any other value selects
      the unfiltered one. Ignored when file_path is given.
    - file_path: Optional explicit path to the merged CSV. Defaults to the
      original hard-coded location for the chosen subset.
    """
    if file_path is None:
        file_path = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final_van.csv"
        if subset == "europe":
            file_path = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final_europe_van.csv"

    df = pd.read_csv(file_path, index_col=False)
    market_cap = df["market_cap"]

    # NOTE(review): the 1_000_000 factor presumably converts values held from
    # millions to the unit of market_cap -- confirm against the data source.
    # The ETF ratio previously read the non-prefixed 'stock_value_held' column,
    # which the merged file does not carry; all four now use the prefixed name.
    for prefix in ("ETF", "FUND", "INDEX_FUND", "ACTIVE_FUND"):
        df[f"{prefix}_ownership"] = (df[f"{prefix}_stock_value_held"] * 1_000_000) / market_cap

    df.to_csv(file_path, index=False)

    # `display` only exists inside IPython/Jupyter; fall back to print elsewhere.
    try:
        show = display
    except NameError:
        show = print
    show(df)

# Add the ownership columns to both final files: first the default
# (unfiltered) output, then the Europe output.
calculate_ownership_percentage(subset = "none")
calculate_ownership_percentage(subset = "europe")