## Merge all csv files

In [5]:
import pandas as pd
import os

def merge_csv_files(directory_path, output_file):
    # List to hold data from each CSV file
    dataframes = []

    # Loop through all files in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith('.csv'):
            # Construct full file path
            file_path = os.path.join(directory_path, filename)
            # Read the CSV file and append to the list
            print(file_path)
            df = pd.read_csv(file_path)
            dataframes.append(df)

    # Concatenate all dataframes in the list
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Write the merged dataframe to a new CSV file
    merged_df.to_csv(output_file, index=False)
    print(f"All CSV files have been merged into {output_file}")

# Usage
raw_data_path = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data'
merged_data_path = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/merged_data.csv'
merge_csv_files(raw_data_path, merged_data_path)

/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2010_01_2013_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2014_01_2015_11.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/test.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2020_01_2020_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2021_01_2022_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2019_01_2019_12.csv
/Users/jonathan

# Separate ETFs and traditional Funds

## filter_function

In [1]:
import dask.dataframe as dd
import pandas as pd

def filter_function(input_file, output_file, type = "etf"):
    ddf = dd.read_csv(input_file)

    # Drop rows with NaN in 'fund_type' or 'fund_name'
    ddf = ddf.dropna(subset=["stock_RIC", 'fund_type', 'fund_name', "percent_of_traded_shares"])
    
    if type == "etf":
        filtered_ddf = ddf[(ddf['fund_type'] == 'Exchange-Traded Fund')] #|
                        #ddf['fund_name'].str.contains('Vanguard')]
    elif type == "mutual fund":
        filtered_ddf = ddf[(ddf['fund_type'] != 'Exchange-Traded Fund')] #|
                        # ddf['fund_name'].str.contains('Vanguard') == False]
    
    # type = "index fund" or type = "active fund" needs mutual fund data as input_file
    elif type == "index fund":
        filtered_ddf = ddf[(ddf['fund_investment_type'] == 'Index')]
    
    elif type == "active fund":
        filtered_ddf = ddf[(ddf['fund_investment_type'] != 'Index')]

    
    filtered_ddf = filtered_ddf.persist()
    filtered_df = filtered_ddf.compute()

    if type == "etf" or type == "mutual fund":
        filtered_df['date'] = pd.to_datetime(filtered_df['date'])

        # Subtract one month end to change the date to the last day of the previous month
        filtered_df['date'] = filtered_df['date'] - pd.offsets.MonthEnd(1)

    # Identify and drop unnamed columns
    df_clean = filtered_df.loc[:, ~filtered_df.columns.str.contains('Unnamed')]
    df_clean = df_clean.drop_duplicates(subset=['stock_RIC', 'fund_name', 'date', 'percent_of_traded_shares'], keep='first')

    # Compute and write the result to a new CSV file
    df_clean.to_csv(output_file)

## ETFs

In [19]:
import dask.dataframe as dd
import pandas as pd
    
########## ETFs
input_file_test = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2014_01_2015_11.csv"
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/merged_data.csv'
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/etf_data.csv'
filter_function(input_file, output_file, type = "etf")

## Mutual Funds

In [3]:
import dask.dataframe as dd
import pandas as pd

########## traditional funds
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/merged_data.csv'
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_data.csv'
filter_function(input_file, output_file, type = "mutual fund")

## Index Funds

In [2]:
import dask.dataframe as dd
import pandas as pd

########## traditional funds
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_data.csv'
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/index_fund_data.csv'
filter_function(input_file, output_file, type = "index fund")

## Active Funds

In [3]:
import dask.dataframe as dd
import pandas as pd

########## traditional funds
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_data.csv'
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/active_fund_data.csv'
filter_function(input_file, output_file, type = "active fund")

# Aggregation of Ownership

## aggregation_function

In [4]:
import pandas as pd
def aggregation_function(input_file, output_file, type = "etf"):
    df = pd.read_csv(input_file, index_col=False)
    
    grouped_df = df.groupby(['stock_RIC', 'date'])[['stock_value_held', "percent_of_traded_shares"]].sum().reset_index()

    if type == "mutual fund":
        grouped_df.rename(columns={'stock_value_held': 'FUND_stock_value_held'},  inplace=True)
        grouped_df.rename(columns={'percent_of_traded_shares': 'FUND_percent_of_traded_shares'}, inplace=True)
    
    elif type == "index fund":
        grouped_df.rename(columns={'stock_value_held': 'INDEX_FUND_stock_value_held'},  inplace=True)
        grouped_df.rename(columns={'percent_of_traded_shares': 'INDEX_FUND_percent_of_traded_shares'}, inplace=True)
    
    elif type == "active fund":
        grouped_df.rename(columns={'stock_value_held': 'ACTIVE_FUND_stock_value_held'},  inplace=True)
        grouped_df.rename(columns={'percent_of_traded_shares': 'ACTIVE_FUND_percent_of_traded_shares'}, inplace=True)

    grouped_df.to_csv(output_file, index=False)

## ETF ownership

In [2]:
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/etf_data.csv'
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_data/etf_aggregated_data.csv'
aggregation_function(input_file, output_file, type = "etf")

## Mutual Fund ownership

In [3]:
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_data.csv'
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_data/fund_aggregated_data.csv'
aggregation_function(input_file, output_file, type = "mutual fund")

## Index Fund ownership

In [5]:
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/index_fund_data.csv'
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_data/index_fund_aggregated_data.csv'
aggregation_function(input_file, output_file, type = "index fund")

## Active Fund ownership

In [6]:
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/active_fund_data.csv'
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_data/active_fund_aggregated_data.csv'
aggregation_function(input_file, output_file, type = "active fund")

# Merge aggregated ETF and Fund ownership

In [8]:
import pandas as pd

def merge_csv_files(file1, file2, output_file, column_added):
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    key_columns = ['date', 'stock_RIC']

    df1['date'] = pd.to_datetime(df1['date'])
    df2['date'] = pd.to_datetime(df2['date'])

    cols_to_keep = key_columns + column_added
    df2 = df2[cols_to_keep]

    merged_df = pd.merge(df1, df2, on=key_columns, how='left')

    # Fill NA values in 'stock_value_held' with the desired placeholder if no match was found
    merged_df[column_added].fillna('NA', inplace=True)

    # Write the merged DataFrame to a new CSV file
    merged_df.to_csv(output_file, index=False)
    print(f"Merged data has been saved to {output_file}")


formatted_index_member = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/index_constituents_data/formated_constituents_stoxx_europe_600.csv'
output_file = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv"

file_etf = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_data/etf_aggregated_data.csv'
file_mutual_fund = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_data/fund_aggregated_data.csv"
file_index_fund = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_data/index_fund_aggregated_data.csv"
file_active_fund = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_data/active_fund_aggregated_data.csv"

file_m_stock = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/stock_level_data/m_stock_level_data.csv'

##### merge etf ownership with index memeber
merge_csv_files(formatted_index_member, file_etf, output_file, ['stock_value_held', "percent_of_traded_shares"])

##### merge fund ownership with the output file from the last function
merge_csv_files(output_file, file_mutual_fund, output_file, ["FUND_stock_value_held", "FUND_percent_of_traded_shares"])
merge_csv_files(output_file, file_index_fund, output_file, ["INDEX_FUND_stock_value_held", "INDEX_FUND_percent_of_traded_shares"])
merge_csv_files(output_file, file_active_fund, output_file, ["ACTIVE_FUND_stock_value_held", "ACTIVE_FUND_percent_of_traded_shares"])

#### add stock level data
merge_csv_files(output_file, file_m_stock, output_file, ['market_cap'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[column_added].fillna('NA', inplace=True)


Merged data has been saved to /Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[column_added].fillna('NA', inplace=True)


Merged data has been saved to /Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[column_added].fillna('NA', inplace=True)


Merged data has been saved to /Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[column_added].fillna('NA', inplace=True)


Merged data has been saved to /Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[column_added].fillna('NA', inplace=True)


Merged data has been saved to /Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv


## calculate ownership percentage based on market capitalization

In [9]:
import pandas as pd

def calculate_ownership_percentage(file_path):
    df = pd.read_csv(output_file, index_col=False)
    df['ETF_ownership'] = (df["stock_value_held"] * 1_000_000)/ df["market_cap"]
    df['FUND_ownership'] = (df["FUND_stock_value_held"] * 1_000_000)/ df["market_cap"]
    df['INDEX_FUND_ownership'] = (df["INDEX_FUND_stock_value_held"] * 1_000_000)/ df["market_cap"]
    df['ACTIVE_FUND_ownership'] = (df["ACTIVE_FUND_stock_value_held"] * 1_000_000)/ df["market_cap"]
    df.to_csv(output_file, index=False)
    display(df)


file_path = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv"
calculate_ownership_percentage(file_path)




Unnamed: 0,date,stock_RIC,index_member,stock_value_held,percent_of_traded_shares,FUND_stock_value_held,FUND_percent_of_traded_shares,INDEX_FUND_stock_value_held,INDEX_FUND_percent_of_traded_shares,ACTIVE_FUND_stock_value_held,ACTIVE_FUND_percent_of_traded_shares,market_cap,ETF_ownership,FUND_ownership,INDEX_FUND_ownership,ACTIVE_FUND_ownership
0,2009-12-31,0MW4EUR.xbo^K15,0,10.386378,0.5011,119.213083,5.6151,11.926645,0.6070,107.286438,5.0081,,,,,
1,2010-01-31,0MW4EUR.xbo^K15,0,10.497117,0.5016,112.354494,5.2454,8.299856,0.4206,104.054639,4.8248,,,,,
2,2010-02-28,0MW4EUR.xbo^K15,0,9.157936,0.4714,136.100369,6.4882,7.903669,0.4086,128.196700,6.0796,,,,,
3,2010-03-31,0MW4EUR.xbo^K15,0,10.744003,0.4934,149.553185,6.7480,7.153383,0.3600,142.399802,6.3880,,,,,
4,2010-04-30,0MW4EUR.xbo^K15,0,9.726336,0.4578,139.564954,6.2827,14.385590,0.6743,125.179364,5.6084,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196219,2023-07-31,ZURN.S,1,4180.070271,6.3278,19064.195360,28.8973,5710.362177,8.6452,13353.833183,20.2521,6.606391e+10,0.063273,0.288572,0.086437,0.202135
196220,2023-08-31,ZURN.S,1,4139.611296,6.5174,18901.923118,29.6819,5687.507826,8.9590,13214.415292,20.7229,6.344198e+10,0.065250,0.297940,0.089649,0.208291
196221,2023-09-30,ZURN.S,1,4109.144161,6.4619,18625.671378,29.2231,5646.716146,8.8776,12978.955232,20.3455,6.357877e+10,0.064631,0.292954,0.088814,0.204140
196222,2023-10-31,ZURN.S,1,4290.094268,6.5556,19011.235820,29.2812,5854.380449,8.9551,13156.855371,20.3261,6.545329e+10,0.065544,0.290455,0.089444,0.201011
