## Merge all csv files

In [5]:
import pandas as pd
import os

def merge_csv_files(directory_path, output_file):
    # List to hold data from each CSV file
    dataframes = []

    # Loop through all files in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith('.csv'):
            # Construct full file path
            file_path = os.path.join(directory_path, filename)
            # Read the CSV file and append to the list
            print(file_path)
            df = pd.read_csv(file_path)
            dataframes.append(df)

    # Concatenate all dataframes in the list
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Write the merged dataframe to a new CSV file
    merged_df.to_csv(output_file, index=False)
    print(f"All CSV files have been merged into {output_file}")

# Usage
raw_data_path = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data'
merged_data_path = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/merged_data.csv'
merge_csv_files(raw_data_path, merged_data_path)

/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2010_01_2013_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2014_01_2015_11.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/test.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2020_01_2020_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2021_01_2022_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2019_01_2019_12.csv
/Users/jonathan

# Separate ETFs and traditional Funds

## filter_function

In [2]:
import dask.dataframe as dd
import pandas as pd

def filter_function(input_file, output_file, type = "etf"):
    ddf = dd.read_csv(input_file)

    # Drop rows with NaN in 'fund_type' or 'fund_name'
    ddf = ddf.dropna(subset=["stock_RIC", 'fund_type', 'fund_name', "percent_of_traded_shares"])
    
    if type == "etf":
        filtered_ddf = ddf[(ddf['fund_type'] == 'Exchange-Traded Fund')] #|
                        #ddf['fund_name'].str.contains('Vanguard')]
    elif type == "fund":
        filtered_ddf = ddf[(ddf['fund_type'] != 'Exchange-Traded Fund')] #|
                        # ddf['fund_name'].str.contains('Vanguard') == False]
    
    filtered_ddf = filtered_ddf.persist()

    filtered_df = filtered_ddf.compute()

    filtered_df['date'] = pd.to_datetime(filtered_df['date'])

    # Subtract one month end to change the date to the last day of the previous month
    filtered_df['date'] = filtered_df['date'] - pd.offsets.MonthEnd(1)

    # Identify and drop unnamed columns
    df_clean = filtered_df.loc[:, ~filtered_df.columns.str.contains('Unnamed')]

    df_clean = df_clean.drop_duplicates(subset=['stock_RIC', 'fund_name', 'date', 'percent_of_traded_shares'], keep='first')

    # Compute and write the result to a new CSV file
    df_clean.to_csv(output_file)

## ETFs

In [19]:
import dask.dataframe as dd
import pandas as pd
    
########## ETFs
input_file_test = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2014_01_2015_11.csv"
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/merged_data.csv'
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/etf_data.csv'
filter_function(input_file, output_file, type = "etf")

## Traditional Funds

In [3]:
import dask.dataframe as dd
import pandas as pd

########## traditional funds
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/merged_data.csv'
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_data.csv'
filter_function(input_file, output_file, type = "fund")

# Aggregation of Ownership

## aggregation_function

In [23]:
import pandas as pd
def aggregation_function(input_file, output_file, type = "etf"):
    df = pd.read_csv(input_file, index_col=False)
    
    grouped_df = df.groupby(['stock_RIC', 'date'])[['stock_value_held', "percent_of_traded_shares"]].sum().reset_index()

    if type == "fund":
        grouped_df.rename(columns={'stock_value_held': 'FUND_stock_value_held'},  inplace=True)
        grouped_df.rename(columns={'percent_of_traded_shares': 'FUND_percent_of_traded_shares'}, inplace=True)

    grouped_df.to_csv(output_file, index=False)

## ETF ownership

In [24]:
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/etf_data.csv'
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/etf_aggregated_data.csv'
aggregation_function(input_file, output_file, type = "etf")

## Fund ownership

In [25]:
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_data.csv'
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_aggregated_data.csv'
aggregation_function(input_file, output_file, type = "fund")

# Merge aggregated ETF and Fund ownership

In [27]:
import pandas as pd

def merge_csv_files(file1, file2, output_file, column_added):
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    key_columns = ['date', 'stock_RIC']

    df1['date'] = pd.to_datetime(df1['date'])
    df2['date'] = pd.to_datetime(df2['date'])

    cols_to_keep = key_columns + column_added
    df2 = df2[cols_to_keep]

    merged_df = pd.merge(df1, df2, on=key_columns, how='left')

    # Fill NA values in 'stock_value_held' with the desired placeholder if no match was found
    merged_df[column_added].fillna('NA', inplace=True)

    # Write the merged DataFrame to a new CSV file
    merged_df.to_csv(output_file, index=False)
    print(f"Merged data has been saved to {output_file}")


formatted_index_member = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/index_constituents_data/formated_constituents_stoxx_europe_600.csv'
output_file = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv"

file_etf_ownership = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/etf_aggregated_data.csv'
file_fund_ownership = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_aggregated_data.csv"
m_stock_path = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/stock_level_data/m_stock_level_data.csv'

##### merge etf ownership with index memeber
merge_csv_files(formatted_index_member, file_etf_ownership, output_file, ['stock_value_held', "percent_of_traded_shares"])

##### merge fund ownership with the output file from the last function
merge_csv_files(output_file, file_fund_ownership, output_file, ["FUND_stock_value_held", "FUND_percent_of_traded_shares"])

#### add stock level data
merge_csv_files(output_file, m_stock_path, output_file, ['market_cap'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[column_added].fillna('NA', inplace=True)


Merged data has been saved to /Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[column_added].fillna('NA', inplace=True)


Merged data has been saved to /Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[column_added].fillna('NA', inplace=True)


Merged data has been saved to /Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv


## calculate ownership percentage based on market capitalization

In [28]:
import pandas as pd

def calculate_ownership_percentage(file_path):
    df = pd.read_csv(output_file, index_col=False)
    df['ETF_ownership'] = (df["stock_value_held"] * 1_000_000)/ df["market_cap"]
    df['FUND_ownership'] = (df["FUND_stock_value_held"] * 1_000_000)/ df["market_cap"]
    df.to_csv(output_file, index=False)
    display(df)


file_path = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv"
calculate_ownership_percentage(file_path)




Unnamed: 0,date,stock_RIC,index_member,stock_value_held,percent_of_traded_shares,FUND_stock_value_held,FUND_percent_of_traded_shares,market_cap,ETF_ownership,FUND_ownership
0,2009-12-31,0MW4EUR.xbo^K15,0,14.731695,0.7147,9.666637,0.4652,,,
1,2010-01-31,0MW4EUR.xbo^K15,0,11.226477,0.5347,9.598608,0.4610,,,
2,2010-02-28,0MW4EUR.xbo^K15,0,9.887296,0.5045,8.259427,0.4308,,,
3,2010-03-31,0MW4EUR.xbo^K15,0,11.585353,0.5287,9.845493,0.4528,,,
4,2010-04-30,0MW4EUR.xbo^K15,0,15.047999,0.6899,9.688948,0.4562,,,
...,...,...,...,...,...,...,...,...,...,...
196219,2023-07-31,ZURN.S,1,5870.052977,8.8860,3874.537377,5.8652,6.606391e+10,0.088854,0.058648
196220,2023-08-31,ZURN.S,1,5816.278767,9.1603,3834.866671,6.0371,6.344198e+10,0.091679,0.060447
196221,2023-09-30,ZURN.S,1,5789.045875,9.1040,3802.279416,5.9793,6.357877e+10,0.091053,0.059804
196222,2023-10-31,ZURN.S,1,6022.326667,9.2026,3973.467134,6.0717,6.545329e+10,0.092010,0.060707
