## Merge all csv files

In [2]:
import pandas as pd
import os

def merge_csv_files(directory_path, output_file):
    # List to hold data from each CSV file
    dataframes = []

    # Loop through all files in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith('.csv'):
            # Construct full file path
            file_path = os.path.join(directory_path, filename)
            # Read the CSV file and append to the list
            print(file_path)
            df = pd.read_csv(file_path)
            dataframes.append(df)

    # Concatenate all dataframes in the list
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Write the merged dataframe to a new CSV file
    merged_df.to_csv(output_file, index=False)
    print(f"All CSV files have been merged into {output_file}")

# Usage
directory_path = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data'  # Update this path to the directory containing your CSV files
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/merged_holdings.csv'  # Update this path to where you want the merged CSV to be saved
merge_csv_files(directory_path, output_file)


/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2010_01_2013_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2014_01_2015_11.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2020_01_2020_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2021_01_2022_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2019_01_2019_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_

## ETF ownership

In [4]:
import dask.dataframe as dd
import pandas as pd

def filter_and_export_csv_dask(input_file, output_file):
    # Read the file using Dask
    ddf = dd.read_csv(input_file)

    # Drop rows with NaN in 'fund_type' or 'fund_name'
    ddf = ddf.dropna(subset=['fund_type', 'fund_name'])
    
    # Apply filtering conditions
    filtered_ddf = ddf[(ddf['fund_type'] == 'Exchange-Traded Fund') |
                       ddf['fund_name'].str.contains('Vanguard')]


    # Compute and write the result to a new CSV file
    filtered_ddf.compute().to_csv(output_file, index=False)
    print(f"Filtered data has been saved to {output_file}")

# Usage
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/merged_holdings.csv'  # Update this path to your actual CSV file path
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/etf_merged_holdings.csv'  # Update this path to where you want the filtered data saved
filter_and_export_csv_dask(input_file, output_file)

filtered_ddf = pd.read_csv(output_file)

# adjust date
filtered_ddf['date'] = pd.to_datetime(filtered_ddf['date'])

# Subtract one month end to change the date to the last day of the previous month
filtered_ddf['date'] = filtered_ddf['date'] - pd.offsets.MonthEnd(1)

filtered_ddf.to_csv(output_file)

ValueError: Length of values (16351213) does not match length of index (2)

### Sum up etf ownership of each stock on a specific date

In [4]:

df = pd.read_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/etf_merged_holdings.csv")

# Group by 'stock_RIC' and 'date', and sum the 'stock_value_held'
grouped_df = df.groupby(['stock_RIC', 'date'])[['stock_value_held', "percent_of_traded_shares"]].sum().reset_index()

# Export to new CSV
file_path = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_etf_holdings_600_.csv"
grouped_df.to_csv(file_path, index=False)

## Fund ownership

In [5]:
import dask.dataframe as dd

def filter_and_export_csv_dask(input_file, output_file):
    # Read the file using Dask
    ddf = dd.read_csv(input_file)

    # Drop rows with NaN in 'fund_type' or 'fund_name'
    ddf = ddf.dropna(subset=['fund_type', 'fund_name'])
    
    # Apply filtering conditions
    filtered_ddf = ddf[(ddf['fund_type'] != 'Exchange-Traded Fund') |
                        ddf['fund_name'].str.contains('Vanguard') == False]
    
    # Compute and write the result to a new CSV file
    filtered_ddf.compute().to_csv(output_file, index=False)
    print(f"Filtered data has been saved to {output_file}")

# Usage
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/merged_holdings.csv'  # Update this path to your actual CSV file path
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_ex_merged_holdings.csv'  # Update this path to where you want the filtered data saved
filter_and_export_csv_dask(input_file, output_file)

output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_ex_merged_holdings.csv'

filtered_ddf = pd.read_csv(output_file)

# adjust date
filtered_ddf['date'] = pd.to_datetime(filtered_ddf['date'])

# Subtract one month end to change the date to the last day of the previous month
filtered_ddf['date'] = filtered_ddf['date'] - pd.offsets.MonthEnd(1)

filtered_ddf.to_csv(output_file)

Filtered data has been saved to /Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_ex_merged_holdings.csv


### Sum up fund ex etf ownership of each stock on a specific date

In [6]:
import dask.dataframe as dd
import pandas as pd
df = pd.read_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_ex_merged_holdings.csv")

# Group by 'stock_RIC' and 'date', and sum the 'stock_value_held'
grouped_df = df.groupby(['stock_RIC', 'date'])[['stock_value_held', "percent_of_traded_shares"]].sum().reset_index()

grouped_df.rename(columns={'stock_value_held': 'FUND_stock_value_held'},  inplace=True)
grouped_df.rename(columns={'percent_of_traded_shares': 'FUND_percent_of_traded_shares'}, inplace=True)

# Export to new CSV
file_path = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_fund_ex_holdings_600_.csv"
grouped_df.to_csv(file_path, index=False)

In [7]:
display(grouped_df)

Unnamed: 0,stock_RIC,date,FUND_stock_value_held
0,0MW4EUR.xbo^K15,2010-01-01,9.666637
1,0MW4EUR.xbo^K15,2010-02-01,9.598608
2,0MW4EUR.xbo^K15,2010-03-01,8.259427
3,0MW4EUR.xbo^K15,2010-04-01,9.845493
4,0MW4EUR.xbo^K15,2010-05-01,9.688948
...,...,...,...
154456,ZURN.S,2023-08-01,3874.537377
154457,ZURN.S,2023-09-01,3834.866671
154458,ZURN.S,2023-10-01,3802.279416
154459,ZURN.S,2023-11-01,3973.467134


# merge etf and fund ownership with index membership: formatted_final

In [7]:
import pandas as pd

def merge_csv_files(file1, file2, output_file, column_added):
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    key_columns = ['date', 'stock_RIC']

    df1['date'] = pd.to_datetime(df1['date'])
    df2['date'] = pd.to_datetime(df2['date'])

    cols_to_keep = key_columns + column_added
    df2 = df2[cols_to_keep]

    merged_df = pd.merge(df1, df2, on=key_columns, how='left')

    # Fill NA values in 'stock_value_held' with the desired placeholder if no match was found
    merged_df[column_added].fillna('NA', inplace=True)

    # Write the merged DataFrame to a new CSV file
    merged_df.to_csv(output_file, index=False)
    print(f"Merged data has been saved to {output_file}")


formatted_index_member = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/index_constituents_data/formated_constituents_stoxx_europe_600.csv'
output_file = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv"

file_etf_ownership = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_etf_holdings_600_.csv'
file_fund_ownership = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_fund_ex_holdings_600_.csv"
m_stock_path = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/stock_level_data/m_stock_level_data.csv'

merge_csv_files(formatted_index_member, file_etf_ownership, output_file, ['stock_value_held', "percent_of_traded_shares"])
merge_csv_files(output_file, file_fund_ownership, output_file, ["FUND_stock_value_held", "FUND_percent_of_traded_shares"])
merge_csv_files(output_file, m_stock_path, output_file, ['market_cap'])


TypeError: unhashable type: 'list'

## ETF ownership and Fund owership in percentage

In [17]:
import pandas as pd
output_file = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv"
df = pd.read_csv(output_file)
#############

df['ETF_ownership'] = (df["stock_value_held"] * 1_000_000)/ df["market_cap"]
df['FUND_ownership'] = (df["FUND_stock_value_held"] * 1_000_000)/ df["market_cap"]
type(df)
display(df)

df.to_csv(output_file, index=False)

Unnamed: 0,date,stock_RIC,index_member,stock_value_held,FUND_stock_value_held,market_cap,ETF_ownership,FUND_ownership
0,2009-12-31,0MW4EUR.xbo^K15,0,14.731695,9.666637,,,
1,2010-01-31,0MW4EUR.xbo^K15,0,11.226477,9.598608,,,
2,2010-02-28,0MW4EUR.xbo^K15,0,9.887296,8.259427,,,
3,2010-03-31,0MW4EUR.xbo^K15,0,11.585353,9.845493,,,
4,2010-04-30,0MW4EUR.xbo^K15,0,15.047999,9.688948,,,
...,...,...,...,...,...,...,...,...
196219,2023-07-31,ZURN.S,1,5870.052977,3874.537377,6.606391e+10,0.088854,0.058648
196220,2023-08-31,ZURN.S,1,5816.278767,3834.866671,6.344198e+10,0.091679,0.060447
196221,2023-09-30,ZURN.S,1,5789.045875,3802.279416,6.357877e+10,0.091053,0.059804
196222,2023-10-31,ZURN.S,1,6022.326667,3973.467134,6.545329e+10,0.092010,0.060707


### ANOMALIES: remove duplicates from 2015-03-01 to 2015-11-01

In [10]:
import pandas as pd

path = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2014_01_2015_11.csv"

df = pd.read_csv(path)

## reomve duplicate rows
df = df.drop_duplicates()

df.to_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/test.csv")