## Merge all csv files

In [2]:
import pandas as pd
import os

def merge_csv_files(directory_path, output_file):
    # List to hold data from each CSV file
    dataframes = []

    # Loop through all files in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith('.csv'):
            # Construct full file path
            file_path = os.path.join(directory_path, filename)
            # Read the CSV file and append to the list
            print(file_path)
            df = pd.read_csv(file_path)
            dataframes.append(df)

    # Concatenate all dataframes in the list
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Write the merged dataframe to a new CSV file
    merged_df.to_csv(output_file, index=False)
    print(f"All CSV files have been merged into {output_file}")

# Usage
directory_path = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data'  # Update this path to the directory containing your CSV files
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/merged_holdings.csv'  # Update this path to where you want the merged CSV to be saved
merge_csv_files(directory_path, output_file)


/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2010_01_2013_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2014_01_2015_11.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2020_01_2020_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2021_01_2022_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2019_01_2019_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_

## ETF ownership

In [26]:
import dask.dataframe as dd
import pandas as pd

def filter_and_export_csv_dask(input_file, output_file):
    # Read the file using Dask
    ddf = dd.read_csv(input_file)

    # Drop rows with NaN in 'fund_type' or 'fund_name'
    ddf = ddf.dropna(subset=['fund_type', 'fund_name'])
    
    # Apply filtering conditions
    filtered_ddf = ddf[(ddf['fund_type'] == 'Exchange-Traded Fund') |
                       ddf['fund_name'].str.contains('Vanguard')]


    # Compute and write the result to a new CSV file
    filtered_ddf.compute().to_csv(output_file, index=False)
    print(f"Filtered data has been saved to {output_file}")

# Usage
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/merged_holdings.csv'  # Update this path to your actual CSV file path
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/etf_merged_holdings.csv'  # Update this path to where you want the filtered data saved
filter_and_export_csv_dask(input_file, output_file)

filtered_ddf = pd.read_csv(output_file)

# adjust date
filtered_ddf['date'] = pd.to_datetime(filtered_ddf['date'])

# Subtract one month end to change the date to the last day of the previous month
filtered_ddf['date'] = filtered_ddf['date'] - pd.offsets.MonthEnd(1)

# Identify and drop unnamed columns
unnamed_cols = [col for col in filtered_ddf.columns if 'Unnamed' in col]
filtered_ddf = filtered_ddf.drop(columns=unnamed_cols)

# Drop rows with NAs in 'stock_RIC', 'date', or 'percent_of_traded_shares' columns
df_clean = filtered_ddf.dropna(subset=['stock_RIC', "fund_name", 'date', 'percent_of_traded_shares'])

# Drop duplicates based on 'stock_RIC', 'date', 'fund_name', and 'percent_of_traded_shares' columns
df_clean = df_clean.drop_duplicates(subset=['stock_RIC', 'fund_name', 'date', 'percent_of_traded_shares'], keep='first')
df_clean.to_csv(path, index=False)

df_clean.to_csv(output_file)

Filtered data has been saved to /Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/etf_merged_holdings.csv


KeyError: "['Unnamed: 0'] not found in axis"

### Sum up etf ownership of each stock on a specific date

In [4]:

df = pd.read_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/etf_merged_holdings.csv")

# Group by 'stock_RIC' and 'date', and sum the 'stock_value_held'
grouped_df = df.groupby(['stock_RIC', 'date'])[['stock_value_held', "percent_of_traded_shares"]].sum().reset_index()

# Export to new CSV
file_path = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_etf_holdings_600_.csv"
grouped_df.to_csv(file_path, index=False)

## Fund ownership

In [5]:
import dask.dataframe as dd

def filter_and_export_csv_dask(input_file, output_file):
    # Read the file using Dask
    ddf = dd.read_csv(input_file)

    # Drop rows with NaN in 'fund_type' or 'fund_name'
    ddf = ddf.dropna(subset=['fund_type', 'fund_name'])
    
    # Apply filtering conditions
    filtered_ddf = ddf[(ddf['fund_type'] != 'Exchange-Traded Fund') |
                        ddf['fund_name'].str.contains('Vanguard') == False]
    
    # Compute and write the result to a new CSV file
    filtered_ddf.compute().to_csv(output_file, index=False)
    print(f"Filtered data has been saved to {output_file}")

# Usage
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/merged_holdings.csv'  # Update this path to your actual CSV file path
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_ex_merged_holdings.csv'  # Update this path to where you want the filtered data saved
filter_and_export_csv_dask(input_file, output_file)

output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_ex_merged_holdings.csv'

filtered_ddf = pd.read_csv(output_file)

# adjust date
filtered_ddf['date'] = pd.to_datetime(filtered_ddf['date'])

# Subtract one month end to change the date to the last day of the previous month
filtered_ddf['date'] = filtered_ddf['date'] - pd.offsets.MonthEnd(1)

#####################################
unnamed_cols = [col for col in filtered_ddf.columns if 'Unnamed' in col]
filtered_ddf = df.drop(columns=unnamed_cols)

# Drop rows with NAs in 'stock_RIC', 'date', or 'percent_of_traded_shares' columns
df_clean = filtered_ddf.dropna(subset=['stock_RIC', "fund_name", 'date', 'percent_of_traded_shares'])

# Drop duplicates based on 'stock_RIC', 'date', 'fund_name', and 'percent_of_traded_shares' columns
df_clean = df_clean.drop_duplicates(subset=['stock_RIC', 'fund_name', 'date', 'percent_of_traded_shares'], keep='first')
df_clean.to_csv(path, index=False)

df_clean.to_csv(output_file)

filtered_ddf.to_csv(output_file)

Filtered data has been saved to /Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_ex_merged_holdings.csv


### Sum up fund ex etf ownership of each stock on a specific date

In [6]:
import dask.dataframe as dd
import pandas as pd
df = pd.read_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_ex_merged_holdings.csv")

# Group by 'stock_RIC' and 'date', and sum the 'stock_value_held'
grouped_df = df.groupby(['stock_RIC', 'date'])[['stock_value_held', "percent_of_traded_shares"]].sum().reset_index()

grouped_df.rename(columns={'stock_value_held': 'FUND_stock_value_held'},  inplace=True)
grouped_df.rename(columns={'percent_of_traded_shares': 'FUND_percent_of_traded_shares'}, inplace=True)

# Export to new CSV
file_path = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_fund_ex_holdings_600_.csv"
grouped_df.to_csv(file_path, index=False)

In [7]:
display(grouped_df)

Unnamed: 0,stock_RIC,date,FUND_stock_value_held
0,0MW4EUR.xbo^K15,2010-01-01,9.666637
1,0MW4EUR.xbo^K15,2010-02-01,9.598608
2,0MW4EUR.xbo^K15,2010-03-01,8.259427
3,0MW4EUR.xbo^K15,2010-04-01,9.845493
4,0MW4EUR.xbo^K15,2010-05-01,9.688948
...,...,...,...
154456,ZURN.S,2023-08-01,3874.537377
154457,ZURN.S,2023-09-01,3834.866671
154458,ZURN.S,2023-10-01,3802.279416
154459,ZURN.S,2023-11-01,3973.467134


# merge etf and fund ownership with index membership: formatted_final

In [8]:
import pandas as pd

def merge_csv_files(file1, file2, output_file, column_added):
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    key_columns = ['date', 'stock_RIC']

    df1['date'] = pd.to_datetime(df1['date'])
    df2['date'] = pd.to_datetime(df2['date'])

    cols_to_keep = key_columns + column_added
    df2 = df2[cols_to_keep]

    merged_df = pd.merge(df1, df2, on=key_columns, how='left')

    # Fill NA values in 'stock_value_held' with the desired placeholder if no match was found
    merged_df[column_added].fillna('NA', inplace=True)

    # Write the merged DataFrame to a new CSV file
    merged_df.to_csv(output_file, index=False)
    print(f"Merged data has been saved to {output_file}")


formatted_index_member = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/index_constituents_data/formated_constituents_stoxx_europe_600.csv'
output_file = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv"

file_etf_ownership = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_etf_holdings_600_.csv'
file_fund_ownership = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_fund_ex_holdings_600_.csv"
m_stock_path = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/stock_level_data/m_stock_level_data.csv'

merge_csv_files(formatted_index_member, file_etf_ownership, output_file, ['stock_value_held', "percent_of_traded_shares"])
merge_csv_files(output_file, file_fund_ownership, output_file, ["FUND_stock_value_held", "FUND_percent_of_traded_shares"])
merge_csv_files(output_file, m_stock_path, output_file, ['market_cap'])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[column_added].fillna('NA', inplace=True)


Merged data has been saved to /Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[column_added].fillna('NA', inplace=True)


Merged data has been saved to /Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[column_added].fillna('NA', inplace=True)


Merged data has been saved to /Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv


## ETF ownership and Fund owership in percentage

In [9]:
import pandas as pd
output_file = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv"
df = pd.read_csv(output_file)
#############

df['ETF_ownership'] = (df["stock_value_held"] * 1_000_000)/ df["market_cap"]
df['FUND_ownership'] = (df["FUND_stock_value_held"] * 1_000_000)/ df["market_cap"]
type(df)
display(df)

df.to_csv(output_file, index=False)

Unnamed: 0,date,stock_RIC,index_member,stock_value_held,percent_of_traded_shares,FUND_stock_value_held,FUND_percent_of_traded_shares,market_cap,ETF_ownership,FUND_ownership
0,2009-12-31,0MW4EUR.xbo^K15,0,14.731695,0.7147,9.666637,0.4652,,,
1,2010-01-31,0MW4EUR.xbo^K15,0,11.226477,0.5347,9.598608,0.4610,,,
2,2010-02-28,0MW4EUR.xbo^K15,0,9.887296,0.5045,8.259427,0.4308,,,
3,2010-03-31,0MW4EUR.xbo^K15,0,11.585353,0.5287,9.845493,0.4528,,,
4,2010-04-30,0MW4EUR.xbo^K15,0,15.047999,0.6899,9.688948,0.4562,,,
...,...,...,...,...,...,...,...,...,...,...
196219,2023-07-31,ZURN.S,1,5870.052977,8.8860,3874.537377,5.8652,6.606391e+10,0.088854,0.058648
196220,2023-08-31,ZURN.S,1,5816.278767,9.1603,3834.866671,6.0371,6.344198e+10,0.091679,0.060447
196221,2023-09-30,ZURN.S,1,5789.045875,9.1040,3802.279416,5.9793,6.357877e+10,0.091053,0.059804
196222,2023-10-31,ZURN.S,1,6022.326667,9.2026,3973.467134,6.0717,6.545329e+10,0.092010,0.060707


### ANOMALIES: remove duplicates from 2015-03-01 to 2015-11-01

In [10]:
import pandas as pd

path = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2014_01_2015_11.csv"

df = pd.read_csv(path)

## reomve duplicate rows
df = df.drop_duplicates()

df.to_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/test.csv")

### check for duplicates

In [25]:
import pandas as pd
path = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/etf_merged_holdings.csv"
df = pd.read_csv(path, index_col = False)

# Identify and drop unnamed columns
unnamed_cols = [col for col in df.columns if 'Unnamed' in col]
df = df.drop(columns=unnamed_cols)

# Drop rows with NAs in 'stock_RIC', 'date', or 'percent_of_traded_shares' columns
df_clean = df.dropna(subset=['stock_RIC', "fund_name", 'date', 'percent_of_traded_shares'])

# Check for duplicates based on 'stock_RIC', 'date', and 'percent_of_traded_shares' columns
column_duplicates = df_clean.duplicated(subset=['stock_RIC',"fund_name", 'date', 'percent_of_traded_shares'], keep=False)

# Filter the DataFrame to show only duplicated rows
duplicates_df = df_clean[column_duplicates]

# Sort the duplicated rows alphabetically by 'fund_name'
sorted_duplicates_df = duplicates_df.sort_values(by=["date", 'fund_name'])

# Print out the sorted duplicated rows
display(sorted_duplicates_df)

Unnamed: 0,stock_RIC,fund_type_parent,fund_type,fund_investment_type,fund_name,market_cap_fund,stock_value_held,percent_of_traded_shares,percent_of_fund_holdings,country,filing_date,date
1582087,AAL.L,Funds,Exchange-Traded Fund,Index,1nvest SWIX 40 ETF,79.707761,10.195470,0.0365,,South Africa,2013-03-31T00:00:00Z,2013-04-30
1587348,AAL.L,Funds,Exchange-Traded Fund,Index,1nvest SWIX 40 ETF,79.707761,10.195470,0.0365,,South Africa,2013-03-31T00:00:00Z,2013-04-30
1582114,AAL.L,Funds,Exchange-Traded Fund,Index,1nvest TOP 40 ETF,31.215607,2.995011,0.0092,,South Africa,2012-12-31T00:00:00Z,2013-04-30
1587375,AAL.L,Funds,Exchange-Traded Fund,Index,1nvest TOP 40 ETF,31.215607,2.995011,0.0092,,South Africa,2012-12-31T00:00:00Z,2013-04-30
1582109,AAL.L,Funds,Exchange-Traded Fund,Index,AMUNDI ETF MSCI UK UCITS ETF Fund,10.542743,2.813120,0.0109,,France,2013-04-30T00:00:00Z,2013-04-30
...,...,...,...,...,...,...,...,...,...,...,...,...
3516489,CNHI.MI^A24,Funds,Exchange-Traded Fund,Index,iShares US Dividend Growers Index ETF (CAD-Hed...,231.157250,0.000000,0.0000,,United States,2014-02-28T00:00:00Z,2015-10-31
15232414,CNHI.K,Funds,Exchange-Traded Fund,Index,iShares US Dividend Growers Index ETF (CAD-Hed...,232.584669,0.000000,0.0000,,United States,2014-02-28T00:00:00Z,2015-10-31
15232559,CNHI.MI^A24,Funds,Exchange-Traded Fund,Index,iShares US Dividend Growers Index ETF (CAD-Hed...,232.584669,0.000000,0.0000,,United States,2014-02-28T00:00:00Z,2015-10-31
3528089,FRO.OL,Funds,Exchange-Traded Fund,Index,iShares US Fundamental Index ETF,134.752383,0.000000,0.0000,,United States,2015-05-31T00:00:00Z,2015-10-31


In [24]:
len(sorted_duplicates_df)

filtered_df = df[(df['stock_RIC'] == 'JD.L') & (df['date'] == '2015-10-31')]

display(filtered_df)

Unnamed: 0,stock_RIC,fund_type_parent,fund_type,fund_investment_type,fund_name,market_cap_fund,stock_value_held,percent_of_traded_shares,percent_of_fund_holdings,country,filing_date,date
3538790,JD.L,Funds,Exchange-Traded Fund,Index,First Trust Europe AlphaDEX Fund,192.599688,3.000414,0.1139,,United States,2015-10-31T00:00:00Z,2015-10-31
3538791,JD.L,Funds,Exchange-Traded Fund,Index,iShares FTSE 250 UCITS ETF,882.735056,2.582939,0.098,0.16,United Kingdom,2015-10-31T00:00:00Z,2015-10-31
3538792,JD.L,Funds,Exchange-Traded Fund,Index,Wisdomtree Europe SmallCap Dividend Fund,176.170377,1.826832,0.0693,0.2,United States,2015-10-31T00:00:00Z,2015-10-31
3538793,JD.L,Funds,Exchange-Traded Fund,Index,First Trust Developed Markets Ex-US AlphaDEX Fund,374.621202,0.753264,0.0286,,United States,2015-10-31T00:00:00Z,2015-10-31
3538794,JD.L,Funds,Exchange-Traded Fund,Index,Xtrackers FTSE 250 UCITS ETF,33.847947,0.315093,0.012,,Germany,2015-10-31T00:00:00Z,2015-10-31
3538795,JD.L,Funds,Exchange-Traded Fund,Index,Vanguard FTSE 250 UCITS ETF,2220.572383,0.06628,0.0043,,United States,2015-04-30T00:00:00Z,2015-10-31
3538796,JD.L,Funds,Exchange-Traded Fund,Index,HSBC FTSE 250 ETF Fund,51.98762,0.111492,0.0042,,United Kingdom,2015-10-31T00:00:00Z,2015-10-31
3538797,JD.L,Funds,Exchange-Traded Fund,Index,WisdomTree Europe Small Cap Dividend UCITS ETF...,53.196211,0.069031,0.0026,,Ireland,2015-10-31T00:00:00Z,2015-10-31
3538798,JD.L,Funds,Exchange-Traded Fund,Index,Invesco FTSE 250 UCITS ETF Fund,23.97261,0.064375,0.0024,,Germany,2015-10-31T00:00:00Z,2015-10-31
3538799,JD.L,Funds,Exchange-Traded Fund,Index,Invesco S&P International Developed Quality ETF,252.001154,0.042691,0.0016,,United States,2015-10-31T00:00:00Z,2015-10-31
