## Merge all CSV files

In [1]:
import pandas as pd
import os

def merge_csv_files(directory_path, output_file):
    """Concatenate every ``.csv`` file in a directory into a single CSV file.

    Files are read in sorted filename order so the row order of the merged
    file is reproducible (``os.listdir`` returns names in arbitrary order).
    If ``output_file`` itself lives inside ``directory_path`` it is skipped,
    so re-running the cell does not merge a previous result into itself.

    Args:
        directory_path: Directory that contains the CSV files to merge.
        output_file: Path of the merged CSV file to write.

    Raises:
        ValueError: If the directory contains no CSV file to merge.
    """
    output_abspath = os.path.abspath(output_file)

    # Collect the input files in a deterministic order.
    csv_paths = []
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.csv'):
            continue
        file_path = os.path.join(directory_path, filename)
        # Never feed a previous merge result back into the merge.
        if os.path.abspath(file_path) == output_abspath:
            continue
        csv_paths.append(file_path)

    # pd.concat raises an opaque "No objects to concatenate" on an empty list;
    # fail with a message that names the directory instead.
    if not csv_paths:
        raise ValueError(f"No CSV files found in {directory_path}")

    # Read each CSV file, echoing its path as a simple progress log.
    dataframes = []
    for file_path in csv_paths:
        print(file_path)
        dataframes.append(pd.read_csv(file_path))

    # Concatenate all dataframes and renumber the rows from zero.
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Write the merged dataframe to a new CSV file.
    merged_df.to_csv(output_file, index=False)
    print(f"All CSV files have been merged into {output_file}")

# Usage: combine every raw holdings export into one merged file.
# Update this path to the directory containing your CSV files.
directory_path = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data'
# Update this path to where you want the merged CSV to be saved.
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/merged_holdings.csv'
merge_csv_files(directory_path=directory_path, output_file=output_file)


/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2010_01_2013_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2014_01_2015_11.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2020_01_2020_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2021_01_2022_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_2019_01_2019_12.csv
/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/raw_data/etf_holdings_600_stocks_

## ETF ownership

In [2]:
import dask.dataframe as dd

def filter_and_export_csv_dask(input_file, output_file):
    """Keep ETF and Vanguard holdings rows and save them to a CSV file.

    Rows with a missing ``fund_type`` or ``fund_name`` are dropped first.
    A row is then kept when its ``fund_type`` equals
    ``'Exchange-Traded Fund'`` or its ``fund_name`` contains ``'Vanguard'``.

    Args:
        input_file: Path of the CSV file to read with Dask.
        output_file: Path of the filtered CSV file to write.
    """
    # Read lazily with Dask, then discard rows lacking either filter column.
    holdings = dd.read_csv(input_file)
    holdings = holdings.dropna(subset=['fund_type', 'fund_name'])

    # Build the two selection criteria separately for readability.
    is_etf = holdings['fund_type'] == 'Exchange-Traded Fund'
    is_vanguard = holdings['fund_name'].str.contains('Vanguard')
    selected = holdings[is_etf | is_vanguard]

    # Materialise the result as a pandas DataFrame and write a single CSV.
    selected.compute().to_csv(output_file, index=False)
    print(f"Filtered data has been saved to {output_file}")

# Usage: extract the ETF / Vanguard rows from the merged holdings file.
# Update this path to your actual CSV file path.
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/merged_holdings.csv'
# Update this path to where you want the filtered data saved.
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/etf_merged_holdings.csv'
filter_and_export_csv_dask(input_file=input_file, output_file=output_file)


  df = reader(bio, **kwargs)


Filtered data has been saved to /Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/etf_merged_holdings.csv


## Sum up ETF ownership of each stock on a specific date

In [3]:

# Load the filtered ETF holdings written by the previous step.
df = pd.read_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/etf_merged_holdings.csv")

# Total 'stock_value_held' per ('stock_RIC', 'date') pair, with the keys
# turned back into ordinary columns.
group_keys = ['stock_RIC', 'date']
grouped_df = (
    df.groupby(group_keys)['stock_value_held']
    .sum()
    .reset_index()
)

# Save the aggregated table as a new CSV file.
file_path = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_etf_holdings_600_.csv"
grouped_df.to_csv(file_path, index=False)

## Fund ownership

In [4]:
import dask.dataframe as dd

def filter_and_export_csv_dask(input_file, output_file):
    """Keep holdings rows that are neither ETFs nor Vanguard funds and save them.

    Rows with a missing ``fund_type`` or ``fund_name`` are dropped first.
    A row is then kept only when its ``fund_type`` is not
    ``'Exchange-Traded Fund'`` and its ``fund_name`` does not contain
    ``'Vanguard'``, i.e. the exact complement of the ETF filter
    (``fund_type`` is ETF or ``fund_name`` contains 'Vanguard').

    Note: this redefines ``filter_and_export_csv_dask`` and shadows the
    ETF-selecting version defined earlier in the notebook.

    Args:
        input_file: Path of the CSV file to read with Dask.
        output_file: Path of the filtered CSV file to write.
    """
    # Read the file using Dask
    ddf = dd.read_csv(input_file)

    # Drop rows with NaN in 'fund_type' or 'fund_name'
    ddf = ddf.dropna(subset=['fund_type', 'fund_name'])

    # The previous condition was written as `A | B == False`. Because `|`
    # binds tighter than `==`, it evaluated as `(A | B) == False` and kept
    # only non-Vanguard ETFs. Negate the ETF selection explicitly instead.
    is_etf = ddf['fund_type'] == 'Exchange-Traded Fund'
    is_vanguard = ddf['fund_name'].str.contains('Vanguard')
    filtered_ddf = ddf[~(is_etf | is_vanguard)]

    # Compute and write the result to a new CSV file
    filtered_ddf.compute().to_csv(output_file, index=False)
    print(f"Filtered data has been saved to {output_file}")

# Usage: write the fund (ex ETF) subset of the merged holdings file.
# Update this path to your actual CSV file path.
input_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/merged_holdings.csv'
# Update this path to where you want the filtered data saved.
output_file = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_ex_merged_holdings.csv'
filter_and_export_csv_dask(input_file=input_file, output_file=output_file)

  df = reader(bio, **kwargs)


Filtered data has been saved to /Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_ex_merged_holdings.csv


## Sum up fund (ex-ETF) ownership of each stock on a specific date

In [5]:
import dask.dataframe as dd

# Load the fund (ex ETF) holdings written by the previous step.
df = pd.read_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/fund_ex_merged_holdings.csv")

# Total 'stock_value_held' per ('stock_RIC', 'date') pair, with the keys
# turned back into ordinary columns.
group_keys = ['stock_RIC', 'date']
grouped_df = (
    df.groupby(group_keys)['stock_value_held']
    .sum()
    .reset_index()
)

# Save the aggregated table as a new CSV file.
file_path = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_fund_ex_holdings_600_.csv"
grouped_df.to_csv(file_path, index=False)

## Merge ETF and fund ownership with index membership

In [7]:
import pandas as pd

def merge_csv_files(file1, file2, output_file, column_added):
    """Left-join one column of ``file2`` onto ``file1`` and save the result as CSV.

    Rows are matched on ``date`` (parsed to datetime in both files) and
    ``stock_RIC``. Every row of ``file1`` is kept; rows without a match get
    the literal string ``'NA'`` in ``column_added``.

    Note: this redefines ``merge_csv_files`` and shadows the directory-merging
    version (different signature) defined earlier in the notebook.

    Args:
        file1: Path of the base CSV file (left side of the join).
        file2: Path of the CSV file that provides ``column_added``.
        output_file: Path of the merged CSV file to write. It may equal
            ``file1``; both inputs are fully read before anything is written.
        column_added: Name of the ``file2`` column to attach.
    """
    key_columns = ['date', 'stock_RIC']

    # Read the CSV files into pandas DataFrames
    df1 = pd.read_csv(file1)
    print(df1)
    df2 = pd.read_csv(file2)
    print(df2)

    # Convert date columns to datetime so both sides join on the same dtype
    df1['date'] = pd.to_datetime(df1['date'])
    df2['date'] = pd.to_datetime(df2['date'])

    # Only the join keys and the requested column are taken from file2
    df2 = df2[key_columns + [column_added]]

    # Repeated keys on the right side make a left join emit extra rows.
    # Surface that instead of letting the row count grow silently.
    duplicate_keys = int(df2.duplicated(subset=key_columns).sum())
    if duplicate_keys:
        print(f"Warning: {file2} has {duplicate_keys} duplicate (date, stock_RIC) "
              f"rows; the merged output will contain more rows than {file1}")

    # Perform the merge operation
    merged_df = pd.merge(df1, df2, on=key_columns, how='left')

    # Assign the filled column back. The chained `fillna(..., inplace=True)`
    # form is deprecated and is a no-op under pandas Copy-on-Write.
    merged_df[column_added] = merged_df[column_added].fillna('NA')

    # Write the merged DataFrame to a new CSV file
    merged_df.to_csv(output_file, index=False)
    print(f"Merged data has been saved to {output_file}")

# Usage: attach ETF ownership, then market cap, to the index-membership table.
formatted_index_member = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/index_constituents_data/formated_constituents_stoxx_europe_600.csv'
output_file = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv"

file_etf_ownership = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_etf_holdings_600_.csv'
file_fund_ownership = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/aggregated_fund_ex_holdings_600_.csv"
m_stock_path = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/stock_level_data/m_stock_level_data.csv'

# Step 1: index membership + aggregated ETF holdings -> output_file.
merge_csv_files(
    file1=formatted_index_member,
    file2=file_etf_ownership,
    output_file=output_file,
    column_added='stock_value_held',
)
# Step 2: re-read output_file, add market cap, and overwrite it in place.
merge_csv_files(
    file1=output_file,
    file2=m_stock_path,
    output_file=output_file,
    column_added='market_cap',
)

#merge_csv_files(formatted_index_member, file_fund_ownership, formatted_index_member, "")

              date        stock_RIC  index_member
0       2010-01-01  0MW4EUR.xbo^K15             0
1       2010-02-01  0MW4EUR.xbo^K15             0
2       2010-03-01  0MW4EUR.xbo^K15             0
3       2010-04-01  0MW4EUR.xbo^K15             0
4       2010-05-01  0MW4EUR.xbo^K15             0
...            ...              ...           ...
196219  2023-08-01           ZURN.S             1
196220  2023-09-01           ZURN.S             1
196221  2023-10-01           ZURN.S             1
196222  2023-11-01           ZURN.S             1
196223  2023-12-01           ZURN.S             1

[196224 rows x 3 columns]
              stock_RIC        date  stock_value_held
0       0MW4EUR.xbo^K15  2010-01-01         14.731695
1       0MW4EUR.xbo^K15  2010-02-01         11.226477
2       0MW4EUR.xbo^K15  2010-03-01          9.887296
3       0MW4EUR.xbo^K15  2010-04-01         11.585353
4       0MW4EUR.xbo^K15  2010-05-01         15.047999
...                 ...         ...              

## ETF ownership as a fraction of market cap

In [11]:
import pandas as pd
# Reload the final merged table; the result is written back to the same file.
output_file = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/fund_holdings_data/formatted_final.csv"
df = pd.read_csv(output_file)

# Ownership ratio: held value scaled by 1e6, divided by market cap
# (presumably 'stock_value_held' is reported in millions; TODO confirm units).
df['ETF_ownership'] = df["stock_value_held"].mul(1_000_000).div(df["market_cap"])

display(df)

# Persist the table, now including the 'ETF_ownership' column.
df.to_csv(output_file, index=False)

Unnamed: 0,date,stock_RIC,index_member,stock_value_held,market_cap,ETF_ownership
0,2010-01-01,0MW4EUR.xbo^K15,0,14.731695,,
1,2010-02-01,0MW4EUR.xbo^K15,0,11.226477,,
2,2010-03-01,0MW4EUR.xbo^K15,0,9.887296,,
3,2010-04-01,0MW4EUR.xbo^K15,0,11.585353,,
4,2010-05-01,0MW4EUR.xbo^K15,0,15.047999,,
...,...,...,...,...,...,...
196237,2023-08-01,ZURN.S,1,5870.052977,6.559551e+10,0.089489
196238,2023-09-01,ZURN.S,1,5816.278767,6.521676e+10,0.089184
196239,2023-10-01,ZURN.S,1,5789.045875,6.373943e+10,0.090824
196240,2023-11-01,ZURN.S,1,6022.326667,6.283013e+10,0.095851
