# merge all csv files

### merge function

In [3]:
import pandas as pd
import os
from datetime import datetime

def merge_files(xlsx_dir, output_file_path):
    merged_df = pd.DataFrame()

    # Loop through all the files in the directory
    for file_name in os.listdir(xlsx_dir):
        if file_name.endswith('.xlsx'):
            file_path = os.path.join(xlsx_dir, file_name)
            
            # Read the .xlsx file, assuming date is in cell B2 (second row, second column)
            date_cell = pd.read_excel(file_path, header=None, usecols="B", skiprows=1, nrows=1).iloc[0, 0]
            # Parse the date string to a datetime object and reformat it
            date_obj = datetime.strptime(date_cell, '%d-%b-%Y')
            formatted_date = date_obj.strftime('%Y-%m-%d')
            
            # Read the rest of the .xlsx file, skipping the first two rows (header and date)
            df = pd.read_excel(file_path, skiprows=2)
            df['date'] = formatted_date
            
            merged_df = pd.concat([merged_df, df], ignore_index=True)

    # Rename columns as required
    merged_df.columns = ["stock_RIC", "stock_name", "country", "weight", "shares", "change", "date"]
    # Remove duplicates based on stock_name
    unique_df = merged_df.drop_duplicates(subset='stock_name')

    # Write to the specified output file path
    unique_df.to_csv(output_file_path, index=False)

    return merged_df

### STOXX Europe 50

In [4]:
raw_files = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/index_proxi_data/stoxx_europe_50_raw_data'
output_file_path = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/index_proxi_data/stoxx_europe_50_RIC_mapping.csv'
merged_df = merge_files(raw_files, output_file_path)

display(merged_df)

Unnamed: 0,stock_RIC,stock_name,country,weight,shares,change,date
0,RIC,Name,Country,Weight,No. Shares,Change,2020-07-31
1,AXAF.PA,AXA SA ORD,FRANCE,0.009833,95944,0,2020-07-31
2,SGEF.PA,VINCI SA ORD,FRANCE,0.011388,25741,0,2020-07-31
3,DGE.L,DIAGEO PLC ORD,UNITED KINGDOM,0.020469,108345,0,2020-07-31
4,ASML.AS,ASML HOLDING NV ORD,NETHERLANDS,0.035946,19737,0,2020-07-31
...,...,...,...,...,...,...,...
8855,ABI.BR,ANHEUSER-BUSCH INBEV SA ORD,BELGIUM,0.02362,120714,-365,2015-02-28
8856,BAYGn.DE,BAYER AG ORD,GERMANY,0.029462,129580,-760,2015-02-28
8857,BASFn.DE,BASF SE ORD,GERMANY,0.021621,144243,-503,2015-02-28
8858,GLEN.L,GLENCORE PLC ORD,SWITZERLAND,0.011022,1553937,-7056,2015-02-28


# Handling misssing data

### function

In [7]:
import pandas as pd
from datetime import datetime

def process_and_fill_dates(data, output_file_path):
    # Ensure 'Date' column is in datetime format
    data['date'] = pd.to_datetime(data['date'])
    data.sort_values('date', inplace=True)

    def fill_missing_months(data):
        unique_dates = data['date'].unique()
        min_date, max_date = min(unique_dates), max(unique_dates)
        all_dates = pd.date_range(start=min_date, end=max_date, freq='M').to_series()  # Generate all possible months
        
        last_valid_data = None
        results = []

        # Loop through each month in the generated date range
        for current_date in all_dates:
            if current_date in unique_dates:
                # If current month data is present, update last_valid_data
                last_valid_data = data[data['date'] == current_date]
            elif last_valid_data is not None:
                # If no data for the current month, copy last valid data and change the date
                temp_data = last_valid_data.copy()
                temp_data['date'] = current_date
                results.append(temp_data)
            else:
                # If no previous data is available (unlikely), continue without action
                continue

        # Concatenate all results with original data and re-sort
        if results:
            data = pd.concat([data] + results, ignore_index=True)
            data.sort_values('date', inplace=True)

        return data

    # Apply the function to fill missing months
    filled_data = fill_missing_months(merged_df)
    display(filled_data)

    filled_data.to_csv(output_file_path, index=False)

### STOXX Europe 50

In [8]:
output_file_path = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/index_proxi_data/stoxx_europe_50_index_proxi_merge.csv'
process_and_fill_dates(merged_df, output_file_path)

Unnamed: 0,stock_RIC,stock_name,country,weight,shares,change,date
0,BAYGn.DE,BAYER AG ORD,GERMANY,0.016491,141476,535,2010-01-31
28,ENI.MI,ENI SPA ORD,ITALY,0.016629,416904,1576,2010-01-31
29,ORAN.PA,ORANGE SA ORD,FRANCE,0.013004,332010,-13596,2010-01-31
30,GSK.L,GSK PLC ORD,UNITED KINGDOM,0.029601,895859,3388,2010-01-31
31,INGATEUR.xbo^G16,ING GROEP NV DR,NETHERLANDS,0.010685,660696,2501,2010-01-31
...,...,...,...,...,...,...,...
8825,SASY.PA,SANOFI SA ORD,FRANCE,0.020813,25540,146,2023-12-31
8826,INGA.AS,ING GROEP NV ORD,NETHERLANDS,0.009396,76515,-3596,2023-12-31
8827,,USD CASH,UNITED STATES,0,50,-118258,2023-12-31
8815,ROG.S,ROCHE HOLDING AG,SWITZERLAND,0.037364,15650,88,2023-12-31


# index member dummy variable

### functions

In [9]:
import pandas as pd

def process_index_etf_data(etf_file_path, output_file_path, index_file_path = False):
    # Load ETF and index data from specified paths
    filled_data = pd.read_csv(etf_file_path)
    etf_data = filled_data[["stock_RIC", "stock_name", "date"]]

    if index_file_path == False:
        df = etf_data

    else:
        index_data = pd.read_csv(index_file_path)
        df = pd.concat([index_data, etf_data], ignore_index=True)
    
    # Create a pivot table with 'stock_RIC' as columns and 'date' as rows
    pivot_df = df.pivot_table(index='date', columns='stock_RIC', aggfunc='size', fill_value=0)
    
    # Convert pivot table to long format
    pivot_df.reset_index(inplace=True)
    long_df = pivot_df.melt(id_vars=['date'], var_name='stock_RIC', value_name='member')
    
    # Adjust the 'member' column to be binary (1 if member, 0 if not)
    long_df['member'] = (long_df['member'] > 0).astype(int)
    long_df.columns = ["date", "stock_RIC", "index_member"]
    
    # Convert 'date' from string to date format and ensure it's just the date without time
    long_df['date'] = pd.to_datetime(long_df['date']).dt.date
    
    # Save the processed data to a specified output file path
    long_df.to_csv(output_file_path, index=False)

### STOXX Europe 50

In [10]:
index_proxi_file_path = '/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/index_proxi_data/stoxx_europe_50_index_proxi_merge.csv'
output_file_path  = "/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/index_constituents_data/STOXX_Europe_50/formated_constituents_stoxx_europe_50.csv"

process_index_etf_data(index_proxi_file_path, output_file_path)

### STOXX Europe 600

# merge index membership 50 and 600

In [11]:
import pandas as pd

# Load the datasets from CSV files
df_600 = pd.read_csv('/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/index_constituents_data/STOXX_Europe_600/formated_constituents_stoxx_europe_600.csv')
df_50 = pd.read_csv('/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/index_constituents_data/STOXX_Europe_50/formated_constituents_stoxx_europe_50.csv')

# Rename 'index_member' columns to differentiate them
df_600.rename(columns={'index_member': 'index_member_600'}, inplace=True)
df_50.rename(columns={'index_member': 'index_member_50'}, inplace=True)

# Merge the two dataframes on 'date' and 'stock_RIC'
merged_df = pd.merge(df_600, df_50, on=['date', 'stock_RIC'], how='left')

# Fill missing 'index_member_50' values with 0
merged_df['index_member_50'].fillna(0, inplace=True)
merged_df['index_member_50'] = merged_df['index_member_50'].astype(int)  # Ensure it's an integer

display(merged_df)
merged_df.to_csv('/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/index_constituents_data/formated_constituents.csv', index=False)


Unnamed: 0,date,stock_RIC,index_member_600,index_member_50
0,2009-12-31,0MW4EUR.xbo^K15,0,0
1,2010-01-31,0MW4EUR.xbo^K15,0,0
2,2010-02-28,0MW4EUR.xbo^K15,0,0
3,2010-03-31,0MW4EUR.xbo^K15,0,0
4,2010-04-30,0MW4EUR.xbo^K15,0,0
...,...,...,...,...
196219,2023-07-31,ZURN.S,1,1
196220,2023-08-31,ZURN.S,1,1
196221,2023-09-30,ZURN.S,1,1
196222,2023-10-31,ZURN.S,1,1
