### variance ratio - final

In [4]:
import pandas as pd
import numpy as np

#import daily stock prices
df_raw = pd.read_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/stock_level_data/stock_level_data.csv")
df = df_raw[["stock_RIC", "date", "price", "return1D", "market_cap"]].copy(deep=True)

# Convert 'date' column to datetime, coercing errors to NaT
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Drop rows with NaT in 'date' column
df.dropna(subset=['date'], inplace=True)

# Drop rows with NaN in 'price' column and make sure the change is in-place
df.dropna(subset=["price"], inplace=True)

######## getting rid of time
df['date'] = df['date'].dt.date
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df.set_index('date', inplace=True)

######################## Calculate daily returns for each stock
df['Daily_Returns'] = df.groupby('stock_RIC')['price'].pct_change()
df.dropna(subset=["Daily_Returns"], inplace = True)

##################################################### variance ratio
def calculate_variance_ratio(data):
    # Calculate five-day returns by summing log returns over five days
    data['Five_Day_Returns'] = np.log(1 + data['Daily_Returns']).rolling(window=5, min_periods=5).sum()

    # Identify every 5th trading day, considering actual data days for non-overlapping periods
    mask = data.index.to_series().groupby(data['stock_RIC']).cumcount() % 5 == 0

    # Calculate the variance of the five-day returns for these non-overlapping periods
    five_day_var = data.loc[mask, 'Five_Day_Returns'].var(ddof=1)

    # how many times we perform a variance calculation
    num_periods = mask.sum()

    # Calculate the daily variance (average of variances over the 5-day windows)
    daily_var = data['Daily_Returns'].rolling(window=5, min_periods=5).var(ddof=1).loc[mask].mean()

    # # Count the number of actual data points in each 5-day window used for the variance calculation
    # count_returns = data['Daily_Returns'].rolling(window=5, min_periods=5).count().loc[mask]

    # Calculate the variance ratio
    variance_ratio = five_day_var / (5 * daily_var) if daily_var > 0 else np.nan
    abs_variance_ratio = abs((five_day_var / (5 * daily_var)) - 1) if daily_var > 0 else np.nan
    
    # Return both the variance ratio and the count of returns in the last used window
    return pd.Series({'Variance_Ratio': variance_ratio, "abs_Variance_Ratio": abs_variance_ratio, 'Count_Returns': num_periods})

# Apply the function within each stock group and possibly resample by quarter
variance_ratios = df.groupby('stock_RIC').apply(lambda x: x.resample('Q').apply(calculate_variance_ratio))

# Reset index if needed and clean up the DataFrame
variance_ratios.reset_index(inplace=True)
# variance_ratios.drop(columns=['level_2'], inplace=True)  # Drop extra index level if present


# Display or save results
print(variance_ratios.head())
display(variance_ratios)



         stock_RIC       date  Variance_Ratio  abs_Variance_Ratio  \
0  0MW4EUR.xbo^K15 2014-09-30        0.963476            0.036524   
1  0MW4EUR.xbo^K15 2014-12-31        1.138249            0.138249   
2  0MW4EUR.xbo^K15 2015-03-31        1.363008            0.363008   
3  0MW4EUR.xbo^K15 2015-06-30        0.956091            0.043909   
4  0MW4EUR.xbo^K15 2015-09-30        1.082309            0.082309   

   Count_Returns  
0           11.0  
1           13.0  
2           13.0  
3           13.0  
4           14.0  


Unnamed: 0,stock_RIC,date,Variance_Ratio,abs_Variance_Ratio,Count_Returns
0,0MW4EUR.xbo^K15,2014-09-30,0.963476,0.036524,11.0
1,0MW4EUR.xbo^K15,2014-12-31,1.138249,0.138249,13.0
2,0MW4EUR.xbo^K15,2015-03-31,1.363008,0.363008,13.0
3,0MW4EUR.xbo^K15,2015-06-30,0.956091,0.043909,13.0
4,0MW4EUR.xbo^K15,2015-09-30,1.082309,0.082309,14.0
...,...,...,...,...,...
48291,ZURN.S,2022-12-31,0.541870,0.458130,13.0
48292,ZURN.S,2023-03-31,0.363773,0.636227,13.0
48293,ZURN.S,2023-06-30,0.091620,0.908380,12.0
48294,ZURN.S,2023-09-30,1.560581,0.560581,13.0


In [5]:
# Create a date range for quarters between 2010 and 2023
quarter_dates = pd.date_range(start='2010-01-01', end='2023-12-31', freq='Q')

# Assuming you have a list of all unique stock_RICs from your original DataFrame
stocks = df['stock_RIC'].unique()

# Create a DataFrame from all combinations of quarter_dates and stocks
quarters = pd.MultiIndex.from_product([quarter_dates, stocks], names=['date', 'stock_RIC'])
df_quarters = pd.DataFrame(index=quarters).reset_index()

# Format the 'date' to datetime if not already
df_quarters['date'] = pd.to_datetime(df_quarters['date'])


# Merge the complete quarters DataFrame with the variance_ratios
final_df = df_quarters.merge(variance_ratios, on=['date', 'stock_RIC'], how='left')

# Fill NA for quarters with no data
final_df.fillna('NA', inplace=True)

# Check the final DataFrame
print(final_df.head())

# Optionally, you might want to sort or reindex based on your needs
final_df.sort_values(by=['stock_RIC', 'date'], inplace=True)

final_df.to_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/variable_data/variance_ratio.csv", index=False)

        date stock_RIC Variance_Ratio abs_Variance_Ratio Count_Returns
0 2010-03-31    1U1.DE        1.79202            0.79202          13.0
1 2010-03-31     A2.MI       2.733786           1.733786          13.0
2 2010-03-31    AAK.ST       1.365109           0.365109          13.0
3 2010-03-31     AAL.L       1.017296           0.017296          13.0
4 2010-03-31   AALB.AS       1.146916           0.146916          13.0


### variance ratio old

In [87]:
import pandas as pd
import numpy as np


def calculate_quarterly_variance_ratio(data):
    # Ensure daily returns are calculated if not present
    if 'Daily_Returns' not in data.columns:
        data['Daily_Returns'] = data['price'].pct_change()

    # Calculate five-day returns by summing log returns over five days
    data['Five_Day_Returns'] = np.log(1 + data['Daily_Returns']).rolling(window=5, min_periods=5).sum()

    # Identify non-overlapping 5th days
    mask = data.index.to_series().groupby(data['stock_RIC']).cumcount() % 5 == 0
    non_overlapping_five_day_returns = data.loc[mask, 'Five_Day_Returns']

    # Calculate the variance of these non-overlapping five-day returns for each quarter
    five_day_var = non_overlapping_five_day_returns.resample('Q').var(ddof=1)

    # Calculate the variance of all daily returns for each quarter
    daily_var = data['Daily_Returns'].resample('Q').var(ddof=1)

    # Calculate the variance ratio
    # This needs to handle Series, so use numpy where for vectorized conditional operation
    variance_ratio = np.where(daily_var > 0, five_day_var / (5 * daily_var), np.nan)
    abs_variance_ratio = np.where(daily_var > 0, abs((five_day_var / (5 * daily_var)) - 1), np.nan)

    # Return results as a Series
    return pd.Series({
        'Variance_Ratio': variance_ratio,
        "abs_Variance_Ratio": abs_variance_ratio,
        'Five_Day_Variance': five_day_var,
        'Daily_Variance': daily_var
    })

# Assuming df is prepared
variance_ratios = df.groupby('stock_RIC').apply(calculate_quarterly_variance_ratio)

# Reset index and cleanup
variance_ratios.reset_index(inplace=True)
if 'level_2' in variance_ratios.columns:
    variance_ratios.drop(columns=['level_2'], inplace=True)

print(variance_ratios.head())




         stock_RIC                                     Variance_Ratio  \
0  0MW4EUR.xbo^K15  [0.9899525941564677, 1.2399895240146113, 1.211...   
1          1COV.DE  [1.4636806377192533, 1.285175215297745, 1.0939...   
2           1U1.DE  [1.593144661506048, 0.24033873807699696, 0.660...   
3            A2.MI  [2.135346142397572, 1.297300927999113, 1.40868...   
4       AAAA.L^C21  [nan, 1.1809050724638672, 1.3162101482025728, ...   

                                  abs_Variance_Ratio  \
0  [0.010047405843532253, 0.23998952401461127, 0....   
1  [0.4636806377192533, 0.2851752152977449, 0.093...   
2  [0.593144661506048, 0.759661261923003, 0.33962...   
3  [1.1353461423975721, 0.297300927999113, 0.4086...   
4  [nan, 0.18090507246386722, 0.31621014820257276...   

                                   Five_Day_Variance  \
0  date
2014-09-30    8.111848e-04
2014-12-31    ...   
1  date
2015-12-31    0.002388
2016-03-31    0.00...   
2  date
2010-03-31    0.004594
2010-06-30    0.00...   


In [88]:
# Create a date range for quarters between 2010 and 2023
quarter_dates = pd.date_range(start='2010-01-01', end='2023-12-31', freq='Q')

# Assuming you have a list of all unique stock_RICs from your original DataFrame
stocks = df['stock_RIC'].unique()

# Create a DataFrame from all combinations of quarter_dates and stocks
quarters = pd.MultiIndex.from_product([quarter_dates, stocks], names=['date', 'stock_RIC'])
df_quarters = pd.DataFrame(index=quarters).reset_index()

# Format the 'date' to datetime if not already
df_quarters['date'] = pd.to_datetime(df_quarters['date'])


# Merge the complete quarters DataFrame with the variance_ratios
final_df = df_quarters.merge(variance_ratios, on=['date', 'stock_RIC'], how='left')

# Fill NA for quarters with no data
final_df.fillna('NA', inplace=True)

# Check the final DataFrame
print(final_df.head())

# Optionally, you might want to sort or reindex based on your needs
final_df.sort_values(by=['stock_RIC', 'date'], inplace=True)

final_df.to_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/variable_data/variance_ratio.csv", index=False)

KeyError: 'date'

## ETF_owernship at the start of each quarter

In [40]:

import pandas as pd
import numpy as np

df_vr = pd.read_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/variable_data/variance_ratio.csv")
df_ownership = pd.read_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/variable_data/monthly_panel_v1.csv", index_col=False)
df_amihud = pd.read_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/variable_data/monthly_panel_v1.csv", index_col=False)

################################################################ adjust dates of etf ownership
df_ownership["date"] = pd.to_datetime(df_ownership["date"])
specific_months = [3, 6, 9, 12]
df_ownership['month'] = df_ownership['date'].dt.month
df_ownership = df_ownership[df_ownership['month'].isin(specific_months)]
df_ownership = df_ownership.drop(columns=['month', "return1Mo", "price_to_BV", "DATE_monthly_std_dev", 
                                          "monthly_std_dev", "count_returns", "amihud_ratio", 
                                          "stock_value_held", "FUND_stock_value_held"])

############################################################## averaged amihud_ratio
df_amihud.drop(columns='Unnamed: 0', inplace=True)
df_amihud['date'] = pd.to_datetime(df_amihud['date'])

G = df_amihud.groupby(['stock_RIC', pd.Grouper(key='date', freq='Q')])  # If date is not set as index

# Calculate the mean of amihud_ratio for each group
quarterly_amihud_avg = G['amihud_ratio'].mean()
quarterly_amihud_avg = quarterly_amihud_avg.reset_index()

#################### merge amihud_ratio with ownership
key_columns = ['date', 'stock_RIC']
df_ownership = pd.merge(df_ownership, quarterly_amihud_avg, on=key_columns)
display(df_ownership)

################################################################ adjust dates of variance ratio
df_vr["date"] = pd.to_datetime(df_vr["date"])
df_vr["DATE_variance_ratio"] = df_vr["date"]
df_vr['date'] = df_vr['date'] - pd.offsets.MonthEnd(3)
display(df_vr)

################################################################ merge
merged_df = pd.merge(df_vr, df_ownership, on=key_columns, how='left')

################################################################ set empty rows to NA for all columns
# Identify rows where either 'variance_ratio' or 'ETF_ownership' is NaN
condition = merged_df['Variance_Ratio'].isna() | merged_df['ETF_ownership'].isna()

# Columns to be emptied
columns_to_empty = ["Variance_Ratio","ETF_ownership"]  # Add other columns as needed

# Set these columns to NaN where the condition is True
merged_df.loc[condition, columns_to_empty] = np.nan
merged_df.drop(columns='Unnamed: 0', inplace=True)

display(merged_df)
merged_df.to_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/variable_data/quarterly_panel_v1.csv", index=False)


Unnamed: 0.1,Unnamed: 0,date,stock_RIC,index_member,market_cap,ETF_ownership,FUND_ownership,price,gross_profit,cumulative_return_12m,amihud_ratio
0,0,2009-12-31,0MW4EUR.xbo^K15,0,,,,,,,
1,3,2010-03-31,0MW4EUR.xbo^K15,0,,,,,,,
2,6,2010-06-30,0MW4EUR.xbo^K15,0,,,,,,,
3,9,2010-09-30,0MW4EUR.xbo^K15,1,,,,,,,
4,12,2010-12-31,0MW4EUR.xbo^K15,1,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
65403,196209,2022-09-30,ZURN.S,1,6.146201e+10,0.080848,0.051847,408.493565,,0.148781,0.000003
65404,196212,2022-12-31,ZURN.S,1,6.726844e+10,0.081980,0.052762,447.084725,,0.154828,0.000003
65405,196215,2023-03-31,ZURN.S,1,6.641722e+10,0.083596,0.054211,441.427244,,-0.013498,0.000002
65406,196218,2023-06-30,ZURN.S,1,6.545078e+10,0.088283,0.058197,435.004011,,0.048664,0.000002


Unnamed: 0,date,stock_RIC,Variance_Ratio,abs_Variance_Ratio,Count_Returns,DATE_variance_ratio
0,2009-12-31,0MW4EUR.xbo^K15,,,,2010-03-31
1,2010-03-31,0MW4EUR.xbo^K15,,,,2010-06-30
2,2010-06-30,0MW4EUR.xbo^K15,,,,2010-09-30
3,2010-09-30,0MW4EUR.xbo^K15,,,,2010-12-31
4,2010-12-31,0MW4EUR.xbo^K15,,,,2011-03-31
...,...,...,...,...,...,...
56723,2022-09-30,ZURN.S,0.541870,0.458130,13.0,2022-12-31
56724,2022-12-31,ZURN.S,0.363773,0.636227,13.0,2023-03-31
56725,2023-03-31,ZURN.S,0.091620,0.908380,12.0,2023-06-30
56726,2023-06-30,ZURN.S,1.560581,0.560581,13.0,2023-09-30


Unnamed: 0,date,stock_RIC,Variance_Ratio,abs_Variance_Ratio,Count_Returns,DATE_variance_ratio,index_member,market_cap,ETF_ownership,FUND_ownership,price,gross_profit,cumulative_return_12m,amihud_ratio
0,2009-12-31,0MW4EUR.xbo^K15,,,,2010-03-31,0,,,,,,,
1,2010-03-31,0MW4EUR.xbo^K15,,,,2010-06-30,0,,,,,,,
2,2010-06-30,0MW4EUR.xbo^K15,,,,2010-09-30,0,,,,,,,
3,2010-09-30,0MW4EUR.xbo^K15,,,,2010-12-31,1,,,,,,,
4,2010-12-31,0MW4EUR.xbo^K15,,,,2011-03-31,1,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56723,2022-09-30,ZURN.S,0.541870,0.458130,13.0,2022-12-31,1,6.146201e+10,0.080848,0.051847,408.493565,,0.148781,0.000003
56724,2022-12-31,ZURN.S,0.363773,0.636227,13.0,2023-03-31,1,6.726844e+10,0.081980,0.052762,447.084725,,0.154828,0.000003
56725,2023-03-31,ZURN.S,0.091620,0.908380,12.0,2023-06-30,1,6.641722e+10,0.083596,0.054211,441.427244,,-0.013498,0.000002
56726,2023-06-30,ZURN.S,1.560581,0.560581,13.0,2023-09-30,1,6.545078e+10,0.088283,0.058197,435.004011,,0.048664,0.000002


## average amihud ratio

In [38]:
df_amihud = pd.read_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/variable_data/monthly_panel_v1.csv", index_col=False)
df_amihud.drop(columns='Unnamed: 0', inplace=True)
df_amihud['date'] = pd.to_datetime(df_amihud['date'])

G = df_amihud.groupby(['stock_RIC', pd.Grouper(key='date', freq='Q')])  # If date is not set as index

# Calculate the mean of amihud_ratio for each group
quarterly_amihud_avg = G['amihud_ratio'].mean()

# Reset index if you want to turn the grouped data back into a DataFrame
quarterly_amihud_avg = quarterly_amihud_avg.reset_index()
