### Import data

In [31]:
import pandas as pd
#import daily stock prices

df_raw = pd.read_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/stock_level_data/stock_level_data.csv")

### clean data

In [56]:
df = df_raw[["stock_RIC", "date", "price"]].copy(deep=True)

# Convert 'date' column to datetime, coercing errors to NaT
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Drop rows with NaT in 'date' column
df.dropna(subset=['date'], inplace=True)

# Drop rows with NaN in 'price' column and make sure the change is in-place
df.dropna(subset=["price"], inplace=True)

######## getting rid of time
df['date'] = df['date'].dt.date
df['date'] = pd.to_datetime(df['date'], errors='coerce')
#######

df.to_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/stock_level_data/cleaned_stock_level_data.csv", index=False)

df.set_index('date', inplace=True)

######################## Calculate daily returns for each stock
df['Daily_Returns'] = df.groupby('stock_RIC')['price'].pct_change()
df.dropna(subset=["Daily_Returns"], inplace = True)

display(df)
print(df.dtypes)

Unnamed: 0_level_0,stock_RIC,price,Daily_Returns
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-04,1U1.DE,4.960000,-0.017822
2010-01-05,1U1.DE,4.920000,-0.008065
2010-01-06,1U1.DE,4.780000,-0.028455
2010-01-07,1U1.DE,4.640000,-0.029289
2010-01-08,1U1.DE,4.570000,-0.015086
...,...,...,...
2023-12-21,ZURN.S,467.319907,-0.004237
2023-12-22,ZURN.S,467.883798,0.001207
2023-12-27,ZURN.S,468.655209,0.001649
2023-12-28,ZURN.S,466.683790,-0.004207


stock_RIC         object
price            float64
Daily_Returns    float64
dtype: object


### variance ratio - working

In [57]:
import pandas as pd
import numpy as np

######################## working

def calculate_variance_ratio(data):
    # Calculate five-day returns by summing log returns over five days
    data['Five_Day_Returns'] = np.log(1 + data['Daily_Returns']).rolling(window=5, min_periods=5).sum()

    # Identify every 5th trading day, considering actual data days for non-overlapping periods
    mask = data.index.to_series().groupby(data['stock_RIC']).cumcount() % 5 == 0

    # Calculate the variance of the five-day returns for these non-overlapping periods
    five_day_var = data.loc[mask, 'Five_Day_Returns'].var(ddof=1)

    # how many times we perform a variance calculation
    num_periods = mask.sum()

    # Calculate the daily variance (average of variances over the 5-day windows)
    daily_var = data['Daily_Returns'].rolling(window=5, min_periods=5).var(ddof=1).loc[mask].mean()

    # # Count the number of actual data points in each 5-day window used for the variance calculation
    # count_returns = data['Daily_Returns'].rolling(window=5, min_periods=5).count().loc[mask]

    # Calculate the variance ratio
    variance_ratio = five_day_var / (5 * daily_var) if daily_var > 0 else np.nan
    
    # Return both the variance ratio and the count of returns in the last used window
    return pd.Series({'Variance_Ratio': variance_ratio, 'Count_Returns': num_periods})

# Apply the function within each stock group and possibly resample by quarter
variance_ratios = df.groupby('stock_RIC').apply(lambda x: x.resample('Q').apply(calculate_variance_ratio))

# Reset index if needed and clean up the DataFrame
variance_ratios.reset_index(inplace=True)
# variance_ratios.drop(columns=['level_2'], inplace=True)  # Drop extra index level if present


# Display or save results
print(variance_ratios.head())




         stock_RIC       date  Variance_Ratio  Count_Returns
0  0MW4EUR.xbo^K15 2014-09-30        0.938783            7.0
1  0MW4EUR.xbo^K15 2014-12-31        1.138249           13.0
2  0MW4EUR.xbo^K15 2015-03-31        1.363008           13.0
3  0MW4EUR.xbo^K15 2015-06-30        0.956091           13.0
4  0MW4EUR.xbo^K15 2015-09-30        1.082309           14.0


In [58]:
display(variance_ratios)

Unnamed: 0,stock_RIC,date,Variance_Ratio,Count_Returns
0,0MW4EUR.xbo^K15,2014-09-30,0.938783,7.0
1,0MW4EUR.xbo^K15,2014-12-31,1.138249,13.0
2,0MW4EUR.xbo^K15,2015-03-31,1.363008,13.0
3,0MW4EUR.xbo^K15,2015-06-30,0.956091,13.0
4,0MW4EUR.xbo^K15,2015-09-30,1.082309,14.0
...,...,...,...,...
48288,ZURN.S,2022-12-31,0.541870,13.0
48289,ZURN.S,2023-03-31,0.363773,13.0
48290,ZURN.S,2023-06-30,0.091620,12.0
48291,ZURN.S,2023-09-30,1.560581,13.0


In [53]:
display(variance_ratios)

Unnamed: 0,stock_RIC,Variance_Ratio,Five_Day_Variance,Daily_Variance
0,0MW4EUR.xbo^K15,"[0.9899525941564677, 1.2399895240146113, 1.211...",date 2014-09-30 8.111848e-04 2014-12-31 ...,date 2014-09-30 1.638836e-04 2014-12-31 ...
1,1COV.DE,"[1.4636806377192533, 1.285175215297745, 1.0939...",date 2015-12-31 0.002388 2016-03-31 0.00...,date 2015-12-31 0.000326 2016-03-31 0.00...
2,1U1.DE,"[1.593144661506048, 0.24033873807699696, 0.660...",date 2010-03-31 0.004594 2010-06-30 0.00...,date 2010-03-31 0.000577 2010-06-30 0.00...
3,A2.MI,"[2.135346142397572, 1.297300927999113, 1.40868...",date 2010-03-31 0.001497 2010-06-30 0.00...,date 2010-03-31 0.000140 2010-06-30 0.00...
4,AAAA.L^C21,"[nan, 1.1809050724638672, 1.3162101482025728, ...",date 2014-06-30 NaN 2014-09-30 0.00...,date 2014-06-30 0.000266 2014-09-30 0.00...
...,...,...,...,...
1008,ZO1G.DE^A22,"[0.7600914437290106, 0.5645998725626117, 0.453...",date 2010-03-31 0.001415 2010-06-30 0.00...,date 2010-03-31 0.000372 2010-06-30 0.00...
1009,ZODC.PA^C18,"[0.652244736652197, 0.8315686613307806, 0.7972...",date 2010-03-31 0.001054 2010-06-30 0.00...,date 2010-03-31 0.000323 2010-06-30 0.00...
1010,ZODCEUR.xbo^C18,"[1.8288379933542076, 1.384096373619694, 1.2675...",date 2014-09-30 0.000846 2014-12-31 0.00...,date 2014-09-30 0.000092 2014-12-31 0.00...
1011,ZOTssEUR.xbo^E22,"[1.6032775254656204, 1.2124476545521203, 0.782...",date 2014-09-30 9.244114e-04 2014-12-31 ...,date 2014-09-30 1.153152e-04 2014-12-31 ...


### testing variance ratio

In [45]:
import pandas as pd
import numpy as np


def calculate_quarterly_variance_ratio(data):
    # Ensure daily returns are calculated if not present
    if 'Daily_Returns' not in data.columns:
        data['Daily_Returns'] = data['price'].pct_change()

    # Calculate five-day returns by summing log returns over five days
    data['Five_Day_Returns'] = np.log(1 + data['Daily_Returns']).rolling(window=5, min_periods=5).sum()

    # Identify non-overlapping 5th days
    mask = data.index.to_series().groupby(data['stock_RIC']).cumcount() % 5 == 0
    non_overlapping_five_day_returns = data.loc[mask, 'Five_Day_Returns']

    # Calculate the variance of these non-overlapping five-day returns for each quarter
    five_day_var = non_overlapping_five_day_returns.resample('Q').var(ddof=1)

    # Calculate the variance of all daily returns for each quarter
    daily_var = data['Daily_Returns'].resample('Q').var(ddof=1)

    # Calculate the variance ratio
    # This needs to handle Series, so use numpy where for vectorized conditional operation
    variance_ratio = np.where(daily_var > 0, five_day_var / (5 * daily_var), np.nan)

    # Return results as a Series
    return pd.Series({
        'Variance_Ratio': variance_ratio,
        'Five_Day_Variance': five_day_var,
        'Daily_Variance': daily_var
    })

# Assuming df is prepared
variance_ratios = df.groupby('stock_RIC').apply(calculate_quarterly_variance_ratio)

# Reset index and cleanup
variance_ratios.reset_index(inplace=True)
if 'level_2' in variance_ratios.columns:
    variance_ratios.drop(columns=['level_2'], inplace=True)

print(variance_ratios.head())




         stock_RIC                                     Variance_Ratio  \
0  0MW4EUR.xbo^K15  [0.9899525941564677, 1.2399895240146113, 1.211...   
1          1COV.DE  [1.4636806377192533, 1.285175215297745, 1.0939...   
2           1U1.DE  [1.593144661506048, 0.24033873807699696, 0.660...   
3            A2.MI  [2.135346142397572, 1.297300927999113, 1.40868...   
4       AAAA.L^C21  [nan, 1.1809050724638672, 1.3162101482025728, ...   

                                   Five_Day_Variance  \
0  date
2014-09-30    8.111848e-04
2014-12-31    ...   
1  date
2015-12-31    0.002388
2016-03-31    0.00...   
2  date
2010-03-31    0.004594
2010-06-30    0.00...   
3  date
2010-03-31    0.001497
2010-06-30    0.00...   
4  date
2014-06-30         NaN
2014-09-30    0.00...   

                                      Daily_Variance  
0  date
2014-09-30    1.638836e-04
2014-12-31    ...  
1  date
2015-12-31    0.000326
2016-03-31    0.00...  
2  date
2010-03-31    0.000577
2010-06-30    0.00...  
3  d

In [59]:
# Create a date range for quarters between 2010 and 2023
quarter_dates = pd.date_range(start='2010-01-01', end='2023-12-31', freq='Q')

# Assuming you have a list of all unique stock_RICs from your original DataFrame
stocks = df['stock_RIC'].unique()

# Create a DataFrame from all combinations of quarter_dates and stocks
quarters = pd.MultiIndex.from_product([quarter_dates, stocks], names=['date', 'stock_RIC'])
df_quarters = pd.DataFrame(index=quarters).reset_index()

# Format the 'date' to datetime if not already
df_quarters['date'] = pd.to_datetime(df_quarters['date'])


# Merge the complete quarters DataFrame with the variance_ratios
final_df = df_quarters.merge(variance_ratios, on=['date', 'stock_RIC'], how='left')

# Fill NA for quarters with no data
final_df.fillna('NA', inplace=True)

# Check the final DataFrame
print(final_df.head())

# Optionally, you might want to sort or reindex based on your needs
final_df.sort_values(by=['stock_RIC', 'date'], inplace=True)


        date stock_RIC Variance_Ratio Count_Returns
0 2010-03-31    1U1.DE        1.79202          13.0
1 2010-03-31     A2.MI       2.733786          13.0
2 2010-03-31    AAK.ST       1.365109          13.0
3 2010-03-31     AAL.L       1.017296          13.0
4 2010-03-31   AALB.AS       1.146916          13.0


In [61]:
final_df.to_csv("/Users/jonathanzeh/Library/CloudStorage/OneDrive-Personal/BA_Thesis/BA_coding/datasets/eikon_data/variable_data/variance_ratio.csv", index=False)

## create quaterly data set

In [52]:
# Create a date range of quarters
quarters = pd.date_range(start='2010-01-01', end='2023-12-01', freq='Q')

# Get unique stock identifiers from your dataset
stocks = df['stock_RIC'].unique()

# Create a DataFrame from every combination of stock_RIC and date
df_q = pd.MultiIndex.from_product([stocks, quarters], names=['stock_RIC', 'date_quarter']).to_frame(index=False)

display(df_q)

Unnamed: 0,stock_RIC,date_quarter
0,1U1.DE,2010-03-31
1,1U1.DE,2010-06-30
2,1U1.DE,2010-09-30
3,1U1.DE,2010-12-31
4,1U1.DE,2011-03-31
...,...,...
55710,SYENS.BR,2022-09-30
55711,SYENS.BR,2022-12-31
55712,SYENS.BR,2023-03-31
55713,SYENS.BR,2023-06-30


## calculate variance for each quarter

#### GPT prompt v1
The data frame df contains return data. please calculate for each row the variance from the daily returns that are within the quarter. for 2010-03-31 is should consider all returns from 2010-01-01 till 2010-03-31. if there are no returns to calculate the variance put in a NA. the new column should be called daily_variance.

Create a second column called 5_day_variance. Identify every 5th trading day, considering actual data days for non-overlapping periods. here you first need to calculate the five_day_returns. for all five_day_returns within a quarter you should again calculate the variance: five_day_variance. count the number of five_day_returns for each quarter and put them in a new column named count. if there are no returns fill in NA for the variance and also for count

add the new columns to the df_q. df_q is a data frame with rows for each quarter between 2010-01-01 and 2023-12-31 for each stock_RIC

#### GPT promt v2
can you extract all data points for each quarter from an existing df. the new data frame df_q has a row for each stock_RIC for each quarter between 2010-01-01 to 2023-31-12. for each date_quarter go to the data set df and get all 

In [51]:
import numpy as np

# Calculate daily variance for each quarter
daily_variance = df.groupby('stock_RIC').resample('Q')['Daily_Returns'].var().rename('daily_variance').reset_index()

def calculate_5_day_stats(group):
    # Calculate 5-day returns
    group['Five_Day_Returns'] = np.log(1 + group['Daily_Returns']).rolling(window=5, min_periods=5).sum()
    
    # Correctly apply cumcount() within a groupby context
    mask = group.reset_index().groupby('stock_RIC').cumcount() % 5 == 0  # Reset index if the date is the index
    
    # Select non-overlapping five-day returns
    five_day_returns = group.loc[mask, 'Five_Day_Returns']
    
    # Calculate variance of five-day returns
    five_day_variance = five_day_returns.var()
    
    # Count valid non-overlapping 5-day periods
    count = five_day_returns.count()
    
    return pd.Series({
        'five_day_variance': five_day_variance if count > 0 else np.nan,
        'count': count
    })

# Calculate daily variance for each quarter
daily_variance = df.groupby('stock_RIC').resample('Q')['Daily_Returns'].var().rename('daily_variance').reset_index()

# Calculate 5-day variance and count for each quarter
five_day_stats = df.groupby('stock_RIC').resample('Q').apply(calculate_5_day_stats).reset_index()


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).