## Imports.

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'
import re
import json
import rdata
import torch
import random
import pickle
import datasets
import numpy as np
import transformers
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from scipy import stats
from typing import Optional
import statsmodels.api as sm
from numpy.linalg import norm
from llm_measure import measure
import matplotlib.pyplot as plt
from scipy.stats import wilcoxon
from  datasets import load_dataset
from readability import Readability
from sklearn.decomposition import PCA
from nltk.tokenize import sent_tokenize
from scipy.stats.mstats import winsorize
from dataclasses import dataclass, field
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, HfArgumentParser, TrainingArguments

## Review data.

In [2]:
# Read the sentence-level table; the first CSV column becomes the index.
total_df = pd.read_csv('./data/pc_data.csv', index_col=0)

# Both text lists hold the full 'sentence' column as independent list objects.
all_texts = total_df['sentence'].tolist()
probing_texts = list(all_texts)

## OLS.

In [3]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only point this at artifacts produced by this project.
    with open(path, 'rb') as file:
        return pickle.load(file)


def _fit_standardized_ols(frame, regressor, target):
    """Fit OLS of ``frame[target]`` on the z-scored ``frame[regressor]`` plus an intercept."""
    raw_x = np.array(frame[regressor].tolist()).reshape(-1, 1)
    scaled_x = StandardScaler().fit_transform(raw_x).reshape(-1)
    design = sm.add_constant(scaled_x)
    return sm.OLS(frame[target].tolist(), design).fit()


def ols(df,
        out_name,
        minus=False,
        measure = '10 Yr',
        baseline = False
       ):
    """Regress a Treasury series on the per-document mean of the projected embedding score.

    Steps performed:
      1. Load ``./results/<out_name>_pc.pkl``, ``_scale.pkl`` and ``_emd.pkl``.
      2. Project the embeddings onto the first row of ``pc.components_`` and
         rescale the projection with the loaded scaler.
      3. Attach the score to a copy of ``df`` as column ``'ours'``, drop
         duplicate rows, and average ``'ours'`` per ``'file_path'``.
      4. Inner-join with ``./data/aggregate_measure_pc.xlsx`` on
         ``'labeled_data_path'`` and left-join the daily Treasury rates on
         ``EndDate == Date``; rows containing any NaN are dropped.
      5. Fit and print an OLS of ``measure`` on the standardized ``'ours'``.
         When ``baseline`` is true, also fit and print the same regression
         using the ``'our_measure'`` column instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'file_path'`` column and have as many rows as there
        are projected embedding scores (the scores are assigned as a column).
    out_name : str
        Prefix of the three pickle files under ``./results/``.
    minus : bool, default False
        If true, the rescaled score is multiplied by -1.
    measure : str, default '10 Yr'
        Column of the merged frame used as the dependent variable, e.g.
        ``"1 Yr"``, ``"10 Yr"``, ``"3 Mo"``, ``"slope_10_1"``, ``"slope_10y_3m"``.
        Note: inside this function the name shadows the ``measure`` imported
        from ``llm_measure``.
    baseline : bool, default False
        If true, additionally fit and print the ``'our_measure'`` regression.
        That column is only required when this flag is set.

    Returns
    -------
    pandas.DataFrame
        The merged, NaN-free frame the regressions were fitted on.
    """
    artifact_prefix = './results/' + out_name
    pc = _load_pickle(artifact_prefix + '_pc.pkl')
    scaler = _load_pickle(artifact_prefix + '_scale.pkl')
    embeddings = _load_pickle(artifact_prefix + '_emd.pkl')

    # Score every embedding by its coordinate along the first component.
    direction = pc.components_[0, :]
    our_llm_measure = np.dot(embeddings, direction)

    # NOTE(review): fit_transform refits the unpickled scaler on this data, so
    # whatever parameters were stored in the pickle are discarded. If the saved
    # fit was meant to be reused, this should be `transform` -- confirm before
    # changing, since it alters the reported numbers.
    our_llm_measure = scaler.fit_transform(our_llm_measure.reshape(-1, 1)).reshape(-1)
    if minus:
        our_llm_measure = our_llm_measure * (-1)

    # Attach the score to the rows, de-duplicate, then average per document.
    regression_df = df.copy()
    regression_df['ours'] = our_llm_measure
    regression_df_sub = regression_df.drop_duplicates().reset_index(drop=True)

    df_ours = regression_df_sub.groupby('file_path')['ours'].mean().to_frame()
    df_ours['file_path'] = df_ours.index
    df_ours = df_ours.reset_index(drop=True)

    master_file_path = "./data/aggregate_measure_pc.xlsx"
    df_master = pd.read_excel(master_file_path)

    # Rebuild the join key from whatever follows '-main' in each path; a path
    # without '-main' raises IndexError here.
    df_ours['labeled_data_path'] = df_ours['file_path'].apply(lambda x: '..' + x.split('-main')[1])
    df_measure_pc = df_master.merge(df_ours, on='labeled_data_path', how='inner')
    df_measure_pc["EndDate"] = pd.to_datetime(df_measure_pc["EndDate"], format='%B/%d/%Y')

    df_treasury = pd.read_csv("./data/daily-treasury-rates.csv", usecols=["Date", "1 Yr", "10 Yr", "3 Mo"])
    df_treasury['Date'] = df_treasury['Date'].astype('datetime64[ns]')
    df_treasury['slope_10_1'] = df_treasury['10 Yr'] - df_treasury['1 Yr']
    df_treasury['slope_10y_3m'] = df_treasury['10 Yr'] - df_treasury['3 Mo']
    df_treasury = df_treasury.iloc[::-1]

    ######################## Treasury Yield Analysis ########################
    df_merge = pd.merge(df_measure_pc, df_treasury, left_on="EndDate", right_on="Date", how="left")
    # dropna() removes a row if ANY column is missing, not only the regression
    # variables or the unmatched Treasury dates.
    df_merge = df_merge.dropna()

    # Regression on the embedding-derived score.
    result = _fit_standardized_ols(df_merge, "ours", measure)
    print(result.summary())

    # Baseline regression is only fitted (and 'our_measure' only required)
    # when explicitly requested.
    if baseline:
        result = _fit_standardized_ols(df_merge, "our_measure", measure)
        print(result.summary())

    return df_merge
    
    
    
 

## Result 1 (main).

In [4]:
# Main specification; baseline=True also prints the 'our_measure' regression.
_ = ols(total_df, out_name="fomc_ori_sign", minus=False, baseline=True)


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.291
Model:                            OLS   Adj. R-squared:                  0.279
Method:                 Least Squares   F-statistic:                     24.57
Date:                Wed, 11 Sep 2024   Prob (F-statistic):           6.19e-06
Time:                        22:53:30   Log-Likelihood:                -55.937
No. Observations:                  62   AIC:                             115.9
Df Residuals:                      60   BIC:                             120.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.0542      0.077     26.677      0.0

## Result 2 (definition rephrased).

In [5]:

# NOTE(review): "repharse" presumably mirrors the artifact file names under
# ./results/ -- confirm on disk before correcting the spelling.
_ = ols(total_df, out_name="fomc_repharse_def_sign", minus=False)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.211
Model:                            OLS   Adj. R-squared:                  0.198
Method:                 Least Squares   F-statistic:                     16.08
Date:                Wed, 11 Sep 2024   Prob (F-statistic):           0.000170
Time:                        22:53:32   Log-Likelihood:                -59.215
No. Observations:                  62   AIC:                             122.4
Df Residuals:                      60   BIC:                             126.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.0542      0.081     25.303      0.0

## Result 3 (instruction rephrased).

In [6]:

# Same regression using the artifacts saved under the "fomc_rephrase_inst_sign" prefix.
_ = ols(total_df, out_name="fomc_rephrase_inst_sign", minus=False)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.172
Model:                            OLS   Adj. R-squared:                  0.159
Method:                 Least Squares   F-statistic:                     12.49
Date:                Wed, 11 Sep 2024   Prob (F-statistic):           0.000793
Time:                        22:53:33   Log-Likelihood:                -60.713
No. Observations:                  62   AIC:                             125.4
Df Residuals:                      60   BIC:                             129.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.0542      0.083     24.699      0.0

## Result 4 (probing size 128).

In [7]:

# Same regression using the artifacts saved under the "fomc_sample_sign" prefix.
_ = ols(total_df, out_name="fomc_sample_sign", minus=False)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.290
Model:                            OLS   Adj. R-squared:                  0.278
Method:                 Least Squares   F-statistic:                     24.48
Date:                Wed, 11 Sep 2024   Prob (F-statistic):           6.40e-06
Time:                        22:53:34   Log-Likelihood:                -55.970
No. Observations:                  62   AIC:                             115.9
Df Residuals:                      60   BIC:                             120.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.0542      0.077     26.662      0.0