## Imports.

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'
import re
import json
import rdata
import torch
import random
import pickle
import datasets
import numpy as np
import transformers
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from scipy import stats
from typing import Optional
import statsmodels.api as sm
from numpy.linalg import norm
from llm_measure import measure
import matplotlib.pyplot as plt
from scipy.stats import wilcoxon
from  datasets import load_dataset
from readability import Readability
from sklearn.decomposition import PCA
from nltk.tokenize import sent_tokenize
from scipy.stats.mstats import winsorize
from dataclasses import dataclass, field
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, HfArgumentParser, TrainingArguments
import warnings
warnings.filterwarnings("ignore")

## Review data.

In [2]:
# Load the matched table; the first CSV column becomes the DataFrame index.
total_df = pd.read_csv('./data/index_matching_fiveyears.csv', index_col=0)

# Two independent lists holding the same 'qa_text' column.
all_texts = total_df['qa_text'].tolist()
probing_texts = total_df['qa_text'].tolist()

# Row positions that ols() reads through this module-level name.
# (A `global` statement is not needed at module scope, so none is used here.)
chosen_list = np.load('./data/chosen_list.npy').tolist()

## OLS.

In [3]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # project produced itself.
    with open(path, 'rb') as file:
        return pickle.load(file)


def _rolling_ticker_mean(yearly, end_year, window=3):
    """Average ``ours`` per ticker over the ``window`` years ending at ``end_year``.

    Returns a frame with columns ``ours``, ``year`` (set to ``end_year``) and
    ``tic``, one row per ticker that has data inside the window.
    """
    years = list(range(end_year - window + 1, end_year + 1))
    in_window = yearly[yearly['year'].isin(years)]
    averaged = in_window.groupby('tic')['ours'].mean().reset_index()
    averaged['year'] = end_year
    return averaged[['ours', 'year', 'tic']]


def _fit_rd_regression(frame, predictor):
    """Fit OLS of ``rd/asset`` on ``predictor`` and ``size`` plus an intercept."""
    design = sm.add_constant(frame[[predictor, 'size']])
    return sm.OLS(frame['rd/asset'], design).fit()


def ols(df, out_name, minus = False, baseline = False):
    """Regress R&D intensity on the embedding-based measure and print the fit.

    The measure is the projection of the stored embeddings onto the first
    stored component. It is averaged per (ticker, year), then averaged again
    over three-year windows ending in 2018, 2019 and 2020, and joined to the
    gvkey mapping, the culture labels and the R&D data before the regression.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``ticker`` and ``year`` columns and as many rows as there are
        stored embeddings (rows are matched by position).
    out_name : str
        Prefix of the artefacts read from ``./results/``:
        ``<out_name>_pc.pkl``, ``<out_name>_scale.pkl``, ``<out_name>_emd.pkl``.
    minus : bool, default False
        If True, flip the sign of the measure.
    baseline : bool, default False
        If True, additionally fit and print the regression that uses
        ``s_innovation`` in place of the measure.

    Returns
    -------
    pandas.DataFrame
        The merged regression sample, with ``ours`` and ``s_innovation``
        standardised.

    Raises
    ------
    ValueError
        If the number of measure values differs from ``len(df)``.

    Notes
    -----
    Reads the module-level ``chosen_list`` (row positions into ``df``) and the
    CSV files ``culture_label.csv``, ``gvkey_ticker.csv`` and ``RD_all.csv``
    under ``./data/``.
    """
    pc = _load_pickle('./results/' + out_name + '_pc.pkl')
    scaler = _load_pickle('./results/' + out_name + '_scale.pkl')
    embeddings = _load_pickle('./results/' + out_name + '_emd.pkl')

    # Project every embedding onto the first component.
    direction = pc.components_[0, :]
    our_llm_measure = np.dot(embeddings, direction)
    # NOTE(review): fit_transform refits the loaded scaler on these values, so
    # whatever fit was pickled is discarded - confirm this is intended.
    our_llm_measure = scaler.fit_transform(our_llm_measure.reshape(-1, 1)).reshape(-1)

    if minus:
        our_llm_measure = -our_llm_measure

    if len(our_llm_measure) != len(df):
        raise ValueError(
            'embeddings for %r yield %d values but df has %d rows'
            % (out_name, len(our_llm_measure), len(df))
        )

    regression_df = df.copy()
    regression_df['ours'] = our_llm_measure

    # Keep only the pre-selected rows, then drop exact duplicate rows.
    regression_df = regression_df.iloc[chosen_list, :].reset_index(drop=True)
    regression_df_sub = regression_df.drop_duplicates().reset_index(drop=True)

    # Mean measure per (ticker, year).
    yearly = regression_df_sub.groupby(['ticker', 'year'])['ours'].mean().reset_index()
    yearly['tic'] = yearly['ticker']
    yearly['year'] = yearly['year'].apply(int)
    yearly = yearly[['ours', 'tic', 'year']]

    # Three-year averages, each labelled with the last year of its window.
    results_innovation = pd.concat(
        [_rolling_ticker_mean(yearly, end_year) for end_year in (2018, 2019, 2020)]
    ).reset_index(drop=True)

    culture_label = pd.read_csv('./data/culture_label.csv')
    culture_label['gvkey'] = culture_label['GVKEY']
    culture_label['year'] = culture_label['Year']  # no lag applied

    gvkey_tic = pd.read_csv('./data/gvkey_ticker.csv')
    gvkey_tic = gvkey_tic[['gvkey', 'tic', 'fyear']].drop_duplicates().reset_index(drop=True)
    gvkey_tic['year'] = gvkey_tic['fyear']

    # ticker -> gvkey, then attach the culture labels.
    results_innovation = results_innovation.merge(gvkey_tic, on=['tic', 'year'], how='inner')
    results = results_innovation.merge(culture_label, on=['gvkey', 'year'], how='inner')

    RD = pd.read_csv('./data/RD_all.csv')
    RD = RD.dropna(subset=['gvkey', 'fyear', 'at']).reset_index(drop=True)
    RD['year'] = RD['fyear']
    RD['size'] = np.log(RD['at'])

    # NOTE(review): dropna() without a subset removes rows with a missing value
    # in ANY column of RD_all.csv, not only 'xrd' - confirm this is intended.
    RD1 = RD.dropna().reset_index(drop=True)
    RD1['rd/asset'] = RD1['xrd'] / RD1['at']
    RD1 = RD1[['gvkey', 'rd/asset', 'year', 'size']]
    results_RD = results.merge(RD1, on=['year', 'gvkey'], how='inner')

    # Standardise both candidate predictors, each with its own scaler.
    for column in ('ours', 's_innovation'):
        values = np.array(results_RD[column]).reshape(-1, 1)
        results_RD[column] = StandardScaler().fit_transform(values).reshape(-1)

    # Main specification: rd/asset ~ const + ours + size.
    print(_fit_rd_regression(results_RD, 'ours').summary())

    # Baseline specification is fitted only when its summary is requested.
    if baseline:
        print(_fit_rd_regression(results_RD, 's_innovation').summary())

    return results_RD

## Result 1 (main).

In [4]:
# Main run: baseline=True also prints the s_innovation regression.
result_RD = ols(
    df=total_df,
    out_name="innovation_ori_sign",
    minus=False,
    baseline=True,
)


                            OLS Regression Results                            
Dep. Variable:               rd/asset   R-squared:                       0.140
Model:                            OLS   Adj. R-squared:                  0.140
Method:                 Least Squares   F-statistic:                     426.9
Date:                Thu, 12 Sep 2024   Prob (F-statistic):          1.81e-172
Time:                        00:50:51   Log-Likelihood:                 422.50
No. Observations:                5237   AIC:                            -839.0
Df Residuals:                    5234   BIC:                            -819.3
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3780      0.011     35.263      0.0

## Result 2 (definition rephrased).

In [5]:

# Rephrased-definition artefacts; the returned frame is discarded.
_ = ols(
    df=total_df,
    out_name="innovation_repharse_def_sign",
    minus=False,
)

                            OLS Regression Results                            
Dep. Variable:               rd/asset   R-squared:                       0.139
Model:                            OLS   Adj. R-squared:                  0.138
Method:                 Least Squares   F-statistic:                     421.1
Date:                Thu, 12 Sep 2024   Prob (F-statistic):          2.54e-170
Time:                        00:50:55   Log-Likelihood:                 417.55
No. Observations:                5237   AIC:                            -829.1
Df Residuals:                    5234   BIC:                            -809.4
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3798      0.011     35.387      0.0

## Result 3 (instruction rephrased).

In [6]:

# Rephrased-instruction artefacts; minus=True flips the sign of the measure.
_ = ols(
    df=total_df,
    out_name="innovation_repharse_inst_sign",
    minus=True,
)

                            OLS Regression Results                            
Dep. Variable:               rd/asset   R-squared:                       0.131
Model:                            OLS   Adj. R-squared:                  0.131
Method:                 Least Squares   F-statistic:                     395.1
Date:                Thu, 12 Sep 2024   Prob (F-statistic):          1.59e-160
Time:                        00:50:59   Log-Likelihood:                 394.98
No. Observations:                5237   AIC:                            -784.0
Df Residuals:                    5234   BIC:                            -764.3
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3839      0.011     35.494      0.0

## Result 4 (probing size 128).

In [7]:

# Probing-sample artefacts; minus=True flips the sign of the measure.
_ = ols(
    df=total_df,
    out_name="innovation_sample_sign",
    minus=True,
)

                            OLS Regression Results                            
Dep. Variable:               rd/asset   R-squared:                       0.145
Model:                            OLS   Adj. R-squared:                  0.144
Method:                 Least Squares   F-statistic:                     443.1
Date:                Thu, 12 Sep 2024   Prob (F-statistic):          1.66e-178
Time:                        00:51:02   Log-Likelihood:                 436.41
No. Observations:                5237   AIC:                            -866.8
Df Residuals:                    5234   BIC:                            -847.1
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3772      0.011     35.283      0.0