In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import statsmodels.api as sm
import statistics
import matplotlib.pyplot as plt
import nonlincausality as nlc
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import csv
import ipynbname
import time
from numba import cuda
import tensorflow as tf


# Ignore all warnings
# (blanket filter: every warning category from every module is suppressed from here on)
warnings.filterwarnings('ignore')

# If you specifically want to ignore warnings from pandas, you can do so as follows
# NOTE: redundant while the blanket filter above is active; it only has an effect
# if that line is removed.
warnings.filterwarnings('ignore', category=UserWarning, module='pandas')


# Set options to display all columns
pd.set_option('display.max_columns', None)

# Optionally, set the display width to ensure that pandas does not wrap text
pd.set_option('display.width', None)

# Set the column display length to maximum
pd.set_option('display.max_colwidth', None)

In [None]:
# Expose no CUDA devices to this process (presumably to force CPU-only execution).
# NOTE(review): this assignment runs after `import tensorflow` / `from numba import cuda`
# in the import cell; it is only effective if the GPU runtime has not been initialised
# yet -- confirm, or move it above those imports.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [None]:
###########GPU CONFIG################
# Disabled cell: the snippet is wrapped in a string literal, so none of it executes.
# When enabled it prints the number of physical GPUs and restricts TensorFlow to the
# single GPU at index `gpu_number`.
'''
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
gpu_number = 0 #### GPU number 
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.experimental.set_visible_devices(gpus[gpu_number], 'GPU') 
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
'''
###########GPU CONFIG################

In [None]:
###########GPU CONFIG################
# Disabled cell: the snippet is wrapped in a string literal, so none of it executes.
# When enabled it prints whether TensorFlow reports an available GPU.
# NOTE(review): `tf.test.is_gpu_available` is believed to be deprecated in TF2 in favour
# of `tf.config.list_physical_devices('GPU')` -- confirm before re-enabling.
'''
import tensorflow as tf
if tf.test.is_gpu_available():
    print("GPU available: Yes")
else:
    print("GPU available: No")
'''
###########GPU CONFIG################

In [None]:
#https://www.sciencedirect.com/science/article/pii/S0169260722000542?via%3Dihub
def non_linear_granger_test(filename,data_train,data_val,data_test,lags):
    """Run `nlc.nonlincausalityNN` with this notebook's fixed network settings.

    Parameters
    ----------
    filename
        Not used inside the function; callers pass a ticker label.
    data_train, data_val, data_test
        Forwarded unchanged as `x`, `x_val` and `x_test`. Callers in this
        notebook pass two-column arrays ordered
        [Log Returns Variance, Daily Sentiment Variance].
    lags
        Forwarded unchanged as `maxlag`.

    Returns
    -------
    Whatever `nlc.nonlincausalityNN` returns. The calling code iterates over
    `.values()` of it, so it is expected to be a mapping.
    """
    # Fixed hyper-parameters of the test.
    # NOTE(review): 'd' / 'dr' presumably mean dense / dropout layers, with
    # NN_neurons giving units (100) and dropout rate (0.05) -- confirm against
    # the nonlincausality documentation.
    network_settings = dict(
        NN_config=['d','dr','d','dr'],
        NN_neurons=[100,0.05,100,0.05],
        run=5,
        epochs_num=[30,30],
        learning_rate=[0.0001, 0.00001],
        batch_size_num=32,
        reg_alpha=None,
        callbacks=None,
        verbose=False,
        plot=False,
    )
    return nlc.nonlincausalityNN(
        x=data_train,
        maxlag=lags,
        x_test=data_test,
        x_val=data_val,
        **network_settings,
    )
    

def data_transformer(data):
    """Clean one ticker's raw export and derive the series used by the causality test.

    The input frame is left untouched; a new DataFrame is returned.

    Processing steps:
      1. Strip the " - Realtime" suffix from column labels.
      2. Convert every object-dtype column except "Date" from decimal-comma
         text to float.
      3. Sort by "Date" and reset the index.
      4. Drop rows missing the close price or either sentiment count.
      5. Add the derived columns "Twitter Sentiment Count",
         "Twitter Sentiment Change", "Daily Sentiment Variance",
         "Log Returns" and "Log Returns Variance".
      6. Replace +/-inf with 0 and drop rows where "Daily Sentiment Variance"
         or "Log Returns" is NaN (this always removes the first row, whose
         log return is undefined).

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide, after suffix stripping, the columns "Date",
        "(R1) Close", "Twitter Positive Sentiment Count",
        "Twitter Negative Sentiment Count" and
        "Twitter Publication Count (L1)".

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the derived columns appended.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # `rename` returns a new frame, so the caller's DataFrame keeps its labels
    # and values (the previous version assigned to `data.columns` in place).
    data = data.rename(columns=lambda col: col.replace(" - Realtime", ""))

    for column in data.select_dtypes(include=['object']).columns:
        # Decimal comma -> decimal point, then parse; "Date" stays as it is.
        if column != "Date":
            data[column] = data[column].str.replace(",", ".", regex=False).astype(float)

    # NOTE(review): if "Date" is stored as text this is a lexical sort, which is
    # only chronological for ISO-like formats -- confirm the export format.
    data = data.sort_values(by='Date').reset_index(drop=True)

    # Rows without a price or sentiment counts cannot be used in the analysis.
    # `.copy()` makes the column assignments below act on an independent frame.
    data = data.dropna(
        subset=["(R1) Close", "Twitter Positive Sentiment Count", "Twitter Negative Sentiment Count"]
    ).copy()

    data["Twitter Sentiment Count"] = (
        data["Twitter Positive Sentiment Count"] - data["Twitter Negative Sentiment Count"]
    )
    # Sentiment measure: share of positive tweets among all publications.
    data['Twitter Sentiment Change'] = (
        data['Twitter Positive Sentiment Count'] / data['Twitter Publication Count (L1)']
    )
    data["Daily Sentiment Variance"] = data['Twitter Sentiment Change']

    data['Log Returns'] = np.log(data['(R1) Close'] / data['(R1) Close'].shift(1))
    data['Log Returns Variance'] = data['Log Returns'] ** 2

    # Division by zero and log(0) yield +inf and -inf respectively; neutralise
    # both signs (previously only +inf was replaced, leaving -inf log returns).
    data = data.replace([np.inf, -np.inf], 0)
    data = data.dropna(subset=["Daily Sentiment Variance", "Log Returns"])

    return data
    
def create_list_up_to_number(number):
    """Return the integers 1, 2, ..., `number` as a list (empty when `number` < 1)."""
    return list(range(1, number + 1))

def calculate_mse_for_lists(residuals):
    """Return one mean squared error per residual series.

    `residuals` is an iterable of residual series; each series is passed to
    `calculate_mse_from_residuals` and the results are collected in order.
    """
    return [calculate_mse_from_residuals(series) for series in residuals]
    
def calculate_mse_from_residuals(residuals):
    """Return the mean of the squared residuals.

    Raises ZeroDivisionError when `residuals` is empty.
    """
    squared_total = sum(value ** 2 for value in residuals)
    return squared_total / len(residuals)
    
def calculate_mae_for_lists(residuals):
    """Return one mean absolute error per residual series.

    `residuals` is an iterable of residual series; each series is passed to
    `calculate_mae_from_residuals` and the results are collected in order.
    """
    return [calculate_mae_from_residuals(series) for series in residuals]
    
def calculate_mae_from_residuals(residuals):
    """Return the mean of the absolute residuals.

    Raises ZeroDivisionError when `residuals` is empty.
    """
    absolute_total = sum(abs(value) for value in residuals)
    return absolute_total / len(residuals)
    
def calculate_rss_for_lists(residuals):
    """Return the residual sum of squares (RSS) of each residual series.

    The previous implementation summed the raw (signed) residuals, so positive
    and negative errors cancelled and the value was not an RSS, although the
    function name and the `*_RSS_*` result columns say it is. Each residual is
    now squared before summing, consistent with `calculate_mse_from_residuals`
    (RSS == MSE * number of residuals).

    Parameters
    ----------
    residuals
        Iterable of residual series (each an iterable of numbers).

    Returns
    -------
    list
        One RSS value per series, in input order; an empty series yields 0.
    """
    return [sum(value ** 2 for value in series) for series in residuals]
    
def calculate_total_residuals(residuals):
    """Return the plain (signed) sum of the residuals; values are not squared."""
    return sum(residuals)
    
def write_to_csv_last_line(csv_file_path, new_data):
    """Append `new_data` as one row at the end of the CSV file at `csv_file_path`.

    The file is opened in append mode (created if missing) with `newline=''`
    so the csv module controls the line terminator. Returns None.
    """
    with open(csv_file_path, 'a', newline='') as handle:
        csv.writer(handle).writerow(new_data)
    
def extend_lists(*lists):
    """Concatenate the items of all given iterables, in order, into one new list."""
    return [item for sequence in lists for item in sequence]

In [None]:
sp500_data = pd.read_excel("S&P500.xlsx")
# Tickers are later used to build file names, so "/" is replaced with "_".
sp500 = sp500_data["Ticker"].str.replace("/", "_")

# Modification_1
# The main parameters for running the program are set here.

# Lag parameter of the nonlinear test
n_lag = 10
lags = create_list_up_to_number(n_lag)

# The significance level above which Granger causality is accepted
threshold = 0.05

# sp500 = sp500.head(5)
data_folder = "Twitter_Daily_5Y"

# Parameters for the nonlinear causality test
# fraction of the full data set held out as test data
test_treshold = 0.3
# fraction of the training data used as validation data
val_treshold = 0.2


In [None]:
# Disabled cell: the snippet is wrapped in a string literal, so none of it executes.
# It is a single-ticker trial run of the nonlinear causality test.
# NOTE(review): the snippet loads 'AAPL UW Equity.csv' but passes the label 'TSLA'
# to non_linear_granger_test -- reconcile before re-enabling.
'''
%%time
## nem lineáris kauzalitás tesztelése részvényen
cwd = Path.cwd()
file_path = cwd / data_folder / 'AAPL UW Equity.csv'
data = pd.read_csv(file_path)
data = data_transformer(data)

scaler = StandardScaler()
data=scaler.fit_transform(data[['Log Returns Variance', 'Daily Sentiment Variance']])
data = pd.DataFrame(data, columns=['Log Returns Variance', 'Daily Sentiment Variance'])

lags_test = lags
train_val_df, test_df = train_test_split(data, test_size=test_treshold,shuffle=False) 
train_df, val_df = train_test_split(train_val_df, test_size=val_treshold,shuffle=False)

result=non_linear_granger_test('TSLA',train_df[['Log Returns Variance', 'Daily Sentiment Variance']].values,val_df[['Log Returns Variance', 'Daily Sentiment Variance']].values,test_df[['Log Returns Variance', 'Daily Sentiment Variance']].values,lags_test)
#result1=non_linear_granger_test('TSLA',train_df[[ 'Daily Sentiment Variance','Log Returns Variance']].values,val_df[[ 'Daily Sentiment Variance','Log Returns Variance']].values,test_df[[ 'Daily Sentiment Variance','Log Returns Variance']].values,lags_test)
'''

In [None]:
#result

In [None]:
# Disabled cell: the snippet is wrapped in a string literal, so none of it executes.
# When enabled it (re)creates "<notebook name>_Sentiment.csv" containing only a header
# row; the file is opened in 'w' mode, so any existing results file is truncated.
# Header layout: "Ticker", then n_lag columns for each of Test_statistics, p_value and
# the MSE / MAE / RSS of X_in_sample, X_prediction, XY_in_sample, XY_prediction.
'''
nb_fname = ipynbname.name()
csv_name_1=nb_fname+"_Sentiment.csv"
#csv_name_2=nb_fname+"_Stock.csv"

#create csv with header
header=[]
header.append("Ticker")
for i in range(1, n_lag+1):
    header.append(f"Test_statistics_{i}")
for i in range(1, n_lag+1):
    header.append(f"p_value_{i}")    
for i in range(1, n_lag+1):
    header.append(f"X_in_sample_MSE_{i}")
for i in range(1, n_lag+1):
    header.append(f"X_in_sample_MAE_{i}")
for i in range(1, n_lag+1):
    header.append(f"X_in_sample_RSS_{i}")
for i in range(1, n_lag+1):
    header.append(f"X_prediction_MSE_{i}")
for i in range(1, n_lag+1):
    header.append(f"X_prediction_MAE_{i}") 
for i in range(1, n_lag+1):
    header.append(f"X_prediction_RSS_{i}")
for i in range(1, n_lag+1):
    header.append(f"XY_in_sample_MSE_{i}")
for i in range(1, n_lag+1):
    header.append(f"XY_in_sample_MAE_{i}")
for i in range(1, n_lag+1):
    header.append(f"XY_in_sample_RSS_{i}")
for i in range(1, n_lag+1):
    header.append(f"XY_prediction_MSE_{i}")
for i in range(1, n_lag+1):
    header.append(f"XY_prediction_MAE_{i}") 
for i in range(1, n_lag+1):
    header.append(f"XY_prediction_RSS_{i}")
#header


csv_file = open(csv_name_1, 'w', newline='')
writer = csv.writer(csv_file)

#csv_file1 = open(csv_name_2, 'w', newline='')
#writer1 = csv.writer(csv_file1)

writer.writerow(header)
#writer1.writerow(header)

csv_file.close()
#csv_file1.close()
'''

In [None]:
%%capture --no-display
# Analyze the data and get results from Non Linear Granger
cwd = Path.cwd()

# File path to write the CSV data (named after the running notebook)
nb_fname = ipynbname.name()
csv_name_1=nb_fname+"_Sentiment.csv"
#csv_name_2=nb_fname+"_Stock.csv"

# Column order handed to the causality test.
# NOTE(review): nonlincausality presumably tests whether the SECOND column helps
# to predict the FIRST -- confirm against the library documentation. To run the
# reverse direction, call `_granger_row` with this list reversed and append the
# row to a second CSV (csv_name_2).
granger_columns = ['Log Returns Variance', 'Daily Sentiment Variance']


def _build_result_header(n_lags):
    """Return the CSV header matching the layout of the rows built by `_granger_row`."""
    def per_lag(prefix):
        return [f"{prefix}_{i}" for i in range(1, n_lags + 1)]

    header = ["Ticker"] + per_lag("Test_statistics") + per_lag("p_value")
    for group in ("X_in_sample", "X_prediction", "XY_in_sample", "XY_prediction"):
        for metric in ("MSE", "MAE", "RSS"):
            header += per_lag(f"{group}_{metric}")
    return header


def _granger_row(file_path, short_ticker, columns, lag_list):
    """Load one ticker file, run the nonlinear Granger test and flatten it into a CSV row.

    All intermediates are locals, so they are released when the function
    returns; no manual `del` bookkeeping is needed in the loop.
    """
    data = pd.read_csv(file_path)
    print(f"{short_ticker} loaded successfully.")
    data = data_transformer(data)

    # NOTE(review): the scaler is fitted on the full series before the split, so
    # test-period statistics influence the training data -- confirm this is intended.
    scaled = StandardScaler().fit_transform(data[columns])
    scaled_df = pd.DataFrame(scaled, columns=columns)

    # Chronological split (no shuffling): train | validation | test.
    train_val_df, test_df = train_test_split(scaled_df, test_size=test_treshold, shuffle=False)
    train_df, val_df = train_test_split(train_val_df, test_size=val_treshold, shuffle=False)

    # The test returns a dictionary; only its values are used, one entry per lag.
    result = non_linear_granger_test(
        short_ticker,
        train_df[columns].values,
        val_df[columns].values,
        test_df[columns].values,
        lag_list,
    )
    per_lag_results = list(result.values())

    row = [short_ticker]
    row.extend(entry[0] for entry in per_lag_results)  # test statistics
    row.extend(entry[1] for entry in per_lag_results)  # p-values
    # Positions 2..5 hold the residual series for X in-sample, X prediction,
    # XY in-sample and XY prediction, in that order.
    for position in (2, 3, 4, 5):
        residual_series = [entry[position] for entry in per_lag_results]
        row.extend(calculate_mse_for_lists(residual_series))
        row.extend(calculate_mae_for_lists(residual_series))
        row.extend(calculate_rss_for_lists(residual_series))
    return row


# Rows are appended, so an interrupted run can be resumed. Write the header only
# when the results file is missing or empty, so a fresh run yields a labelled CSV.
if not os.path.isfile(csv_name_1) or os.path.getsize(csv_name_1) == 0:
    write_to_csv_last_line(csv_name_1, _build_result_header(len(lags)))

for filename_raw in sp500:
    filename = filename_raw + ".csv"
    file_path = cwd / data_folder / filename

    if not os.path.isfile(file_path):
        print(f"File '{filename}' does not exist in the {data_folder} folder.")
        continue

    short_ticker = filename_raw.split(" ")[0]

    try:
        row = _granger_row(file_path, short_ticker, granger_columns, lags)
    except Exception as e:
        # Best effort: record the failure for this ticker and carry on with the
        # next one. Loading and transforming are covered too, so one malformed
        # file no longer aborts the whole run.
        row = [short_ticker, e]
    finally:
        # Release Keras/TensorFlow state after every ticker, including failed ones.
        tf.keras.backend.clear_session()

    try:
        write_to_csv_last_line(csv_name_1, row)
    except Exception as e:
        print(e)
    #write_to_csv_last_line(csv_name_2,row1)