In [1]:
import datetime
import matplotlib.pyplot as plt
import yfinance as yf
import pandas as pd
import numpy as np
from scipy import stats

# I. Load data

In [28]:
# Load the two pre-computed inputs: the OpinionFinder results (OF) and the DJIA series.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary code;
# only load pickles produced by a trusted source.
OF = pd.read_pickle("OF_res.pkl")
DJIA = pd.read_pickle("DJIA_filled_missing_val.pkl")

In [29]:
# Preview the first rows of the DJIA frame to check its columns.
DJIA.head()

Unnamed: 0,Dates,Close
0,2022-04-20,35160.789062
1,2022-04-21,34792.761719
2,2022-04-22,33811.398438
3,2022-04-23,33930.429688
4,2022-04-24,33989.945312


In [17]:
# Preview the first rows of the OpinionFinder frame to check its columns and index.
OF.head()

Unnamed: 0_level_0,tweets_processed,tweet_size,positive_point,negative_point,ratio,norm_ratio
date_processed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,"[customized, operator, training, helping, team...",142,34.5,1.0,34.5,4.081169
2022-05-02,"[power, higher, education, one, network, time,...",413,72.0,15.0,4.8,-0.462093
2022-05-03,"[ambulances, make, india, bharti, airtel, hosp...",425,86.0,17.5,4.914286,-0.444611
2022-05-04,"[cisco, calculates, emissions, products, math,...",438,100.0,15.0,6.666667,-0.176546
2022-05-05,"[employees, said, able, work, remotely, made, ...",474,79.0,15.0,5.266667,-0.390707


# II. Prepare dataset

Input: 
1. The DJIA closing values of the past 3 days
2. The OpinionFinder mood time series (positive/negative ratio) for the same past 3 days

Time period (in thesis): 
> Train: 2008-2-28 ~ 2008-11-28 (10 months)

> Test: 2008-12-1 ~ 2008-12-19 (19 days)

Time period (in this project): 
> Train: 2022-5-4 ~ 2022-6-10 (38 days)

> Test: 2022-6-13 ~ 2022-6-30 (18 days)

In [70]:
# Build the DJIA half of the model input: the target (`Close`) plus its value
# 1, 2 and 3 rows earlier.
DJIA_prim = DJIA.copy()

# Lag on the full series *before* any date filtering, so the first rows of the
# train/test windows still have valid "days before" values instead of NaN.
for lag in range(1, 4):
    DJIA_prim[f'DJIA_{lag}_days_before'] = DJIA_prim['Close'].shift(lag)

# All date literals are zero-padded ISO (YYYY-MM-DD).  Unpadded forms such as
# "2022-5-4" only filter correctly when `Dates` is datetime-typed; on a string
# column they compare lexicographically and silently select the wrong rows.
DJIA_prim = DJIA_prim.query(' "2022-04-30" < Dates < "2022-07-01"').reset_index(drop=True)

# Train / test windows (inclusive on both ends), re-indexed from 0.
Train_DJIA = DJIA_prim.query(' "2022-05-04" <= Dates <= "2022-06-10"').reset_index(drop=True)
Test_DJIA = DJIA_prim.query(' "2022-06-13" <= Dates <= "2022-06-30"').reset_index(drop=True)

In [81]:
# Build the sentiment half of the model input: the OpinionFinder `ratio`
# 1, 2 and 3 rows earlier.  Only `ratio` is needed to derive the lags.
OF_prim = OF[['ratio']].copy()
for lag in range(1, 4):
    OF_prim[f'OF_{lag}_days_before'] = OF_prim['ratio'].shift(lag)

# Keep only the lagged columns and expose the date index as a `Dates` column
# so the frame can be filtered by date.
OF_prim = (
    OF_prim
    .drop(columns='ratio')
    .reset_index()
    .rename(columns={'date_processed': 'Dates'})
)

# Train / test windows (inclusive on both ends); `Dates` is dropped afterwards,
# leaving just the three lag features, re-indexed from 0.
Train_OF = (
    OF_prim
    .query(' "2022-05-04" <= Dates <= "2022-06-10"')
    .drop(columns='Dates')
    .reset_index(drop=True)
)
Test_OF = (
    OF_prim
    .query(' "2022-06-13" <= Dates <= "2022-06-30"')
    .drop(columns='Dates')
    .reset_index(drop=True)
)

In [82]:
# Join the DJIA and sentiment halves side by side.  The OF frames carry no date
# column at this point, so the join is purely positional (row k with row k).
# concat(axis=1) would pad a row-count mismatch with NaN without complaint,
# so fail loudly instead.
assert len(Train_DJIA) == len(Train_OF), "Train: DJIA and OF row counts differ"
assert len(Test_DJIA) == len(Test_OF), "Test: DJIA and OF row counts differ"

Train = pd.concat([Train_DJIA, Train_OF], axis=1)
Test = pd.concat([Test_DJIA, Test_OF], axis=1)

In [83]:
# Show a bounded preview of the training frame rather than dumping every row.
Train.head(10)

Unnamed: 0,Dates,Close,DJIA_1_days_before,DJIA_2_days_before,DJIA_3_days_before,OF_1_days_before,OF_2_days_before,OF_3_days_before
0,2022-05-04,34061.058594,33128.789062,33061.5,33040.427734,4.914286,4.8,34.5
1,2022-05-05,32997.96875,34061.058594,33128.789062,33061.5,6.666667,4.914286,4.8
2,2022-05-06,32899.371094,32997.96875,34061.058594,33128.789062,5.266667,6.666667,4.914286
3,2022-05-07,32572.535156,32899.371094,32997.96875,34061.058594,5.542857,5.266667,6.666667
4,2022-05-08,32409.117188,32572.535156,32899.371094,32997.96875,4.6,5.542857,5.266667
5,2022-05-09,32245.699219,32409.117188,32572.535156,32899.371094,24.0,4.6,5.542857
6,2022-05-10,32160.740234,32245.699219,32409.117188,32572.535156,4.2,24.0,4.6
7,2022-05-11,31834.109375,32160.740234,32245.699219,32409.117188,5.016667,4.2,24.0
8,2022-05-12,31730.300781,31834.109375,32160.740234,32245.699219,7.4,5.016667,4.2
9,2022-05-13,32196.660156,31730.300781,31834.109375,32160.740234,3.95,7.4,5.016667


In [84]:
# Show a bounded preview of the test frame rather than dumping every row.
Test.head(10)

Unnamed: 0,Dates,Close,DJIA_1_days_before,DJIA_2_days_before,DJIA_3_days_before,OF_1_days_before,OF_2_days_before,OF_3_days_before
0,2022-06-13,30516.740234,30735.752441,30954.764648,31392.789062,3.65,15.6,3.892308
1,2022-06-14,30364.830078,30516.740234,30735.752441,30954.764648,3.021053,3.65,15.6
2,2022-06-15,30668.529297,30364.830078,30516.740234,30735.752441,2.568421,3.021053,3.65
3,2022-06-16,29927.070312,30668.529297,30364.830078,30516.740234,3.508333,2.568421,3.021053
4,2022-06-17,29888.779297,29927.070312,30668.529297,30364.830078,13.12,3.508333,2.568421
5,2022-06-18,30209.514648,29888.779297,29927.070312,30668.529297,7.425,13.12,3.508333
6,2022-06-19,30369.882324,30209.514648,29888.779297,29927.070312,15.8,7.425,13.12
7,2022-06-20,30450.066162,30369.882324,30209.514648,29888.779297,3.85,15.8,7.425
8,2022-06-21,30530.25,30450.066162,30369.882324,30209.514648,3.4,3.85,15.8
9,2022-06-22,30483.130859,30530.25,30450.066162,30369.882324,2.2,3.4,3.85


# III. Build SOFNN model to predict DJIA

# IV. Compute MAPE & Direction correct rate