In [1]:
import pandas as pd
import yfinance as yf
import datetime
import json
import numpy as np
import os
import torch

In [2]:
stock_data = os.listdir('../data/dataset/stock_data')
stock_data

['raw_AAL.csv',
 'raw_AMC.csv',
 'raw_BB.csv',
 'raw_BBBY.csv',
 'raw_GME.csv',
 'raw_PLTR.csv']

In [10]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
with open('../data/models/wordlist/word_list_extended.txt','r') as f:
    lmdict = eval(f.read())

special_words = {
    'crushes': 10,
    'beats': 5,
    'misses': -5,
    'trouble': -10,
    'falls': -10,
    'exploded' : 4,
    'able':2,
}

for i in lmdict["Negative"]:
  special_words[i]=-5

for i in lmdict["Positive"]:
  special_words[i]=5

vader = SentimentIntensityAnalyzer()
vader.lexicon.update(special_words)

In [71]:
from pathlib import Path
from transformers import AutoModelForSequenceClassification, BertTokenizer
import os
import torch.nn.functional as F
import math

In [4]:
lm_path = 'ProsusAI/finbert'
MAX_LEN = 512
bertmodel = AutoModelForSequenceClassification.from_pretrained(lm_path,cache_dir=None, num_labels=3)
tokenizer = BertTokenizer.from_pretrained(lm_path)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [63]:
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def FinBert_predict(text):
    encoded_text = tokenizer.encode_plus(
  text,
  max_length=MAX_LEN,
  add_special_tokens=True,
  return_token_type_ids=False,
  padding=False,
  return_attention_mask=True,
  return_tensors='pt',
)
    input_ids = encoded_text['input_ids'].to(device)
    attention_mask = encoded_text['attention_mask'].to(device)
    logits = bertmodel(input_ids, attention_mask).logits
    probabilities = F.softmax(logits, dim=-1)
    return probabilities.detach().numpy()


In [7]:
raw_df = pd.read_csv(f'../data/dataset/stock_data/raw_PLTR.csv')

In [43]:
i = FinBert_predict(raw_df['content'][10])
i

array([[ 0.90369976, -1.9084171 ,  0.8328257 ]], dtype=float32)

In [None]:
content_text 

In [41]:
contents_logits = [ FinBert_predict(content_text).tolist()[0] for content_text in raw_df['content'] ]

array([[ 0.90369976, -1.9084171 ,  0.8328257 ]], dtype=float32)

In [68]:
contents_logits[:3]

[[0.04011860489845276, 0.03856608644127846, 0.9213153123855591],
 [0.4383324086666107, 0.0328383669257164, 0.5288292169570923],
 [0.44296303391456604, 0.012332156300544739, 0.5447048544883728]]

In [69]:
df_logits = pd.DataFrame(contents_logits, columns=['neg', 'neu', 'pos'])

In [73]:
df_logits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172 entries, 0 to 171
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   neg     172 non-null    float64
 1   neu     172 non-null    float64
 2   pos     172 non-null    float64
dtypes: float64(3)
memory usage: 4.2 KB


In [70]:
df_logits.describe()

Unnamed: 0,neg,neu,pos
count,172.0,172.0,172.0
mean,0.196786,0.184262,0.618952
std,0.277631,0.276012,0.337275
min,0.00948,0.006693,0.011943
25%,0.030732,0.032978,0.289182
50%,0.050187,0.047898,0.79287
75%,0.225097,0.172409,0.9094
max,0.954582,0.974896,0.956025


In [77]:
list_sentiment = [(N/(N+Ne+P), (P-N)/(P+N), math.log((1+P)/(1+N))) for N,Ne,P in df_logits.values]
df_sentiment = pd.DataFrame(list_sentiment, columns=['neg1','pos1','pos2'])

In [79]:
df_sentiment.describe()

Unnamed: 0,neg1,pos1,pos2
count,172.0,172.0,172.0
mean,0.196786,0.509824,0.299629
std,0.277631,0.591573,0.38525
min,0.00948,-0.95504,-0.646454
25%,0.030731,0.29878,0.072303
50%,0.050187,0.831359,0.457036
75%,0.225097,0.917926,0.607776
max,0.954582,0.956544,0.648025


In [64]:
i.logits.detach().numpy()

In [None]:
#NEW

df_list = set()
for name in stock_data:
    raw_df = pd.read_csv(f'../data/dataset/stock_data/{name}')
    print(name,"--------------------------------------")
    print(raw_df.info())
    raw_df['Date'] = raw_df['Date'].apply(lambda x :datetime.datetime.strptime(x[:-8],r"%Y-%m-%dT%H:%M"))
    raw_df.set_index('Date',inplace=True)
    raw_df.index = raw_df.index.ceil('H')
    a = raw_df.index.value_counts()
    print(a[a.values>1])

    for col in raw_df.columns:
        raw_df[col] = raw_df[col] + ' '
    raw_df = raw_df.groupby(level=0).sum()

    #applying NLP model!!!!
    raw_df = pd.concat([raw_df,])

    df_price = yf.download(tickers='PLTR', period='6mo', interval="1h")
    df_price.index = df_price.index.tz_convert(None)

    df = pd.concat([raw_df.drop(['topic','content'],axis=1),df_price], axis=1)
    df.fillna(0,inplace=True)
    
    #should subject to each range of data
    df_slim = df[df.index>datetime.datetime(2021,3,4,10)]
    print(list(map(lambda x : df[x].mask(df[x] != 0, 1).value_counts(), ['topic_comp','content_comp'])))
    df_list.append(raw_df)

In [12]:
#OLD
df_list = set()
for name in stock_data:
    raw_df = pd.read_csv(f'../data/dataset/stock_data/{name}')
    print(name,"--------------------------------------")
    print(raw_df.info())
    raw_df['Date'] = raw_df['Date'].apply(lambda x :datetime.datetime.strptime(x[:-8],r"%Y-%m-%dT%H:%M"))
    raw_df.set_index('Date',inplace=True)
    raw_df.index = raw_df.index.ceil('H')
    a = raw_df.index.value_counts()
    print(a[a.values>1])

    for col in raw_df.columns:
        raw_df[col] = raw_df[col] + ' '
    raw_df = raw_df.groupby(level=0).sum()

    #applying NLP model!!!!
    raw_df['topic_comp'] = raw_df['topic'].apply(lambda x: vader.polarity_scores(x)['compound'])
    raw_df['content_comp'] = raw_df['content'].apply(lambda x: vader.polarity_scores(x)['compound'])
    raw_df['topic_logpos'] = raw_df['topic'].apply(log_pos)
    raw_df['content_logpos'] = raw_df['content'].apply(log_pos)

    df_price = yf.download(tickers='PLTR', period='6mo', interval="1h")
    df_price.index = df_price.index.tz_convert(None)

    df = pd.concat([raw_df.drop(['topic','content'],axis=1),df_price], axis=1)
    df.fillna(0,inplace=True)
    
    #should subject to each range of data
    df_slim = df[df.index>datetime.datetime(2021,3,4,10)]
    print(list(map(lambda x : df[x].mask(df[x] != 0, 1).value_counts(), ['topic_comp','content_comp'])))
    df_list.append(raw_df)

raw_AAL.csv --------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Date     175 non-null    object
 1   topic    175 non-null    object
 2   content  175 non-null    object
 3   url      175 non-null    object
dtypes: object(4)
memory usage: 5.6+ KB
None
2021-04-22 15:00:00    5
2021-04-22 12:00:00    4
2021-05-10 16:00:00    2
2021-05-09 17:00:00    2
2021-03-30 17:00:00    2
2021-05-12 21:00:00    2
2021-04-22 09:00:00    2
2021-03-30 21:00:00    2
2021-04-22 14:00:00    2
2021-06-07 22:00:00    2
2021-04-13 13:00:00    2
2021-06-03 16:00:00    2
2021-03-29 21:00:00    2
2021-04-01 22:00:00    2
2021-05-11 02:00:00    2
2021-04-28 17:00:00    2
2021-04-22 13:00:00    2
2021-05-07 16:00:00    2
2021-04-14 13:00:00    2
Name: Date, dtype: int64
[*********************100%***********************]  1 of 1 completed
[0.0    9

TypeError: 'DataFrame' objects are mutable, thus they cannot be hashed