In [13]:
import pandas as pd
import yfinance as yf
import datetime
import json
import numpy as np

In [4]:
PLTR_raw = pd.read_csv('raw_PLTR.csv')
PLTR_raw.head()

Unnamed: 0,Date,topic,content
0,2021-06-05T11:30:00.000Z,3 Meme Stocks That Could Make You Rich,https://www.fool.com/investing/2021/06/05/3-me...
1,2021-06-04T20:05:19.000Z,Activists Launch Campaign Against Palantir's N...,"A self-touted tech-justice non-profit, Foxglov..."
2,2021-06-04T18:11:43.000Z,Palantir Backs Multiple SPACs Endeavors in Dig...,Data analytics software developer Palantir Tec...
3,2021-06-04T15:03:06.000Z,UK healthcare app built by Iranian refugee set...,A British-Iranian entrepreneur who came to the...
4,2021-06-04T14:44:35.000Z,Continue to Exercise Caution When it Comes to ...,Did Palantir (NYSE:PLTR) stock find its floor ...


In [5]:
PLTR_raw['Date'] = PLTR_raw['Date'].apply(lambda x :datetime.datetime.strptime(x[:-8],r"%Y-%m-%dT%H:%M"))

In [6]:
PLTR_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172 entries, 0 to 171
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     172 non-null    datetime64[ns]
 1   topic    172 non-null    object        
 2   content  172 non-null    object        
dtypes: datetime64[ns](1), object(2)
memory usage: 4.2+ KB


In [7]:
PLTR_raw.set_index('Date',inplace=True)
PLTR_raw.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 172 entries, 2021-06-05 11:30:00 to 2021-03-04 13:56:00
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    172 non-null    object
 1   content  172 non-null    object
dtypes: object(2)
memory usage: 4.0+ KB


In [8]:
PLTR_raw.index = PLTR_raw.index.ceil('H')

In [9]:
a = PLTR_raw.index.value_counts()
a[a.values>1]

2021-05-11 12:00:00    4
2021-05-11 21:00:00    3
2021-05-11 16:00:00    3
2021-05-11 11:00:00    3
2021-05-11 15:00:00    3
2021-05-11 13:00:00    3
2021-06-04 21:00:00    2
2021-04-08 14:00:00    2
2021-05-21 18:00:00    2
2021-05-24 11:00:00    2
2021-04-05 15:00:00    2
2021-04-29 22:00:00    2
2021-05-11 18:00:00    2
2021-05-12 13:00:00    2
2021-05-06 11:00:00    2
2021-03-25 15:00:00    2
2021-05-26 19:00:00    2
Name: Date, dtype: int64

In [10]:
#process words columns
for col in PLTR_raw.columns:
    PLTR_raw[col] = PLTR_raw[col] + ' '
PLTR_raw = PLTR_raw.groupby(level=0).sum()
PLTR_raw.head(2)

Unnamed: 0_level_0,topic,content
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-03-04 14:00:00,Palantir Rises on Latest Ark Investment Stock ...,https://www.thestreet.com/investing/palantir-r...
2021-03-05 10:00:00,"Jim Cramer on Jobs Report, Broadcom, Costco, P...",https://www.thestreet.com/jim-cramer/stock-mar...


In [11]:
with open('word_list_extended.txt','r') as f:
    lmdict = eval(f.read())

In [12]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

special_words = {
    'crushes': 10,
    'beats': 5,
    'misses': -5,
    'trouble': -10,
    'falls': -10,
    'exploded' : 4,
    'able':2,
}

for i in lmdict["Negative"]:
  special_words[i]=-5

for i in lmdict["Positive"]:
  special_words[i]=5


vader = SentimentIntensityAnalyzer()
vader.lexicon.update(special_words)

# results = []

# for headline in news['title']:
#   pol_score = vader.polarity_scores(headline)
#   pol_score['headline'] = headline
#   results.append(pol_score)


In [14]:
def log_pos(x):
    s = vader.polarity_scores(x)
    return np.log((1 + s['pos'])/(1+s['neg']))
PLTR_raw.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 148 entries, 2021-03-04 14:00:00 to 2021-06-05 12:00:00
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    148 non-null    object
 1   content  148 non-null    object
dtypes: object(2)
memory usage: 3.5+ KB


In [15]:
PLTR_raw['topic_comp'] = PLTR_raw['topic'].apply(lambda x: vader.polarity_scores(x)['compound'])
PLTR_raw['content_comp'] = PLTR_raw['content'].apply(lambda x: vader.polarity_scores(x)['compound'])
PLTR_raw['topic_logpos'] = PLTR_raw['topic'].apply(log_pos)
PLTR_raw['content_logpos'] = PLTR_raw['content'].apply(log_pos)

In [None]:
#=======================# Price

In [16]:
PLTR_price = yf.download(tickers='PLTR', period='6mo', interval="1h")

[*********************100%***********************]  1 of 1 completed


In [17]:
PLTR_price.index = PLTR_price.index.tz_convert(None)
PLTR_price.index.is_unique

True

In [None]:
#=======================# Produce df

In [21]:
df = pd.concat([PLTR_raw.drop(['topic','content'],axis=1),PLTR_price], axis=1)
df.fillna(0,inplace=True)
df.head()

Unnamed: 0,topic_comp,content_comp,topic_logpos,content_logpos,Open,High,Low,Close,Adj Close,Volume
2020-12-09 14:30:00,0.0,0.0,0.0,0.0,28.68,28.85,26.709999,27.42,27.42,25848861.0
2020-12-09 15:30:00,0.0,0.0,0.0,0.0,27.424999,27.9,27.190001,27.41,27.41,7646543.0
2020-12-09 16:30:00,0.0,0.0,0.0,0.0,27.41,27.48,26.950001,27.089899,27.089899,7067566.0
2020-12-09 17:30:00,0.0,0.0,0.0,0.0,27.084999,27.200001,26.040001,26.1134,26.1134,10450521.0
2020-12-09 18:30:00,0.0,0.0,0.0,0.0,26.139999,26.639999,25.33,26.594999,26.594999,15899190.0


In [22]:
df_slim = df[df.index>datetime.datetime(2021,3,4,10)]
df_slim.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 617 entries, 2021-03-04 14:00:00 to 2021-06-08 19:30:00
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   topic_comp      617 non-null    float64
 1   content_comp    617 non-null    float64
 2   topic_logpos    617 non-null    float64
 3   content_logpos  617 non-null    float64
 4   Open            617 non-null    float64
 5   High            617 non-null    float64
 6   Low             617 non-null    float64
 7   Close           617 non-null    float64
 8   Adj Close       617 non-null    float64
 9   Volume          617 non-null    float64
dtypes: float64(10)
memory usage: 53.0 KB


In [None]:
#=======================#

In [23]:
print(list(map(lambda x : df[x].mask(df[x] != 0, 1).value_counts(), ['topic_comp','content_comp'])))

[0.0    925
1.0     88
Name: topic_comp, dtype: int64, 0.0    931
1.0     82
Name: content_comp, dtype: int64]


In [None]:
#========================#

In [25]:
#separate train and test
from sklearn.model_selection import train_test_split
y = df_slim['Adj Close']
X = df_slim.drop(['Adj Close','Close'],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, shuffle=False)

In [26]:
#normalize
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train[['Open', 'High', 'Low', 'Volume']])
X_train_scaled = pd.concat([X_train.iloc[:,:4], pd.DataFrame(X_train_scaled,index=X_train.index)], axis=1)

sc_predict = StandardScaler()
X_test_scaled = sc_predict.fit_transform(X_test[['Open','High', 'Low', 'Volume']])
X_tset_scaled = pd.concat([X_test.iloc[:,:4], pd.DataFrame(X_test_scaled,index=X_test.index)], axis=1)

In [29]:
X_train_scaled.isnull().values.any()

False

In [30]:
train_scaled = X_train_scaled.join(y_train)
train_scaled.isnull().values.any()

False

In [31]:
train_scaled = train_scaled.to_numpy()

In [40]:
print('train_scaled shape:',train_scaled.shape)
X_train = []
y_train = []

n_future = 1   # Number of days (*12 for hours) we want to predict into the future
n_past = 4*12   # Number of past days (*12 for hours) we want to use to predict the future

for i in range(n_past, len(train_scaled) - n_future +1):
    X_train.append(train_scaled[i - n_past:i, 0: train_scaled.shape[1]])
    y_train.append(train_scaled[i + n_future - 1:i + n_future, -1])

X_train, y_train = np.array(X_train), np.array(y_train)

print('X_train shape == {}.'.format(X_train.shape))
print('y_train shape == {}.'.format(y_train.shape))

train_scaled shape: (555, 9)
X_train shape == (507, 48, 9).
y_train shape == (507, 1).


In [None]:
All_x_train, All_y_train = 

In [37]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard

In [41]:
# Initializing the Neural Network based on LSTM
model = Sequential()

# Adding 1st LSTM layer
model.add(LSTM(units=64, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))

# Adding 2nd LSTM layer
model.add(LSTM(units=10, return_sequences=False))

# Adding Dropout
model.add(Dropout(0.25))

# Output layer
model.add(Dense(units=1, activation='linear'))
#model.add(Dense(trainY.shape[1]))

# Compiling the Neural Network
model.compile(optimizer = Adam(learning_rate=0.01), loss='mean_squared_error')

In [42]:
%%time
es = EarlyStopping(monitor='val_loss', min_delta=1e-10, patience=10, verbose=1)
rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, verbose=1)
mcp = ModelCheckpoint(filepath='weights.h5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True)

tb = TensorBoard('logs')

history = model.fit(X_train, y_train, shuffle=True, epochs=300, callbacks=[es, rlr, mcp, tb], validation_split=0.2, verbose=1, batch_size=30)

Epoch 1/300

Epoch 00001: val_loss improved from inf to 200.39421, saving model to weights.h5
Epoch 2/300

Epoch 00002: val_loss improved from 200.39421 to 166.51797, saving model to weights.h5
Epoch 3/300

Epoch 00003: val_loss improved from 166.51797 to 139.66795, saving model to weights.h5
Epoch 4/300

Epoch 00004: val_loss improved from 139.66795 to 121.86569, saving model to weights.h5
Epoch 5/300

Epoch 00005: val_loss improved from 121.86569 to 108.78497, saving model to weights.h5
Epoch 6/300

Epoch 00006: val_loss improved from 108.78497 to 99.77193, saving model to weights.h5
Epoch 7/300

Epoch 00007: val_loss improved from 99.77193 to 93.89474, saving model to weights.h5
Epoch 8/300

Epoch 00008: val_loss improved from 93.89474 to 90.23981, saving model to weights.h5
Epoch 9/300

Epoch 00009: val_loss improved from 90.23981 to 88.27910, saving model to weights.h5
Epoch 10/300

Epoch 00010: val_loss improved from 88.27910 to 87.39735, saving model to weights.h5
Epoch 11/300

