Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import math
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize
from textblob import TextBlob
nltk.download('gutenberg') #needed to access the raw text of a book
nltk.download('punkt') #needed to tokenize sentences
nltk.download('vader_lexicon') #NLTK's vader tool relies on a sentiment lexicon!

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from pandas.core import describe
from numpy.ma import count

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\ANURAG\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ANURAG\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ANURAG\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
#!pip install textblob

# Data Loading

In [2]:
# Read both worksheets of the project workbook in one call:
# sheet 0 holds the numeric price data, sheet 1 holds the tweets.
workbook_sheets = pd.read_excel('Project 02 - Data.xlsx', sheet_name=[0, 1])
price_df = workbook_sheets[0]
tweets_df = workbook_sheets[1]

In [3]:
price_df.describe()

Unnamed: 0,day,open_price,high_price,low_price,moving_average_5_day,moving_average_10_day,moving_average_50_day,moving_average_200_day,volume,next_day_close_price
count,1300.0,1300.0,1300.0,1300.0,1300.0,1300.0,1300.0,1300.0,1300.0,1000.0
mean,650.5,51.764592,52.318354,51.232192,51.604431,51.469623,50.392154,47.170069,2325961.0,42.26556
std,375.421985,21.392828,21.626325,21.194475,21.167761,20.969752,19.489467,14.551042,1636736.0,9.833662
min,1.0,27.63,27.93,26.71,27.94,28.3,29.56,30.98,643800.0,27.6
25%,325.75,37.81,38.175,37.44,37.87,37.8475,38.6875,37.965,1474500.0,36.5375
50%,650.5,43.115,43.575,42.82,43.03,42.89,42.54,42.355,1940300.0,40.175
75%,975.25,61.7325,62.525,61.075,61.3325,61.285,60.01,54.9825,2696550.0,45.52
max,1300.0,117.51,117.66,116.57,115.49,114.07,107.12,94.5,20786500.0,71.07


In [None]:
#display the first few rows of price(Numerical Data)
price_df.head(10)

In [None]:
#display the first few rows of tweets
tweets_df.head(20)

In [None]:
tweets_df.describe()

Defining a function to clean the tweets

In [4]:
# Function to clean the tweets 
def clean_tweets(tweet):
    """Normalise one raw tweet before sentiment scoring.

    Underscores are turned into spaces, every character that is neither a
    word character nor whitespace is dropped, and runs of whitespace are
    collapsed to a single space with both ends trimmed.

    Parameters
    ----------
    tweet : object
        Raw cell value taken from the tweet column.

    Returns
    -------
    str
        The cleaned text, or "" when ``tweet`` is not a ``str``.
    """
    # Guard clause: anything that is not text yields an empty string.
    if not isinstance(tweet, str):
        return ""
    spaced = re.sub(r'\_', ' ', tweet)
    stripped_symbols = re.sub(r"[^\w\s]", "", spaced)
    return re.sub(r'\s+',' ', stripped_symbols).strip()

In [5]:
# Store a cleaned copy of every tweet next to the raw text, then dump the frame.
tweets_df['cleaned_tweets'] = tweets_df['tweet'].map(clean_tweets)
print(tweets_df)

         day                                              tweet  \
0          1  #Dan ($Dan) Doubles Down on Healthy, Eco-Frien...   
1          1  RT @DvdndDiplomats: Bert's X Always Buy stocks...   
2          1          $Dan Alert From our Stock News Alerts App   
3          1  X NEW Stocks at #FusionIQ with Master Scores >...   
4          1  #AmazonPrime creates a captive audience, so "b...   
...      ...                                                ...   
100762  1313                              $Dan approaching ATHs   
100763  1313  Top X Consumer Defensive stocks with market ca...   
100764  1313  52-Week High Alert: Trading todays movement in...   
100765  1313  $Dan MKM Partners analyst Bill Kirk maintains ...   
100766  1313  Some tickers Im watching tmmr, Lately Ive been...   

                                           cleaned_tweets  
0       Dan Dan Doubles Down on Healthy EcoFriendly Pr...  
1       RT DvdndDiplomats Berts X Always Buy stocks Da...  
2              

In [None]:
#cleaned_tweets = [] 
#cleaned_tweets= tweets_df['cleaned_tweets']

In [None]:
#cleaned_tweets

Calculating polarity and subjectivity scores for each tweet

In [None]:
# Score every cleaned tweet with TextBlob and report the corpus-wide averages.
# The text is read from tweets_df['cleaned_tweets'] directly: no live cell
# defines a stand-alone `cleaned_tweets` variable, so iterating over that name
# raised NameError on a fresh kernel.
PREVIEW_COUNT = 5  # per-tweet lines to echo; printing every row floods the output

polarities = []
subjectivities = []

#get polarity and subjectivity scores for each sentence
for position, sentence in enumerate(tweets_df['cleaned_tweets']):
    #the str() function is used to ensure that the input value is a string
    blob = TextBlob(str(sentence))
    #get polarity and subjectivity scores
    polarity, subjectivity = blob.polarity, blob.subjectivity
    if position < PREVIEW_COUNT:
        print('sentence: "{}", polarity: {:.3f}, subjectivity: {:.3f}'.format(sentence, polarity, subjectivity))
    polarities.append(polarity)
    subjectivities.append(subjectivity)

#calculate and display average polarity and subjectivity scores for all sentences
print('average polarity: {:.3f}'.format(np.mean(polarities)))
print('average subjectivity: {:.3f}'.format(np.mean(subjectivities)))

In [None]:
# Collect TextBlob polarity/subjectivity per tweet, grouped by day, then print
# each day's averages.
scores = {}

#Calculating the polarity and aggregating with respect to day
for index,row in tweets_df.iterrows():
    day = row['day']
    tweet = row['cleaned_tweets']
    #the str() function is used to ensure that the input value is a string
    blob = TextBlob(str(tweet))
    #get polarity and subjectivity scores
    polarity, subjectivity = blob.polarity, blob.subjectivity
    if day not in scores:
        scores[day] = {'polarity':[],'subjectivity':[]}
    scores[day]['polarity'].append(polarity)
    scores[day]['subjectivity'].append(subjectivity)

#calculate and display average polarity and subjectivity scores for each day
# The loop variable must not be called `scores`: rebinding that name while
# iterating scores.items() leaves `scores` pointing at the last day's inner
# dict once the loop ends.
for day, day_scores in scores.items():
    polarity_scores = day_scores['polarity']
    subjectivity_scores = day_scores['subjectivity']
    print('Day:', day)
    print('average polarity: {:.3f}'.format(np.mean(polarity_scores)))
    print('average subjectivity: {:.3f}'.format(np.mean(subjectivity_scores)))


# Calculating the average scores on a per-day basis and storing them in a dataframe

In [6]:
# Build `tweets_avg`: one row per day with that day's mean TextBlob polarity
# and subjectivity over all of its tweets.
scores = {}

#Calculating the polarity and aggregating with respect to day
for day, tweet in zip(tweets_df['day'], tweets_df['cleaned_tweets']):
    #the str() function is used to ensure that the input value is a string
    blob = TextBlob(str(tweet))
    day_scores = scores.setdefault(day, {'polarity': [], 'subjectivity': []})
    day_scores['polarity'].append(blob.polarity)
    day_scores['subjectivity'].append(blob.subjectivity)

# Gather every day's averages first and construct the frame once.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration. The comprehension also keeps the name `scores`
# bound to the full per-day dict instead of shadowing it.
daily_records = [
    {
        'day': day,
        'Avg_Polarity': np.mean(day_scores['polarity']),
        'Avg_Subjectivity': np.mean(day_scores['subjectivity']),
    }
    for day, day_scores in scores.items()
]

# dataframe to store day and scores (rows follow first appearance of each day)
tweets_avg = pd.DataFrame(daily_records, columns = ['day', 'Avg_Polarity', 'Avg_Subjectivity'])

In [None]:
price_df.describe()

In [None]:
tweets_avg.describe()

Appending the average scores to numerical dataframe (sheet1 of given data)

In [7]:
# Attach each day's average sentiment to the price row with the SAME `day`.
# Plain column assignment aligns on the row index instead, which puts scores on
# the wrong day whenever tweet days and price days are not one-to-one (the
# tweets sheet runs to day 1313, the price sheet to day 1300).
daily_sentiment = tweets_avg.set_index(tweets_avg['day'].astype('int64'))

# Price days with no tweets get 0.0, the same score this notebook produces for
# an empty cleaned tweet. NOTE(review): confirm neutral imputation is the
# intended treatment for tweet-less days.
price_df['Avg_Polarity_score'] = (
    price_df['day'].map(daily_sentiment['Avg_Polarity'].astype(float)).fillna(0.0)
)
price_df['Avg_Subjectivity_score'] = (
    price_df['day'].map(daily_sentiment['Avg_Subjectivity'].astype(float)).fillna(0.0)
)

In [None]:
price_df

Dividing the rows based on empty cells in next_day_close_price column

In [8]:
# Rows whose next-day closing price is blank are the ones to be predicted;
# all remaining rows form the labelled modelling set.
target_missing = price_df['next_day_close_price'].isna()

# rows that still need a next-day closing price prediction
days_predict = price_df.loc[target_missing].copy()

# keep only the complete rows in price_df
price_df = price_df.loc[~target_missing].copy()

In [None]:
price_df.describe()

In [None]:
# Line plot of the labelled next-day closing prices.
# NOTE(review): Series.plot draws against the frame's row index, not the `day`
# column, although the axis is labelled 'day' — confirm the two coincide.
price_df['next_day_close_price'].plot(xlabel = 'day', ylabel = 'next_day_close_price', 
                                      title = 'next_day_close_price VS day')

In [None]:
# Pairwise correlation matrix of price_df's columns, shown as a styled table:
# values formatted with "{:.4}" and a coolwarm colour gradient applied per row (axis=1).
price_df.corr().style.format("{:.4}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)

In [None]:
# Columns carried into the train/test frames. This list includes the target
# ('next_day_close_price') as well as the predictors, so it is the column set
# of the split frames rather than a pure feature list.
features_to_use = ['day', 'open_price', 
                   'high_price', 
                   'low_price', 
                   'volume',
                   'moving_average_5_day',
                   'moving_average_10_day',
                   'moving_average_50_day',
                   'moving_average_200_day', 
                   'next_day_close_price', 
                   'Avg_Polarity_score',
                   'Avg_Subjectivity_score']

In [None]:
#split the data into training (70%) and testing sets, with a fixed seed for reproducibility
# NOTE(review): shuffle=True mixes earlier and later days across both sets; for
# day-ordered price data a chronological split may be more appropriate — confirm.
df_train, df_test = train_test_split(price_df[features_to_use].copy(), train_size=0.7, shuffle=True, random_state=42)

In [None]:
# Report how many rows ended up in each side of the split.
for label, frame in (
    ("Count of rows in training data:", df_train),
    ("Count of rows in testing data:", df_test),
):
    print(label, len(frame))

In [None]:
# Predictor columns fed to the scikit-learn models below: day, same-day price
# fields, volume and the two daily sentiment averages (the moving-average
# columns are left out).
Final_preds = ['day', 'open_price', 'high_price', 
                   'low_price', 'volume', 
               'Avg_Polarity_score',
               'Avg_Subjectivity_score']

Linear Regression

In [None]:
# Ordinary least-squares baseline: fit next_day_close_price on the Final_preds columns.
from sklearn.linear_model import LinearRegression

model = LinearRegression()
# fit() returns the estimator itself, so m1 and model refer to the same fitted object
m1 = model.fit(df_train[Final_preds], df_train.next_day_close_price)

In [None]:
#generate predictions, and save them in a new column named "lr_preds" in the testing dataframe
df_test['lr_preds'] = m1.predict(df_test[Final_preds])

#view actual and predicted next-day closing prices for the first 40 rows of the testing set
df_test[['next_day_close_price', 'lr_preds']].head(40)

In [None]:
# Mean squared error of the linear-regression predictions on the test rows.
# Despite the "Accuracy" label in the printout, MSE is an error measure.
actual_close = df_test['next_day_close_price']
mse = mean_squared_error(actual_close, df_test['lr_preds'])
print('Accuracy (MSE):', mse)

In [None]:
# R-squared and mean absolute error of the linear-regression predictions.
actual_close = df_test['next_day_close_price']
lr_forecast = df_test['lr_preds']

r2 = r2_score(actual_close, lr_forecast)
print('R-squared score:', r2)

mae = mean_absolute_error(actual_close, lr_forecast)
print("Mean Absolute Error (MAE):", mae)

Random Forest

In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model with 1000 decision trees (seeded so the forest is reproducible)
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

# Train the model on training data; fit() returns the estimator, so m2 is rf
m2 = rf.fit(df_train[Final_preds], df_train.next_day_close_price)

In [None]:
# Predict with the random forest on the test rows and score the result.
df_test['rf_predictions'] = m2.predict(df_test[Final_preds])

actual_close = df_test['next_day_close_price']
rf_forecast = df_test['rf_predictions']

# Mean squared error (an error measure, despite the "Accuracy" label)
rf_mse = mean_squared_error(actual_close, rf_forecast)
print('Accuracy (MSE):', rf_mse)

# Mean absolute error
mae = mean_absolute_error(actual_close, rf_forecast)
print("Mean Absolute Error (MAE):", mae)

# Coefficient of determination
r2 = r2_score(actual_close, rf_forecast)
print('R-squared score:', r2)

Ridge regression

In [None]:
# Ridge regression with scikit-learn's default settings on the Final_preds columns.
from sklearn.linear_model import Ridge

rg = Ridge()
m3 = rg.fit(df_train[Final_preds], df_train.next_day_close_price)

# Use the fitted ridge model to predict on the test data
df_test['rg_predictions'] = m3.predict(df_test[Final_preds])

In [None]:
# Score the ridge predictions on the test rows.
actual_close = df_test['next_day_close_price']
rg_forecast = df_test['rg_predictions']

# Mean squared error (an error measure, despite the "Accuracy" label)
rg_mse = mean_squared_error(actual_close, rg_forecast)
print('Accuracy (MSE):', rg_mse)

# Mean absolute error
mae = mean_absolute_error(actual_close, rg_forecast)
print("Mean Absolute Error (MAE):", mae)

# Coefficient of determination
r2 = r2_score(actual_close, rg_forecast)
print('R-squared score:', r2)

Lasso regression

In [None]:
# Lasso regression with scikit-learn's default settings on the Final_preds columns.
# Note the naming: `lr` / 'lr_predictions' here belong to the Lasso model, while
# 'lr_preds' (created earlier) holds the plain linear-regression predictions.
from sklearn.linear_model import Lasso

lr = Lasso()
m4 = lr.fit(df_train[Final_preds], df_train.next_day_close_price)

# Use the fitted lasso model to predict on the test data
df_test['lr_predictions'] = m4.predict(df_test[Final_preds])

In [None]:
# Score the lasso predictions on the test rows.
actual_close = df_test['next_day_close_price']
lasso_forecast = df_test['lr_predictions']

# Mean squared error (an error measure, despite the "Accuracy" label)
lr_mse = mean_squared_error(actual_close, lasso_forecast)
print('Accuracy (MSE):', lr_mse)

# Mean absolute error
mae = mean_absolute_error(actual_close, lasso_forecast)
print("Mean Absolute Error (MAE):", mae)

# Coefficient of determination
r2 = r2_score(actual_close, lasso_forecast)
print('R-squared score:', r2)

SVM Regressor

In [None]:
# Support-vector regression with a degree-2 polynomial kernel.
# NOTE(review): the inputs are not rescaled in this cell, and SVR is generally
# sensitive to feature scale (volume is in the millions) — confirm this is intended.
from sklearn.svm import SVR

svm_model = SVR(kernel='poly', degree=2)
m5 = svm_model.fit(df_train[Final_preds], df_train.next_day_close_price)

# Store the SVR predictions for the test rows
df_test['svm_predictions'] = m5.predict(df_test[Final_preds])

In [None]:
# Score the SVR predictions on the test rows.
actual_close = df_test['next_day_close_price']
svm_forecast = df_test['svm_predictions']

# Mean squared error (an error measure, despite the "Accuracy" label)
svm_mse = mean_squared_error(actual_close, svm_forecast)
print('Accuracy (MSE):', svm_mse)

# Mean absolute error
mae = mean_absolute_error(actual_close, svm_forecast)
print("Mean Absolute Error (MAE):", mae)

# Coefficient of determination
r2 = r2_score(actual_close, svm_forecast)
print('R-squared score:', r2)

LSTM

In [None]:
#Set Target Variable (kept as a one-column DataFrame of the unscaled next-day close)
output_var = pd.DataFrame(price_df['next_day_close_price'])
#Selecting the Features used by the LSTM (a smaller set than Final_preds: no high/low price)
features = ['day', 'open_price', 'volume', 'Avg_Polarity_score',
               'Avg_Subjectivity_score']

In [None]:
from sklearn.preprocessing import MinMaxScaler

#Scaling: min-max rescale the LSTM feature columns into a NEW frame
#(price_df itself is left untouched here), keeping price_df's row index
scaler = MinMaxScaler()
feature_transform = scaler.fit_transform(price_df[features])
feature_transform= pd.DataFrame(columns=features, data=feature_transform, index=price_df.index)
feature_transform.head()

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Chronological train/test split. Every iteration overwrites X_train/X_test/
# y_train/y_test, so only the LAST of the 10 splits survives the loop.
# The slices use the split lengths rather than the index arrays themselves:
# the first len(train_index) rows are training, the next len(test_index) rows testing.
# NOTE(review): this assumes the train indices are the leading contiguous rows,
# as TimeSeriesSplit's expanding window is documented to produce — confirm.
timesplit= TimeSeriesSplit(n_splits=10)
for train_index, test_index in timesplit.split(feature_transform):
        X_train, X_test = feature_transform[:len(train_index)], feature_transform[len(train_index): (len(train_index)+len(test_index))]
        y_train, y_test = output_var[:len(train_index)].values.ravel(), output_var[len(train_index): (len(train_index)+len(test_index))].values.ravel()

In [None]:
#Process the data for LSTM
# Convert the frames to arrays and insert a length-1 middle axis, giving
# (rows, 1, n_features) to match the input_shape=(1, n_features) used by the model.
# X_train / X_test are rebound from DataFrames to 3-D arrays here, so this cell
# is not safe to run twice without re-running the split above.
trainX =np.array(X_train)
testX =np.array(X_test)
X_train = trainX.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = testX.reshape(X_test.shape[0], 1, X_test.shape[1])

In [None]:
#!pip install keras
#!pip install pydot
!pip install graphviz

In [None]:
!pip show pydot
!pip show graphviz


In [None]:
# plot_model is imported from the public tensorflow.keras.utils namespace (the
# same path this notebook uses in its training cell). `keras.utils.vis_utils`
# is not a public module in current Keras releases, so importing from it fails.
from tensorflow.keras.utils import plot_model
import pydot
import graphviz
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

#Building the LSTM Model
# One LSTM layer (32 units, ReLU) over sequences of length 1 with
# trainX.shape[1] features per step, followed by a single linear output unit.
lstm = Sequential()
lstm.add(LSTM(32, input_shape=(1, trainX.shape[1]), activation='relu', return_sequences=False))
lstm.add(Dense(1))
# Trained as a regression: mean-squared-error loss with the Adam optimiser.
lstm.compile(loss='mean_squared_error', optimizer='adam')
lstm.summary()



In [None]:
# plot_model is imported but not called in this cell
from tensorflow.keras.utils import plot_model

# Train for 100 epochs in batches of 8; shuffle=False keeps the rows in their
# original (chronological) order within each epoch. The returned History object
# is kept in `history`.
history=lstm.fit(X_train, y_train, epochs=100, batch_size=8, verbose=1, shuffle=False)



In [None]:
#LSTM Prediction
y_pred= lstm.predict(X_test)

In [None]:
#Predicted vs true next-day close price – LSTM
# NOTE(review): y_test is taken from the unscaled next_day_close_price column
# (only the input features were min-max scaled), so the 'Scaled USD' axis label
# looks inaccurate — confirm.
plt.plot(y_test, label='True Value')
plt.plot(y_pred, label='LSTM Value')
plt.title("Prediction by LSTM")
plt.xlabel('Time Scale')
plt.ylabel('Scaled USD')
plt.legend()
plt.show()

In [None]:
# Root-mean-squared error of the LSTM predictions.
# Computed as sqrt(MSE) explicitly: the `squared=False` argument is deprecated
# in recent scikit-learn releases and later removed, and the value is an RMSE,
# so it is no longer stored under the name `mse`.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

# Pycaret

In [None]:
#!pip install --user pycaret

In [None]:
# PyCaret regression experiment on the training frame with every column except
# the target available as a feature; session_id fixes the random seed.
# NOTE(review): the star import pulls setup/compare_models/evaluate_model/
# predict_model (and everything else pycaret.regression exports) into the
# notebook namespace — consider explicit imports.
from pycaret.regression import *
s = setup(df_train, target = 'next_day_close_price',
          #ignore_features = ['moving_average_200_day'], 
          session_id = 123)

In [None]:
best = compare_models()

In [None]:
evaluate_model(best)

In [None]:
y_pred = predict_model(best)

PyCaret with only a few features (moving-average columns ignored)

In [None]:
# Second PyCaret experiment: same training frame and seed, but the four
# moving-average columns are excluded from the feature set.
s2 = setup(df_train, target = 'next_day_close_price',
          ignore_features = ['moving_average_5_day',
                             'moving_average_10_day',
                             'moving_average_50_day',
                             'moving_average_200_day'], 
          session_id = 123)

In [None]:
best_few_features = compare_models()

In [None]:
evaluate_model(best_few_features)
y_pred_few_features = predict_model(best_few_features)

In [None]:
# Third PyCaret experiment: moving-average columns excluded as before, now with
# normalize=True. This rebinds `s2`, replacing the previous experiment's handle.
s2 = setup(df_train, target = 'next_day_close_price',
          ignore_features = ['moving_average_5_day',
                             'moving_average_10_day',
                             'moving_average_50_day',
                             'moving_average_200_day'], normalize = True, session_id = 123)

In [None]:
best_few_features_normalized = compare_models()

In [None]:
evaluate_model(best_few_features_normalized)
y_pred_few_features_norm = predict_model(best_few_features_normalized)

In [9]:
#Set Target Variable (one-column DataFrame of next_day_close_price)
output_var = pd.DataFrame(price_df['next_day_close_price'])

#Selecting the Features for the final model; this rebinds `features`, which the
#LSTM section earlier defined with a smaller column set
features = ['day', 'open_price', 'high_price', 
                   'low_price', 'volume', 
               'Avg_Polarity_score',
               'Avg_Subjectivity_score']


In [None]:
#from sklearn.model_selection import train_test_split

#split the data into training and testing sets
#df_train, df_test = train_test_split(price_df[features].copy(), train_size=0.7, shuffle=True, random_state=42)

In [10]:
from sklearn.preprocessing import MinMaxScaler

#Scaling: min-max rescale the selected feature columns of price_df IN PLACE.
# - Re-running this cell refits the scaler on already-scaled values, so the
#   cell is not idempotent.
# - days_predict was copied out of price_df before this point and is not
#   modified here, so its feature columns remain on the raw scale.
# NOTE(review): the scaler is fitted before the train/test split below, so the
# test rows contribute to the min/max — confirm this is acceptable.
scaler = MinMaxScaler()
price_df[features] = scaler.fit_transform(price_df[features])
#feature_transform= pd.DataFrame(columns=features, data=feature_transform, index=price_df.index)
#feature_transform.head()

In [11]:
from sklearn.model_selection import train_test_split

#split the (now scaled) data into training (70%) and testing sets; this rebinds
#df_train / df_test, discarding the frames and prediction columns built earlier
df_train, df_test = train_test_split(price_df.copy(), train_size=0.7, shuffle=True, random_state=42)

In [14]:
df_test

Unnamed: 0,day,open_price,high_price,low_price,moving_average_5_day,moving_average_10_day,moving_average_50_day,moving_average_200_day,volume,next_day_close_price,Avg_Polarity_score,Avg_Subjectivity_score
521,0.521522,0.235200,0.266133,0.253809,38.02,38.19,40.10,34.37,0.076469,39.84,0.550769,0.648599
737,0.737738,0.258146,0.256751,0.271321,39.40,39.53,37.76,42.55,0.061983,39.91,0.400855,0.718709
740,0.740741,0.281781,0.284439,0.295884,39.18,39.43,37.77,42.56,0.048017,40.32,0.544441,0.618466
660,0.660661,0.422671,0.415103,0.417330,46.51,46.55,47.34,42.53,0.081975,45.78,0.655835,0.706235
411,0.411411,0.159477,0.156751,0.171708,33.55,33.16,31.86,31.57,0.085738,34.06,0.296035,0.172929
...,...,...,...,...,...,...,...,...,...,...,...,...
468,0.468468,0.312299,0.333181,0.328633,40.22,38.50,34.41,31.76,0.115744,42.85,0.636535,0.707934
935,0.935936,0.980725,0.971167,0.972026,64.91,62.63,60.50,49.73,0.125197,68.85,0.301197,0.325671
428,0.428428,0.053924,0.057208,0.069593,32.55,32.49,32.61,31.24,0.233782,31.98,0.560776,0.632741
7,0.007007,0.418082,0.418078,0.426882,45.18,44.87,40.87,42.31,0.129724,45.21,0.183858,0.567807


In [17]:
# Final linear regression, refitted on the min-max scaled feature columns.
from sklearn.linear_model import LinearRegression
# The classification metrics imported below are not used by this regression cell.
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report

model = LinearRegression()
# m1 is rebound to this new fit (fit() returns the estimator itself)
m1 = model.fit(df_train[features], df_train.next_day_close_price)


In [18]:
#generate predictions, and save them in a new column named "lr2_preds" in the testing dataframe
df_test['lr2_preds'] = m1.predict(df_test[features])

#view actual and predicted next-day closing prices for the first 40 rows of the testing set
df_test[['next_day_close_price', 'lr2_preds']].head(40)

Unnamed: 0,next_day_close_price,lr2_preds
521,39.84,39.345142
737,39.91,38.916817
740,40.32,40.335369
660,45.78,45.776515
411,34.06,34.086399
678,44.42,45.554047
626,48.92,48.487892
513,37.33,37.77148
859,46.09,45.554055
136,37.71,37.778709


In [21]:
# Score the final linear-regression predictions on the test rows.
actual_close = df_test['next_day_close_price']
lr2_forecast = df_test['lr2_preds']

# Mean squared error (an error measure, despite the "Accuracy" label)
lr2_mse = mean_squared_error(actual_close, lr2_forecast)
print('Accuracy (MSE):', lr2_mse)

# Mean absolute error
mae = mean_absolute_error(actual_close, lr2_forecast)
print("Mean Absolute Error (MAE):", mae)

# Coefficient of determination
r2 = r2_score(actual_close, lr2_forecast)
print('R-squared score:', r2)

Accuracy (MSE): 0.5581274576119781
Mean Absolute Error (MAE): 0.4080711713595214
R-squared score: 0.9946622477914075


In [None]:
# The final model was trained on min-max scaled features, but days_predict still
# holds raw-scale values (it was split off before price_df was scaled). Apply
# the SAME fitted scaler before predicting; otherwise the model is fed inputs
# on a completely different scale.
# NOTE(review): `scaler` must be the instance fitted once on the unscaled
# price_df[features]; re-running the scaling cell refits it on scaled data.
days_predict_scaled = pd.DataFrame(
    scaler.transform(days_predict[features]),
    columns=features,
    index=days_predict.index,
)
days_predict['predicted_day_close_price'] =  m1.predict(days_predict_scaled)

In [None]:
# Write the day and its predicted close (plus the row index) to result.csv.
days_predict.to_csv('result.csv', columns = ['day', 'predicted_day_close_price'])

In [None]:
#df_test[['next_day_close_price', 'Pred_next_day_close_price']].to_csv('checking.csv', columns = ['day', next_day_close_price', 'Pred_next_day_close_price'])