# Stock Market Prediction Using Numerical And Textual Analysis
### By: Harsh Vyas
### GRIP @ The Sparks Foundation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
import tensorflow as tf
import math
from sklearn.metrics import mean_squared_error
from numpy import array
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('vader_lexicon')
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem.porter import PorterStemmer
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import xgboost 
import lightgbm
from sklearn.tree import DecisionTreeRegressor
%matplotlib inline

In [None]:
df = pd.read_csv('../input/apple-stock/AAPL.csv')

In [None]:
df

In [None]:
df1 = df.reset_index()['Close']

In [None]:
df1

In [None]:
plt.plot(df1)

In [None]:
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
df

In [None]:
df.info()

In [None]:
# Rescale the closing prices into [0, 1] as a single-column 2-D array, which
# is the layout the scaler expects.
# NOTE(review): the scaler is fit on the whole series here, i.e. before the
# train/test split below, so test-period min/max values influence the scaling.
scaling = MinMaxScaler(feature_range=(0, 1))
df1 = scaling.fit_transform(
    np.array(df1).reshape(-1, 1)
)

In [None]:
# Chronological split: the first 65% of rows train the model, the rest test it.
train_size = int(len(df1) * 0.65)
test_size = len(df1) - train_size
train_data = df1[:train_size, :]
test_data = df1[train_size:, :1]

In [None]:
len(train_data)

In [None]:
def create_dataset(dataset, time_step=1):
    """Turn a single-column series into supervised (window, next value) pairs.

    Args:
        dataset: 2-D array; only column 0 is read.
        time_step: number of consecutive values in each input window.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays where ``X[k]`` is
        ``dataset[k:k + time_step, 0]`` and ``y[k]`` is
        ``dataset[k + time_step, 0]``.
    """
    # NOTE(review): the trailing "- 1" means the last possible window is not
    # emitted. The plotting offsets later in this notebook rely on this exact
    # count, so it is kept as-is.
    starts = range(len(dataset) - time_step - 1)
    windows = [dataset[s:s + time_step, 0] for s in starts]
    targets = [dataset[s + time_step, 0] for s in starts]
    return np.array(windows), np.array(targets)

In [None]:
time_step = 100
X_train, y_train = create_dataset(train_data, time_step)
X_test, y_test = create_dataset(test_data, time_step)

In [None]:
X_train

In [None]:
# Append a trailing axis of length 1 so each window becomes
# (samples, time steps, 1 feature) before it is fed to the LSTM stack.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [None]:
# Three stacked LSTM layers followed by a single-unit dense output that
# predicts the next (scaled) closing price.
model = Sequential()
# The window length comes from `time_step` so the network input always
# matches the windows built by create_dataset instead of a hard-coded 100.
model.add(LSTM(50, return_sequences=True, input_shape=(time_step, 1)))
# Intermediate LSTM layers must return full sequences for the next LSTM.
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(50))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
model.summary()

In [None]:
# Train for 100 epochs in mini-batches of 64.
# NOTE(review): the test windows are passed as validation data, so the
# validation loss shown during training is computed on the test set.
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=100,batch_size=64,verbose=1)

In [None]:
tf.__version__

In [None]:
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)

In [None]:
train_predict = scaling.inverse_transform(train_predict)
test_predict = scaling.inverse_transform(test_predict)

In [None]:
# Training RMSE in price units. `train_predict` has already been mapped back
# to prices with scaling.inverse_transform, while `y_train` is still in the
# scaler's 0-1 range, so the targets must be inverse-transformed too before
# the two are compared.
math.sqrt(mean_squared_error(scaling.inverse_transform(y_train.reshape(-1, 1)), train_predict))

In [None]:
# Test RMSE in price units. `test_predict` has already been mapped back to
# prices with scaling.inverse_transform, while `y_test` is still in the
# scaler's 0-1 range, so the targets must be inverse-transformed too before
# the two are compared.
math.sqrt(mean_squared_error(scaling.inverse_transform(y_test.reshape(-1, 1)), test_predict))

In [None]:
# Overlay train and test predictions on the full closing-price series.
# Each prediction array is placed into a NaN-filled array shaped like `df1`
# so that it lines up with the dates it refers to; NaN cells are not drawn.
look_back = 100
trainPredictPlot = np.empty_like(df1)
trainPredictPlot[:, :] = np.nan
# The first training target sits `look_back` rows into the series.
trainPredictPlot[look_back:len(train_predict)+look_back, :] = train_predict
testPredictPlot = np.empty_like(df1)
testPredictPlot[:, :] = np.nan
# create_dataset yields len(data) - look_back - 1 windows, so
# len(train_predict) + 2*look_back + 1 equals train_size + look_back: the row
# of the first test target. The slice stops at len(df1)-1 because the final
# row never becomes a target.
testPredictPlot[len(train_predict)+(look_back*2)+1 : len(df1)-1, :] = test_predict

# Actual prices (mapped back from the 0-1 range) with both prediction overlays.
plt.plot(scaling.inverse_transform(df1))
plt.plot(trainPredictPlot)
plt.plot(testPredictPlot)
plt.show()

In [None]:
len(test_data)

In [None]:
# Seed the forecast with the most recent `time_step` scaled closes. Slicing
# relative to the end keeps this correct for any dataset length, unlike a
# fixed start index that only works for one particular len(test_data).
x_input = test_data[-time_step:].reshape(1,-1)
x_input.shape

In [None]:
# Flatten the single seed row into a plain Python list of floats.
temp_input = x_input[0].tolist()

In [None]:
# Recursive 30-day forecast: predict one step ahead, append the prediction to
# the input window, drop the oldest value, and repeat. Each prediction is
# collected in `lst_output` as a one-element list (still in the 0-1 range).
lst_output = []
n_steps = 100
for i in range(30):
    # Always feed the latest `n_steps` values; this replaces the two separate
    # branches (first step vs. later steps) and the hard-coded 100.
    x_input = np.array(temp_input[-n_steps:])
    print('{} day input {}'.format(i,x_input))
    x_input = x_input.reshape((1,n_steps,1))
    yhat = model.predict(x_input,verbose=0)
    print('{} day output {}'.format(i,yhat))
    temp_input.extend(yhat[0].tolist())
    # Keep only the sliding window so the list does not grow without bound.
    temp_input = temp_input[-n_steps:]
    lst_output.extend(yhat.tolist())

print(lst_output)

In [None]:
day_new = np.arange(1,101)
day_pred = np.arange(101,131)

In [None]:
len(df1)

In [None]:
df3 = df1.tolist()
df3.extend(lst_output)

In [None]:
# Plot the last len(day_new) observed closes followed by the forecast, both
# mapped back to price units. Slicing relative to the end of `df1` avoids a
# start index that is only valid for one particular dataset length.
plt.plot(day_new,scaling.inverse_transform(df1[-len(day_new):]))
plt.plot(day_pred,scaling.inverse_transform(lst_output))

In [None]:
plt.plot(df3[2500:])

In [None]:
text = pd.read_csv('../input/india-headlines-news-dataset/india-news-headlines.csv')

In [None]:
text

In [None]:
text.drop(0,inplace=True)
text.drop('headline_category', axis=1, inplace=True)

In [None]:
text

In [None]:
text.info()

In [None]:
text['Date'] = pd.to_datetime(text['publish_date'],format= '%Y%m%d')

In [None]:
text

In [None]:
text.drop('publish_date',axis=1,inplace=True)

In [None]:
# Replace every headline with the space-joined text of all headlines published
# on the same date. Selecting the column explicitly makes transform return a
# Series (not a one-column DataFrame) and keeps the join from being applied to
# any other column the frame may hold.
text['headline_text'] = text.groupby('Date')['headline_text'].transform(lambda headlines: ' '.join(headlines))

In [None]:
text = text.drop_duplicates()

In [None]:
text.reset_index(inplace=True, drop=True)

In [None]:
ps = PorterStemmer()

In [None]:
# Normalise each day's combined headlines: keep letters only, lower-case,
# drop English stop words, and stem what remains. Results go into `temp` in
# row order.
# The stop-word set and the regex are built once up front; rebuilding the set
# for every single word made this loop needlessly slow.
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')
temp = []
for headline in text['headline_text']:
    words = non_letters.sub(' ', headline).lower().split()
    temp.append(' '.join(ps.stem(word) for word in words if word not in stop_words))

In [None]:
text['headline_text'] = pd.Series(temp)

In [None]:
text

In [None]:
def subjectivity(text):
    """Return the TextBlob subjectivity score of ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

def polarity(text):
    """Return the TextBlob polarity score of ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

In [None]:
text['subjectivity'] = text['headline_text'].apply(subjectivity)
text['polarity'] = text['headline_text'].apply(polarity)

In [None]:
sentiment_IA = SentimentIntensityAnalyzer()

# Score every headline exactly once and fan the four VADER components out
# into their own columns, instead of re-scoring the whole corpus per column.
vader_scores = [sentiment_IA.polarity_scores(headline) for headline in text['headline_text']]
for column, key in (('Compound', 'compound'), ('Negative', 'neg'), ('Neutral', 'neu'), ('Positive', 'pos')):
    text[column] = [scores[key] for scores in vader_scores]

In [None]:
text_data_merge = pd.merge(df, text, how='inner', on='Date')

In [None]:
text_data_merge

In [None]:
data = text_data_merge[['Close','subjectivity', 'polarity', 'Compound', 'Negative', 'Neutral' ,'Positive']]

In [None]:
# Min-max scale every column of `data`, then wrap the result back into a
# DataFrame that carries the original column labels and row index.
scaling = MinMaxScaler()
new_data = pd.DataFrame(
    scaling.fit_transform(data),
    columns=data.columns,
    index=data.index,
)

In [None]:
new_data

In [None]:
X = new_data.drop('Close', axis=1)
y =new_data['Close']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 11)

In [None]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
prediction=rf.predict(X_test)

In [None]:
mean_squared_error(prediction,y_test)

In [None]:
lgb = lightgbm.LGBMRegressor()
lgb.fit(X_train, y_train)
predictions = lgb.predict(X_test)

In [None]:
mean_squared_error(predictions,y_test)

In [None]:
xgb = xgboost.XGBRegressor()
xgb.fit(X_train, y_train)
predictions = xgb.predict(X_test)

In [None]:
mean_squared_error(predictions,y_test)

In [None]:
dec_tree = DecisionTreeRegressor()
dec_tree.fit(X_train, y_train)
predictions = dec_tree.predict(X_test)

In [None]:
mean_squared_error(predictions,y_test)

In [None]:
adb = AdaBoostRegressor()
adb.fit(X_train, y_train)
predictions = adb.predict(X_test)

In [None]:
mean_squared_error(predictions, y_test)