<a href="https://colab.research.google.com/github/iamchetry/Stock-Price-Prediction/blob/main/Stock_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install yfinance
!pip install pyspark

import yfinance as yf
import pandas as pd
from matplotlib.pyplot import *

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.ml.feature import StandardScaler, VectorAssembler, QuantileDiscretizer
from pyspark.sql.functions import rand, lead, mean, stddev, col, udf, lit
from pyspark.ml import Pipeline
from pyspark.sql.window import Window
from pyspark.sql.types import DoubleType

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras import optimizers, regularizers
from tensorflow.keras.optimizers import SGD

# Mount Google Drive; later cells write and read the stock CSV under
# '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Collecting yfinance
  Downloading yfinance-0.1.63.tar.gz (26 kB)
Collecting lxml>=4.5.1
  Downloading lxml-4.6.3-cp37-cp37m-manylinux2014_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 4.9 MB/s 
Building wheels for collected packages: yfinance
  Building wheel for yfinance (setup.py) ... [?25l[?25hdone
  Created wheel for yfinance: filename=yfinance-0.1.63-py2.py3-none-any.whl size=23918 sha256=812440199f8ad0463e05f06fa406a521c755d2b6418beb2c2d1ef25a80483264
  Stored in directory: /root/.cache/pip/wheels/fe/87/8b/7ec24486e001d3926537f5f7801f57a74d181be25b11157983
Successfully built yfinance
Installing collected packages: lxml, yfinance
  Attempting uninstall: lxml
    Found existing installation: lxml 4.2.6
    Uninstalling lxml-4.2.6:
      Successfully uninstalled lxml-4.2.6
Successfully installed lxml-4.6.3 yfinance-0.1.63
Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 66 kB/s 
[?25hColle

In [None]:
# Spark Session
# Local Spark application with two worker threads.

conf = SparkConf().setAppName('Stock Price Prediction').setMaster('local[2]')
# getOrCreate reuses an already-running context, so re-executing this cell
# does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
sql_context = SQLContext(sc)

In [None]:
# Fetch Stock data from Yahoo Finance

def dump_stock_data(name=None):
  """Download a ticker's full price history and save it as a CSV on Drive.

  The history is fetched from Yahoo Finance via yfinance, the date index is
  copied into a regular 'date_' column, and the result is written to
  '/content/drive/My Drive/df_stock_hist.csv'.

  Args:
    name: Yahoo Finance ticker symbol, e.g. 'GOOGL'.

  Raises:
    ValueError: if no ticker symbol is given or no history was returned.
  """
  if not name:
    raise ValueError('A ticker symbol is required, e.g. name="GOOGL"')

  ticker = yf.Ticker(name)

  df_hist = ticker.history(period='max')
  if df_hist.empty:
    # Fail here instead of writing a header-only CSV that breaks later cells.
    raise ValueError('No price history returned for ticker {!r}'.format(name))

  # Keep the date as an ordinary column; the index becomes 0..n-1.
  df_hist['date_'] = df_hist.index
  df_hist.index = range(len(df_hist))

  # Write straight to the mounted Drive folder rather than creating a local
  # file and moving it with a shell command.
  df_hist.to_csv('/content/drive/My Drive/df_stock_hist.csv', index=False)

In [None]:
# Fetch the GOOGL price history and store it as df_stock_hist.csv on Drive.
dump_stock_data(name='GOOGL')

In [None]:
# Load Spark df

# Only the trading date and the opening price are used by the following cells.
df_hist = sql_context.read.csv('/content/drive/My Drive/df_stock_hist.csv', header=True, inferSchema=True).select('date_', 'Open')
# Spark DataFrames are immutable: orderBy returns a new DataFrame and has no
# pandas-style `inplace` option, so the sorted result must be assigned back.
df_hist = df_hist.orderBy('date_', ascending=True)

DataFrame[date_: string, Open: double]

In [None]:
# Build the sliding-window columns: Open_k holds the opening price found
# k rows after the current row when the data is ordered by date.
date_window = Window.orderBy('date_')
for step in range(1, 61):
  shifted_open = lead('Open', step).over(date_window)
  df_hist = df_hist.withColumn('Open_{}'.format(step), shifted_open)

# The date is no longer needed once the window columns exist. The 60th shifted
# price becomes the prediction target, the current price becomes Open_0, and
# rows whose window runs past the end of the data (nulls) are discarded.
df_hist = (
  df_hist
  .drop('date_')
  .withColumnRenamed('Open_60', 'target_price')
  .withColumnRenamed('Open', 'Open_0')
  .na.drop()
)

In [None]:
# Train Test Split
# Bucket rows by target price so the random split is stratified across the
# price range.
discretizer = QuantileDiscretizer(numBuckets=10, inputCol='target_price', outputCol='bins')

df_hist = discretizer.fit(df_hist).transform(df_hist)
# Assigning to `df_hist.bins` only sets a Python attribute and leaves the
# column untouched; the cast has to go through withColumn. Integer bucket ids
# then match the integer keys of `fractions` below.
df_hist = df_hist.withColumn('bins', col('bins').cast('int'))

# Draw 80% of every bucket for training; the seed keeps the split repeatable.
train = df_hist.sampleBy('bins', fractions={bucket: 0.8 for bucket in range(10)}, seed=10)
# NOTE(review): subtract() has EXCEPT DISTINCT semantics, so exact duplicate
# rows are collapsed and a test row identical to a train row is removed.
test = df_hist.subtract(train)

# The bucket id was only needed for the stratified sampling.
df_hist = df_hist.drop('bins')
train = train.drop('bins')
test = test.drop('bins')

In [None]:
# Calculate Mean and STD for each column in Train data
# All statistics are gathered in one aggregation, so Spark scans the training
# data once instead of launching a separate job per column.
stat_exprs = list()
for col_ in train.columns:
  stat_exprs.append(mean(col(col_)).alias('avg_{}'.format(col_)))
  stat_exprs.append(stddev(col(col_)).alias('std_{}'.format(col_)))

# A global aggregation yields exactly one row holding every avg_/std_ value.
df_stats = train.select(*stat_exprs).collect()

# Both lists follow the order of train.columns.
mean_list = [df_stats[0]['avg_{}'.format(name)] for name in train.columns]
std_list = [df_stats[0]['std_{}'.format(name)] for name in train.columns]

In [None]:
# Scale Data
def z_score(x, mean_, std_):
  """Standardise x as (x - mean_) / std_.

  Works on plain numbers and on Spark Column expressions alike, because
  Column supports the same arithmetic operators.
  """
  return (x - mean_)/std_

# Row-wise Python UDF version, retained in case other cells call it; the loop
# below no longer needs it.
scale_ = udf(lambda x, mean_, std_: z_score(x, mean_, std_), DoubleType())

# mean_list / std_list hold the training statistics in column order; the same
# training statistics are applied to both train and test.
for col_, mean_, std_ in zip(list(df_hist.columns), mean_list, std_list):
  # Native column arithmetic runs inside Spark and avoids the per-row Python
  # round trip of a UDF. Resolving the column by name with col() also avoids
  # referencing a column object that belongs to a different DataFrame.
  # Note: a zero std_ now yields null instead of raising inside the UDF.
  scaled = z_score(col(col_), mean_, std_)

  train = train.withColumn(col_+'_scaled', scaled)
  test = test.withColumn(col_+'_scaled', scaled)

In [None]:
# Pack the 60 scaled price columns into a single 'features' vector and keep
# only that vector together with the scaled target.
input_cols = []
for step in range(60):
  input_cols.append('Open_{}_scaled'.format(step))

assembler = VectorAssembler(inputCols=input_cols, outputCol='features')

model_cols = ['features', 'target_price_scaled']
train = assembler.transform(train).select(model_cols)
test = assembler.transform(test).select(model_cols)

In [None]:
# Preview the assembled test rows (feature vector and scaled target).
test.show()

+--------------------+-------------------+
|            features|target_price_scaled|
+--------------------+-------------------+
|[-1.1303337538563...|  -1.00878975315894|
|[-1.1198932345828...| -1.019233606609437|
|[-1.1230978528307...| -1.023391057709735|
|[-1.1321912093307...|-1.0088719862513797|
|[-1.1306093112925...| -1.009292302652219|
|[-1.1245981065259...|-1.0128558202281577|
|[-1.0959912620022...|-0.9973499377321561|
|[-1.0918477182529...|-0.9896472430764616|
|[-1.0871530440740...| -0.999478916838796|
|[-1.0895105867063...|-0.9959519473039415|
|[-1.0305517342706...|-1.0000454254039726|
|[-1.0311640824001...|-0.9966920590642687|
|[-1.0476974663405...|-0.9767180671187524|
|[-1.0589748556868...|-0.9854532578634487|
|[-1.0609241610470...|-1.0065968149804025|
|[-1.0655678075115...|-0.9929092393134547|
|[-1.0550048061662...|-0.9941244895362474|
|[-1.0540046318503...|-0.9970484024648409|
|[-1.0489936013997...|-1.0008403592259263|
|[-1.0595667927306...|-1.0020921574898034|
+----------