# Introduction to Stock Analysis using PySpark

## 1. Introduction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import OneHotEncoderEstimator, VectorAssembler, StringIndexer
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.sql import functions as F
from pyspark.sql.window import Window


In [None]:
# Start (or reuse) the Spark session that every cell below runs against.
spark = (
    SparkSession.builder
    .appName("StockAnalysis")
    .getOrCreate()
)

# To analyse several stocks at once, load the whole data directory instead:
# df = spark.read.format('parquet').load('../data/**.parquet')

# Single-stock run: read only the AMZN parquet file.
parquet_reader = spark.read.format('parquet')
df = parquet_reader.load('../data/AMZN.parquet')

## Lowest Return

In [None]:
# Per-row log return in percent: 100 * ln(LastTradePrice / FirstTradePrice).
price_ratio = F.col("LastTradePrice") / F.col("FirstTradePrice")
df = df.withColumn("Return", F.log(price_ratio) * 100)

# Summary statistics of the new column; the "min" row is the lowest return.
df.describe("Return").show()

## Month with the Highest Average Return

In [None]:
# Tag every row with the calendar month extracted from its trade date.
df = df.withColumn("Month", F.month(F.col("TradeDate")))

# Mean return per month, sorted ascending, so the month with the highest
# average return is the last row of the collected list.
monthly_return = df.groupBy("Month").agg(F.avg("Return"))
monthly_return.sort("avg(Return)").collect()

## Day with the Highest Average Returns

In [None]:
# 'Date' holds the day-of-year number of the trade date, not a full date.
df = df.withColumn("Date", F.dayofyear(F.col("TradeDate")))

# Mean return per day-of-year, sorted ascending, so the day with the highest
# average return is the last row of the collected list.
daily_return = df.groupBy("Date").agg(F.avg("Return"))
daily_return.sort("avg(Return)").collect()

## 50-minute moving average

In [None]:
# Trailing 50-row window: the current row plus the 49 rows before it.
# rowsBetween() is inclusive on both ends, so (-49, 0) spans exactly 50 rows;
# (-50, 0) would average 51 observations.
# Partitioning by ticker keeps the average from mixing prices of different
# stocks when the multi-stock parquet load is used; with a single ticker the
# result is the same as an unpartitioned window.
# NOTE(review): rows are ordered by TradeDate only. If the data holds several
# bars per trade date, the order inside a date is not deterministic -- confirm
# whether BarDateTime should be the ordering column here.
windowSpec = (
    Window.partitionBy("Ticker")
    .orderBy(F.col("TradeDate"))
    .rowsBetween(-49, 0)
)

# Moving average of the last trade price over that trailing window.
df = df.withColumn('50DMA', F.avg("LastTradePrice").over(windowSpec))

In [None]:
# Percent log-deviation of the last trade price from its 50DMA value:
# 100 * ln(LastTradePrice / 50DMA).
price_to_average = F.col("LastTradePrice") / F.col("50DMA")
deviation = F.log(price_to_average) * 100
df = df.withColumn("50_DMA_DEV", deviation)

# Preview the inputs next to the derived deviation.
df.select("TradeDate", "LastTradePrice", "50DMA", "50_DMA_DEV").show()

In [None]:
# Sort by deviation, largest first, and show the top row as a plain dict.
by_deviation = df.sort(F.col("50_DMA_DEV").desc())
by_deviation.take(1)[0].asDict()


## Most Active Trading Month

In [None]:
# Mean traded volume per month, sorted ascending, so the most active month
# is the last row of the collected list.
monthly_volume = df.groupBy("Month").agg(F.avg("Volume"))
monthly_volume.sort("avg(Volume)").collect()

## Most Active Trading Day

In [None]:
# Mean traded volume per 'Date' value, sorted ascending, so the most active
# day is the last row of the collected list.
daily_volume = df.groupBy("Date").agg(F.avg("Volume"))
daily_volume.sort("avg(Volume)").collect()

## Describe 1-minute Returns

In [None]:
# Mean return per ticker, sorted ascending by that mean.
# Spark resolves the 'mean' dict aggregate to avg, so the output column is
# named 'avg(Return)'; ordering by 'mean(Return)' fails because no column of
# that name exists.
df.groupBy('Ticker').agg({'Return': 'mean'}).orderBy('avg(Return)').collect()

## Average Return per Ticker

In [None]:
# Average return for each ticker, lowest average first.
ticker_return = df.groupBy("Ticker").agg(F.avg("Return"))
ticker_return.sort("avg(Return)").collect()

In [None]:
# Group by ticker, average the returns, and list the groups in ascending
# order of that average.
per_ticker = df.groupBy("Ticker")
per_ticker.agg(F.avg("Return")).sort("avg(Return)").collect()

## 25th, 50th, and 75th percentiles of return

In [None]:
# Approximate 25th / 50th / 75th percentiles of Return in a single pass.
# The third argument is percentile_approx's accuracy setting.
quartiles = F.percentile_approx("Return", [0.25, 0.5, 0.75], 1000000)
df.select(quartiles.alias("quantiles")).collect()

## Correlation between 1-minute returns and uptick volume

In [None]:
# Correlation between Return and UpTickVolume, computed per ticker.
return_volume_corr = F.corr("Return", "UpTickVolume").alias("corr")
df.groupBy("Ticker").agg(return_volume_corr).collect()

## 1 minute return kurtosis

In [None]:
# Kurtosis of the Return column, computed per ticker.
return_kurtosis = F.kurtosis("Return")
df.groupBy("Ticker").agg(return_kurtosis).collect()

## 1 minute return skewness

In [None]:
# Skewness of the Return column, computed per ticker.
return_skewness = F.skewness("Return")
df.groupBy("Ticker").agg(return_skewness).collect()

## 60 Min Moving Average

In [None]:
# Trailing 60-row window over the bars in BarDateTime order: the current row
# plus the 59 rows before it. rowsBetween() is inclusive on both ends, so
# (-59, 0) spans exactly 60 rows; (-60, 0) would average 61 observations.
# Partitioning by ticker keeps the average from mixing prices of different
# stocks when the multi-stock parquet load is used; with a single ticker the
# result is the same as an unpartitioned window.
windowSpec1 = (
    Window.partitionBy("Ticker")
    .orderBy(F.col("BarDateTime"))
    .rowsBetween(-59, 0)
)

# Moving average of the last trade price over the 60-row window. This must use
# windowSpec1: the earlier `windowSpec` is the 50DMA window ordered by
# TradeDate, which would silently give the wrong average here.
df = df.withColumn('60MMA', F.avg("LastTradePrice").over(windowSpec1))

In [None]:
# Percent log-deviation of the last trade price from its 60MMA value:
# 100 * ln(LastTradePrice / 60MMA).
price_to_average_60 = F.col("LastTradePrice") / F.col("60MMA")
deviation1 = F.log(price_to_average_60) * 100
df = df.withColumn("60_MMA_DEV", deviation1)

# Preview the inputs next to the derived deviation.
df.select("BarDateTime", "LastTradePrice", "60MMA", "60_MMA_DEV").show()