In [2]:
import pyspark.sql.functions as psf
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import time
import requests
import pandas as pandas
from dotenv import load_dotenv
from pathlib import Path
import os


spark = SparkSession.builder.getOrCreate()

# Load environment variables from .env file
load_dotenv()
# Access the API key
api_key = os.getenv("API_KEY")

# Read in csv and replace FB and ANTM tickers (they have changed)
df_csv = spark.read.csv("./input_data/stocks.csv", header=True, sep=',') \
    .withColumn('symbol', psf.regexp_replace('symbol', 'FB', 'META')) \
    .withColumn('symbol', psf.regexp_replace('symbol', 'ANTM', 'ELV')) \
    .withColumn('initial_investment', psf.lit(10000))

# convert back to a python dict to make api requests easier
pandas_df_csv = df_csv.toPandas()
stocks_dict = pandas_df_csv.to_dict(orient='records')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/08 16:30:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [11]:
def request_prices_polygon(stocks_dict, date_period):
  modified_dict =stocks_dict.copy()

  for i, stock in enumerate(stocks_dict):
    ticker = stock['symbol'] 
    # issue when requesting data for FB and ANTM - the tickers have changed over time
      #FB changed on June 9, 2022
    if date_period < '2022-06-09' and ticker=='META':
        print('changing META ticker to old version')
        ticker='FB'
        #ANTM changed on June 28, 2022
    if date_period < '2022-06-28' and ticker=='ELV':
        print('changing ELV ticker to old version')
        ticker='ANTM'
        
    url = f'https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/day/{date_period}/{date_period}?apiKey={api_key}'
  
    try:
      response = requests.get(url=url)
      # Polygon only allows 5 requests per min - status 409 means too many requests
      if response.status_code==429:
        print('maximum requests... waiting 1 minute')
        time.sleep(60)
        response = requests.get(url=url)

      stock_data = response.json()
      print(f"response: {response.status_code}, ticker: {ticker}")
      
      # Add share price to dict
      closing_price = float(stock_data['results'][0]['c'])
      modified_dict[i][f'{date_period}_price'] = round(closing_price, 2)

    except Exception as e:
      print(f"error requesting data for {ticker}")
      raise e
  
  return modified_dict

In [12]:
current_date = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
old_date = (datetime.today() - relativedelta(years=2)).strftime('%Y-%m-%d')

# since polygon free tier is only 5 requests per min, this method takes 40 mins to run
dict_2019 = request_prices_polygon(stocks_dict=stocks_dict, date_period=old_date)
dict_current = request_prices_polygon(stocks_dict=dict_2019, date_period=current_date)

response: 200, ticker: AAPL
response: 200, ticker: MSFT
response: 200, ticker: AMZN
response: 200, ticker: TSLA
response: 200, ticker: GOOGL
maximum requests... waiting 1 minute
response: 200, ticker: GOOG
response: 200, ticker: BRK.B
response: 200, ticker: JNJ
response: 200, ticker: UNH
response: 200, ticker: NVDA
changing META ticker to old version
maximum requests... waiting 1 minute
response: 200, ticker: FB
response: 200, ticker: PG
response: 200, ticker: JPM
response: 200, ticker: XOM
response: 200, ticker: V
maximum requests... waiting 1 minute
response: 200, ticker: HD
response: 200, ticker: CVX
response: 200, ticker: MA
response: 200, ticker: ABBV
response: 200, ticker: PFE
maximum requests... waiting 1 minute
response: 200, ticker: BAC
response: 200, ticker: KO
response: 200, ticker: COST
response: 200, ticker: PEP
response: 200, ticker: AVGO
maximum requests... waiting 1 minute
response: 200, ticker: LLY
response: 200, ticker: WMT
response: 200, ticker: CSCO
response: 200, t

In [16]:

# window to rank by the %change in share price
max_window = window = Window.partitionBy("initial_investment").orderBy(psf.col('change_percentage').desc())
select_order = ['company_name', 'symbol', 'initial_investment', 'no_shares',f'{old_date}_price' ,f'{current_date}_price', 'change_percentage', 'current_value', 'rank']

# manipulate data to find price changes and value of investments
df = spark.createDataFrame(dict_current) \
  .withColumn('no_shares', psf.col('initial_investment')/psf.col(f'{old_date}_price')) \
  .withColumn('current_value', psf.col('no_shares')*psf.col(f'{current_date}_price')) \
  .withColumn('change_percentage', (psf.try_subtract(f'{current_date}_price', f'{old_date}_price'))/psf.col(f'{old_date}_price')*100) \
  .withColumn("rank", psf.rank().over(max_window)) \
  .select(select_order)

def round_cols(df, cols):
  for col in cols:
    df = df.withColumn(col, psf.round(col, 2))
  return df
    
# round to 2 decimal place
df = round_cols(df, cols=[f'{current_date}_price',f'{old_date}_price','no_shares', 'current_value', 'change_percentage'])

# sum of all the current investments' value
sum_of_investments = df.groupBy().sum().collect()

df.show()

+--------------------+------+------------------+---------+----------------+----------------+-----------------+-------------+----+
|        company_name|symbol|initial_investment|no_shares|2022-02-07_price|2024-02-06_price|change_percentage|current_value|rank|
+--------------------+------+------------------+---------+----------------+----------------+-----------------+-------------+----+
|Eli Lilly and Com...|   LLY|             10000|    41.06|          243.55|          705.03|           189.48|     28948.06|   1|
|  NVIDIA Corporation|  NVDA|             10000|    40.44|          247.28|          682.23|           175.89|     27589.37|   2|
|       Broadcom Inc.|  AVGO|             10000|    17.02|          587.63|         1222.65|           108.06|     20806.46|   3|
|Meta Platforms In...|  META|             10000|    44.46|          224.91|          454.72|           102.18|     20217.86|   4|
|    Merck & Co. Inc.|   MRK|             10000|    128.9|           77.58|          126.8

In [60]:
greatest_relative_inrease = df.filter("rank==1").collect()[0]

initial_investment_total = sum_of_investments[0]['sum(initial_investment)']
current_investment_total = sum_of_investments[0]['sum(current_value)']

from pathlib import Path
Path("./outputs").mkdir(parents=True, exist_ok=True)

# write answers to txt file
with open('./outputs/results.txt', 'w') as f:
    print('Stock price comparison\n', f'Current date: {current_date}\n', f'Start date: {old_date}\n' , file=f)

with open('./outputs/results.txt', 'a') as f:
  print("Greatest relative increase: ", greatest_relative_inrease['company_name'], f"({greatest_relative_inrease['symbol']})",file=f)
  print("Growth %: ", greatest_relative_inrease['change_percentage'],file=f)
  print("Gross profit %: ", greatest_relative_inrease['current_value']-greatest_relative_inrease['initial_investment'], '\n',file=f)
  
  print("Initial total investment: ", initial_investment_total, file=f)
  print("Current total investment (growth): ", current_investment_total, file=f)

# save final datafram as csv
# I convert to pandas first because spark saves csv's in a folder format (like it does with .delta and .parquet)
pdf = df.toPandas()
pdf.to_csv('./outputs/polygon_stock_data.csv', sep=',', encoding='utf-8')

In [8]:
if (Path.cwd() / 'outputs' / 'polygon_stock_data.csv').exists():
    df_previous_output = spark.read.csv('./outputs/polygon_stock_data.csv', header=True, sep=',')
    furthest_date = df_previous_output.select(psf.max(''))
    df_previous_output.show()
    print('haha')

current_date = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
old_date = (datetime.today() - relativedelta(years=2)).strftime('%Y-%m-%d')

# since polygon free tier is only 5 requests per min, this method takes 40 mins to run
# dict_2019 = request_prices_polygon(stocks_dict=stocks_dict, date_period=old_date)
# dict_current = request_prices_polygon(stocks_dict=dict_2019, date_period=current_date)

+---+--------------------+------+------------------+---------+----------------+----------------+-----------------+-------------+----+
|_c0|        company_name|symbol|initial_investment|no_shares|2022-02-08_price|2024-02-07_price|change_percentage|current_value|rank|
+---+--------------------+------+------------------+---------+----------------+----------------+-----------------+-------------+----+
|  0|Eli Lilly and Com...|   LLY|             10000|    41.68|          239.91|          725.38|           202.36|      30235.5|   1|
|  1|  NVIDIA Corporation|  NVDA|             10000|    39.83|          251.08|          700.99|           179.19|     27918.99|   2|
|  2|Meta Platforms In...|  META|             10000|    45.42|          220.18|          469.59|           113.28|     21327.55|   3|
|  3|       Broadcom Inc.|  AVGO|             10000|    16.66|          600.22|         1257.06|           109.43|     20943.32|   4|
|  4|    Merck & Co. Inc.|   MRK|             10000|   130.02|

24/02/08 16:34:38 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , company_name, symbol, initial_investment, no_shares, 2022-02-08_price, 2024-02-07_price, change_percentage, current_value, rank
 Schema: _c0, company_name, symbol, initial_investment, no_shares, 2022-02-08_price, 2024-02-07_price, change_percentage, current_value, rank
Expected: _c0 but found: 
CSV file: file:///Users/jowen/Desktop/tasks/stock-analysis/outputs/polygon_stock_data.csv
