In [36]:
!pip install yfinance
!pip install python-dotenv
!pip install fmp_python
!pip install category_encoders



In [37]:
import os
from dotenv import load_dotenv
import typing
from fmp_python.fmp import FMP
import pandas as pd
import numpy as np
from urllib.request import urlopen
import json
import csv
import bs4 as bs
import requests
import category_encoders as ce
from datetime import datetime, timedelta
from pandas.tseries.holiday import USFederalHolidayCalendar
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [38]:
def fmpAPIKey(fileLocation):
  os.chdir(fileLocation)
  load_dotenv()
  apikey = os.environ.get("apikey")
  return apikey

In [39]:
def getSPYlist():
  resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
  soup = bs.BeautifulSoup(resp.text, 'lxml')
  table = soup.find('table', {'class': 'wikitable sortable'})
  tickers = []
  for row in table.findAll('tr')[1:]:
      ticker = row.findAll('td')[0].text
      ticker = ticker.replace("\n", "")
      ticker = ticker.replace(".", "-")
      tickers.append(ticker)
  return tickers

In [40]:
#!/usr/bin/env python
def get_jsonparsed_data(url):
    """
    Receive the content of ``url``, parse it as JSON and return the object.

    Parameters
    ----------
    url : str

    Returns
    -------
    dict
    """
    response = urlopen(url)
    data = response.read().decode("utf-8")
    return json.loads(data)

In [67]:
def getJsonData(dataType, ticker, apikey, period=None, startDate=None, endDate=None, limit=None):
  if dataType == 'ratio':
    url = ("https://fmpcloud.io/api/v3/ratios/"+ticker+"?period=quarter&limit="+limit+"&apikey="+apikey)
  elif dataType == 'profile':
    url = ("https://fmpcloud.io/api/v3/profile/"+ticker+"?apikey="+apikey)
  elif dataType == 'financial-growth':
    url = ("https://fmpcloud.io/api/v3/financial-growth/"+ticker+"?period=quarter&limit="+limit+"&apikey="+apikey)
  elif dataType == 'historical-price':
    url = ("https://fmpcloud.io/api/v3/historical-price-full/" + ticker + "?from=" + startDate + "&to=" + endDate + "&apikey="+apikey)
  jsonData = get_jsonparsed_data(url)
  # df = pd.DataFrame.from_dict(jsonData, orient='columns')
  return jsonData

In [68]:
def getPandasData(dataType, tickers, apikey, period=None, startDate=None, endDate=None, limit=None):
  pandasData = pd.DataFrame()
  failTickers = []
  for ticker in tickers:
    try:
      jsonData = getJsonData(dataType, ticker, apikey, period, startDate, endDate, limit)
      #price data is nested json. it can't extract directly from Json
      if dataType == 'historical-price':
        df = pd.DataFrame.from_dict(jsonData['historical'], orient='columns')
        df['symbol'] = jsonData['symbol']
      else:
        df = pd.DataFrame.from_dict(jsonData, orient='columns')
      pandasData = pd.concat([pandasData, df], axis=0)
    except:
      failTickers.append(ticker)
      pass
  return pandasData

In [69]:
def getForwarDate(pandasData):
  #Generate market closure date array
  cal = USFederalHolidayCalendar()
  holidays = cal.holidays(start='2000-01-01', end='2021-12-31')
  holidays = np.array(holidays, dtype='datetime64[D]')
  #Get the first busday 45 after the quater ends
  pandasData['forward_date'] = pandasData['date'].apply(lambda x: np.busday_offset(x, 45, roll='forward', holidays=holidays))
  pandasData['forward_date'] = pandasData['forward_date'].astype('str')

In [70]:
def getOutPerform(mergedPandas):
  mergedPandas = mergedPandas.sort_values(by=['symbol','date']).reset_index(drop=True)
  shifted = mergedPandas.shift(periods=-1, axis="rows", fill_value=0)
  mergedPandas['eq'] = mergedPandas['symbol'].eq(shifted['symbol'])
  subtract = mergedPandas[['spyP','tickerP']].subtract(shifted[['spyP','tickerP']])
  mergedPandas[['spyC','tickerC']] = subtract[['spyP','tickerP']] / shifted[['spyP','tickerP']]
  mergedPandas = mergedPandas[mergedPandas['eq'] == True]
  mergedPandas['diff'] = mergedPandas['tickerC'].subtract(mergedPandas['spyC']) 
  mergedPandas['outPerform'] = mergedPandas['diff'] >= 0.1
  return mergedPandas

In [None]:
apikey = fmpAPIKey("/content/drive/MyDrive/Colab Notebooks")

In [71]:
spyList = getSPYlist()
# spyList = spyList[:2]
ratio = getPandasData('ratio', spyList, apikey, limit="83")
profile = getPandasData('profile', spyList, apikey)
finincialGrowth = getPandasData('financial-growth', spyList, apikey, limit="83")
individualPrice = getPandasData('historical-price', spyList, apikey, startDate = "2000-01-01", endDate= "2021-09-25")
spyPrice = getPandasData('historical-price', "SPY", apikey, startDate = "2000-01-01", endDate= "2021-09-25")


In [72]:
getForwarDate(ratio)

In [73]:
#drop unnecessary columns and change names
ratio = ratio.drop(columns='period')
# finincialGrowth = finincialGrowth.drop(columns='period')
profile = profile[['symbol','industry','sector']]
individualPrice = individualPrice[['date','adjClose','symbol']]
individualPrice = individualPrice.rename(columns={"date":"forward_date", 'adjClose':'tickerP'})
spyPrice = spyPrice[['date','adjClose']]
spyPrice = spyPrice.rename(columns={"date":"forward_date", "adjClose": "spyP"})


In [74]:
#merge all data
spyKeyData = ratio.merge(profile, how='left', on='symbol')
spyKeyData = spyKeyData.merge(finincialGrowth, how='left', on=['symbol','date'])
spyKeyData = spyKeyData.merge(individualPrice, on=['symbol','forward_date'])
spyKeyData = spyKeyData.merge(spyPrice, on='forward_date')

In [50]:
#Drop if more than 10% of data is missing
tooManyMissing = pd.DataFrame(spyKeyData.isnull().sum() > spyKeyData.shape[0] /1000)
tooManyMissing = tooManyMissing[tooManyMissing[0] == True].index.tolist()

#Keep current/quick/cash Ratio
del tooManyMissing[:3]

spyKeyData = spyKeyData.drop(tooManyMissing, axis=1)
# spyKeyData = spyKeyData.dropna()

#Exclude 2021 2Q data
spyKeyData2Q = spyKeyData[spyKeyData.date == '2021-06-30'] 
spyKeyData = spyKeyData[spyKeyData.date != '2021-06-30']

In [51]:
spyKeyData = getOutPerform(spyKeyData)

In [52]:
y = spyKeyData['outPerform']
X = spyKeyData.drop(columns=['outPerform','symbol','date','forward_date','tickerP','spyP','tickerC','spyC','eq','diff'])
# X = spyKeyData

In [53]:
categorical_cols = ['industry','sector']
numerical_cols = list(set(X.columns.to_list()) - set(categorical_cols))

In [54]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [55]:
rf_stock_model = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_leaf=5)

In [56]:
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', rf_stock_model)
                             ])

In [57]:
# spilt data
X_train, X_valid, y_train, y_valid = train_test_split(X, y)

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# model score
my_pipeline.score(X_valid, y_valid)

0.810423951048951