
# Stonks
## Note: Interactive Plotly figures won't show up when viewing on GitHub, but trust me, they're neat
## Import needed packages

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import pandas_datareader.data as pdr
import yfinance as yf
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
from tqdm.notebook import tqdm
# yf.pdr_override()


## Functions to get stock data and to plot share prices

In [2]:
def get_fundamentals(tickers):
    '''Gets the fundamentals data for given tickers and produces a clean dataframe from it.

    For every ticker the ``info`` mapping of ``yf.Ticker`` is filtered down to
    the fundamentals listed below. The per-ticker results are combined into
    one wide frame with a ``Ticker`` column followed by one column per
    fundamental (sorted by name). Rows with any missing fundamental are dropped.

    Parameters
    ----------
    tickers : iterable of str
        Ticker symbols to look up.

    Returns
    -------
    pandas.DataFrame
        One row per ticker that has every collected fundamental. An empty
        frame (with the expected column names) is returned when ``tickers`` is
        empty or when none of the tickers exposes any requested fundamental.
    '''

    tickers_data = {}
    fundamentals = ['forwardPE',
                    'forwardEps',
                    'sector',
                    'fullTimeEmployees',
                    'country',
                    'twoHundredDayAverage',
                    'averageDailyVolume10Day',
                    'trailingPE',
                    'marketCap',
                    'priceToSalesTrailing12Months',
                    'trailingEps',
                    'priceToBook',
                    'earningsQuarterlyGrowth',
                    'pegRatio']
    wanted = set(fundamentals)  # constant-time membership test inside the loop

    # Loop all tickers and get some interesting fundamentals.
    for ticker in tqdm(tickers):
        ticker_object = yf.Ticker(ticker)

        # Keep only the requested keys of the info mapping and turn them into
        # a two-column (Attribute, Value) frame.
        new_info = {key: value for (key, value) in ticker_object.info.items() if key in wanted}
        temp = pd.DataFrame.from_dict(new_info, orient="index")
        temp.reset_index(inplace=True)
        # Anything other than exactly two columns (e.g. no matching keys at
        # all) cannot be labelled below, so such tickers are skipped.
        if len(temp.columns) == 2:
            temp.columns = ["Attribute", "Value"]
            # add (ticker, dataframe) to main dictionary
            tickers_data[ticker] = temp

    # pd.concat raises ValueError when given nothing to concatenate, so return
    # an empty, correctly labelled frame instead of crashing.
    if not tickers_data:
        return pd.DataFrame(columns=["Ticker"] + sorted(fundamentals))

    # Long format (Ticker, Attribute, Value) -> wide format (one row per ticker).
    combined_data = pd.concat(tickers_data).reset_index().drop(columns="level_1").rename(columns={'level_0': 'Ticker'})
    combined_data = combined_data.pivot(index='Ticker', columns='Attribute', values='Value').reset_index()
    # The pivot leaves object columns; infer_objects restores numeric dtypes.
    combined_data = combined_data.rename_axis(None, axis=1).infer_objects()
    combined_data.dropna(inplace=True) # Drop if any fundamental is NA
    return combined_data

In [3]:
def get_data(mode="test"):
    '''Fetches stock tickers and fundamentals data from Yahoo or csv.

    Parameters
    ----------
    mode : str
        ``"test"`` uses a short hard-coded ticker list and always fetches the
        fundamentals anew. ``"all"`` uses the S&P 500 symbols and caches both
        the symbols and the fundamentals as csv files in the working directory.

    Returns
    -------
    tuple
        ``(tickers, fundamentals)``. ``tickers`` is a list in ``"test"`` mode
        and a pandas Series of symbols in ``"all"`` mode; ``fundamentals`` is a
        DataFrame without the ``UDR`` row. For any other ``mode`` a message is
        printed and ``0`` is returned instead of a tuple.
    '''
    if mode == "test":
        # Tickers for lighter computing
        tickers = ['FB', 'AMZN', 'AAPL', 'NFLX', 'GOOGL', 'MSFT']
        fundamentals = get_fundamentals(tickers)
    elif mode == "all":
        # Get all tickers from csv, if no csv in directory -> scrape them from wikipedia
        SP500_fileName = "SP500_symbols.csv"
        if not os.path.isfile(SP500_fileName):
            tickers = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
            tickers = tickers[0]["Symbol"]
            tickers.to_csv(SP500_fileName)
        else:
            # Read the cached symbols back as a Series, matching the freshly
            # scraped branch. Keeping the whole DataFrame here would make any
            # later iteration yield the column label "Symbol" instead of the
            # ticker symbols themselves.
            tickers = pd.read_csv(SP500_fileName)["Symbol"]

        # Get all fundamentals from csv, if no csv in directory -> scrape them from yahoo
        fundamentals_fileName = "SP500_fundamentals.csv"
        if not os.path.isfile(fundamentals_fileName):
            fundamentals = get_fundamentals(tickers)
            fundamentals.to_csv(fundamentals_fileName)
        else:
            # The cached file carries the written index as an unnamed first
            # column; tolerate caches that were written without it.
            fundamentals = pd.read_csv(fundamentals_fileName).drop(columns=['Unnamed: 0'], errors='ignore')
    else:
        # Kept for backward compatibility: unknown modes do not raise.
        print("Select mode")
        return 0

    return tickers, fundamentals[fundamentals["Ticker"] != "UDR"] # Remove UDR from data as a huge outlier


In [4]:
def monitor_stock(stock_df,stockName,github_plots=True):
    '''Shows an interactive Plotly figure with the price candles and traded volume of one stock.

    stock_df is indexed with (field, ticker) column keys: the function reads
    ('Open', stockName), ('High', stockName), ('Low', stockName),
    ('Close', stockName) and ('Volume', stockName), and uses the frame's index
    for the x axis. github_plots is accepted but not used in the body.
    The figure is displayed with fig.show(); nothing is returned.
    '''
    figure = make_subplots(
        rows=2,
        cols=1,
        shared_xaxes=True,
        vertical_spacing=0.03,
        row_width=[0.2, 0.7],
    )

    # Price candles; added without an explicit row/col, as in the layout's first subplot.
    candles = go.Candlestick(
        x=stock_df.index,
        open=stock_df[('Open', stockName)],
        high=stock_df[('High', stockName)],
        low=stock_df[('Low', stockName)],
        close=stock_df[('Close', stockName)],
        showlegend=False,
        name="Price",
    )
    figure.add_trace(candles)

    # Quick zoom buttons shown on the price axis.
    zoom_buttons = [
        {'count': 1, 'label': '1M', 'step': 'month', 'stepmode': 'backward'},
        {'count': 6, 'label': '6M', 'step': 'month', 'stepmode': 'backward'},
        {'count': 1, 'label': 'YTD', 'step': 'year', 'stepmode': 'todate'},
        {'count': 1, 'label': '1Y', 'step': 'year', 'stepmode': 'backward'},
        {'step': 'all'},
    ]
    figure.update_xaxes(
        row=1,
        col=1,
        title_text='',
        rangeslider_visible=False,
        rangeselector={'buttons': zoom_buttons},
    )

    # Traded volume as bars in the second row.
    volume_bars = go.Bar(
        x=stock_df.index,
        y=stock_df[('Volume', stockName)],
        showlegend=False,
        name="Volume",
        marker={'color': "rgba(0,0,0.8,0.66)"},
    )
    figure.add_trace(volume_bars, row=2, col=1)

    centred_title = {
        'text': stockName + ' STOCK MONITOR',
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    }
    figure.update_layout(
        width=1280,
        height=800,
        title=centred_title,
        plot_bgcolor="rgba(1,1,1,0.05)",
    )

    figure.update_yaxes(title_text='Close Price', tickprefix='$', row=1, col=1)
    figure.update_yaxes(title_text='Volume', row=2, col=1)
    figure.show()

## Monitor one example stock

In [5]:
# FB price history from the start of 2020 up to the moment this cell runs.
start = dt.datetime(year=2020, month=1, day=1)
end = dt.datetime.now()
stocks = pdr.DataReader(['FB'], data_source='yahoo', start=start, end=end)
stocks.describe()

Attributes,Adj Close,Close,High,Low,Open,Volume
Symbols,FB,FB,FB,FB,FB,FB
count,334.0,334.0,334.0,334.0,334.0,334.0
mean,245.207575,245.207575,248.777486,241.501527,245.086467,21903180.0
std,39.613106,39.613106,39.938342,39.638959,39.816334,9626736.0
min,146.009995,146.009995,148.179993,137.100006,139.75,6702000.0
25%,217.792496,217.792496,219.047504,213.990002,216.775002,15437550.0
50%,257.229996,257.229996,263.199997,254.835007,258.684998,19553500.0
75%,273.682503,273.682503,277.805008,269.539993,274.2425,25171700.0
max,325.059998,325.059998,331.799988,321.609985,330.119995,76343900.0


In [6]:
# Candlestick and volume view of the FB prices downloaded in the previous cell.
monitor_stock(stocks, stockName="FB")

## Get fundamentals data for companies

In [7]:
# Load the full S&P 500 universe; pass mode="test" instead for a quick run on a handful of tickers.
tickers, fundamentals = get_data(mode="all")
fundamentals


Unnamed: 0,Ticker,averageDailyVolume10Day,country,earningsQuarterlyGrowth,forwardEps,forwardPE,fullTimeEmployees,marketCap,pegRatio,priceToBook,priceToSalesTrailing12Months,sector,trailingEps,trailingPE,twoHundredDayAverage
0,A,1396414.0,United States,0.462,4.36,30.917430,16400.0,4.107329e+10,2.91,8.555471,7.427358,Healthcare,2.597,51.906048,119.774414
1,AAP,920185.0,United States,0.168,11.66,16.949400,40000.0,1.289753e+10,1.57,3.654468,1.276185,Consumer Cyclical,7.140,27.679274,164.982210
2,AAPL,79115057.0,United States,0.293,4.72,28.300850,147000.0,2.242554e+12,2.00,33.938007,7.624235,Technology,3.687,36.229996,125.668380
3,ABBV,4930885.0,United States,-0.987,13.87,8.069935,47000.0,1.975374e+11,2.15,15.109342,4.312667,Healthcare,2.720,41.150734,103.048010
4,ABMD,231200.0,United States,-0.106,4.91,71.521390,1536.0,1.588377e+10,4.02,12.589898,19.538795,Healthcare,4.387,80.047870,304.002440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
356,XLNX,2203175.0,United States,0.055,3.30,39.854546,4891.0,3.232223e+10,5.20,12.408718,10.587797,Technology,2.499,52.629050,129.577700
357,XYL,849575.0,United States,0.254,3.09,36.006474,15600.0,2.002702e+10,2.34,6.760649,4.107264,Industrials,1.400,79.471430,96.721260
358,ZBRA,271150.0,United States,0.178,16.95,29.297934,8800.0,2.656775e+10,3.02,12.383114,5.972966,Technology,9.350,53.112297,381.658600
359,ZION,1163275.0,United States,22.000,4.33,12.759815,9682.0,9.049950e+09,-0.32,1.228433,3.232125,Financial Services,4.919,11.231957,43.150295


## Perform PCA and other analysis

In [8]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import random

## Correlation matrix of fundamentals

In [9]:
# Pairwise correlations between the numeric fundamentals only.
num_fundamentals = fundamentals.select_dtypes(include=np.number)
num_features = num_fundamentals.columns.tolist()
num_fundamentals.corr()



Unnamed: 0,averageDailyVolume10Day,earningsQuarterlyGrowth,forwardEps,forwardPE,fullTimeEmployees,marketCap,pegRatio,priceToBook,priceToSalesTrailing12Months,trailingEps,trailingPE,twoHundredDayAverage
averageDailyVolume10Day,1.0,0.009084,-0.075786,-0.068527,0.174742,0.513959,-0.012699,-0.018946,-0.005038,-0.091094,0.057273,-0.089485
earningsQuarterlyGrowth,0.009084,1.0,-0.014637,0.008142,-0.017391,-0.009665,-0.021016,-0.002215,-0.023059,-0.004212,0.003837,-0.014515
forwardEps,-0.075786,-0.014637,1.0,-0.071815,0.128663,0.20432,-0.001921,0.038919,-0.036922,0.895018,-0.037468,0.856449
forwardPE,-0.068527,0.008142,-0.071815,1.0,-0.067595,0.078474,-0.048078,0.081142,0.715427,-0.039899,0.346244,0.193719
fullTimeEmployees,0.174742,-0.017391,0.128663,-0.067595,1.0,0.468357,-0.006676,0.101863,-0.188678,0.099886,0.060663,0.337921
marketCap,0.513959,-0.009665,0.20432,0.078474,0.468357,1.0,-0.002847,0.050552,0.141394,0.189407,0.053921,0.405702
pegRatio,-0.012699,-0.021016,-0.001921,-0.048078,-0.006676,-0.002847,1.0,-0.00104,0.08796,0.005921,-0.002542,0.013535
priceToBook,-0.018946,-0.002215,0.038919,0.081142,0.101863,0.050552,-0.00104,1.0,0.130314,0.031017,0.047617,0.098614
priceToSalesTrailing12Months,-0.005038,-0.023059,-0.036922,0.715427,-0.188678,0.141394,0.08796,0.130314,1.0,-0.036564,0.27655,0.181891
trailingEps,-0.091094,-0.004212,0.895018,-0.039899,0.099886,0.189407,0.005921,0.031017,-0.036564,1.0,-0.093909,0.783951


## Scatter matrix of numerical fundamentals

In [10]:
# fig = px.scatter_matrix(fundamentals,
#     dimensions=fundamentals.select_dtypes(include=np.number).columns.tolist(),
#     color="sector")
# fig.update_layout(width=1280,
#                     height=800)
# fig.update_traces(diagonal_visible=False)
# fig.show()

## PCA

In [11]:
def pca_on_fundamentals(data, n_components=2):
    '''Performs PCA on the numeric values of the fundamentals dataset.

    Parameters
    ----------
    data : pandas.DataFrame
        Fundamentals frame; only its numeric columns are used.
    n_components : optional
        Passed straight to ``PCA``. Defaults to 2.

    Returns
    -------
    pandas.DataFrame
        One row per input row and one column per component, named
        ``PC1``, ``PC2``, ... with a fresh default index.
    '''
    features = data.select_dtypes(include=np.number).columns.tolist()
    # Standardise every numeric column before the decomposition.
    scaled = StandardScaler().fit_transform(data.loc[:, features].values)
    pca = PCA(n_components=n_components)
    principalComponents = pca.fit_transform(scaled)
    # Label by the number of columns actually produced rather than by the
    # requested value, since the argument is handed to PCA unchanged.
    labels = [f"PC{i}" for i in range(1, principalComponents.shape[1] + 1)]
    # print("Explained variance ratios: ",pca.explained_variance_ratio_)
    return pd.DataFrame(data=principalComponents, columns=labels)

def plot_pca(data, value_for_colour='forwardPE'):
    '''Plots the PCA onto two dimensions using interactive Plotly scatterplot.

    Parameters
    ----------
    data : pandas.DataFrame
        Fundamentals frame. Must contain a ``Ticker`` column (used as the
        marker text) and the column named by ``value_for_colour``.
    value_for_colour : str
        Column whose values colour the markers and title the colour bar.
        Defaults to ``'forwardPE'``.

    The figure is displayed with ``fig.show()``; nothing is returned.
    '''
    principalDf = pca_on_fundamentals(data)

    # The component frame has a fresh default index while `data` may not, so
    # labels and colours are handed over as plain arrays and paired with the
    # components by position.
    labels = data["Ticker"].to_numpy()
    colour_values = data[value_for_colour].to_numpy()

    fig = go.Figure(go.Scatter(
        x=principalDf["PC1"],
        y=principalDf["PC2"],
        mode='markers',
        text=labels,
        marker_colorbar=dict(thickness=10, title=value_for_colour),
        marker_color=colour_values
        ))

    fig.update_layout(
        width=1280,
        height=800,
        title = {
            'text': 'PCA of tickers',
            'y':0.95,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
            plot_bgcolor =  "rgba(1,1,1,0.05)")

    fig.show()

In [12]:
# Scatter the tickers on their first two principal components.
plot_pca(data=fundamentals)

The first two principal components explain roughly 40% of the variance in the data. We should investigate whether TensorFlow could be used to analyse the data, as it could have a highly nonlinear and complex structure.