In [1]:
import pandas as pd
import numpy as np
import yfinance as yf

In [2]:
class correlationDataHandler:
    """
    A data handler for the correlations

    Downloads daily price history for a list of tickers through ``yfinance``
    and summarises, for each look-back period, every ticker's average
    pairwise correlation with the other tickers in the list.
    """

    # Look-back windows used by get_correlations() when none are supplied.
    DEFAULT_PERIODS = ("1mo", "3mo", "6mo", "1y", "2y")

    def __init__(self, ticker_list: list):
        """
        Store the securities to analyse

        ticker_list: iterable of ticker symbols. It is copied so that
            add_security / remove_security never mutate the caller's list,
            and repeated symbols are dropped (keeping first-seen order)
            because a repeated symbol would produce duplicate (date, ticker)
            rows that cannot be pivoted.
        """
        self.ticker_list = list(dict.fromkeys(ticker_list))

    def build_frame(self, period) -> pd.DataFrame:
        """
        Get the values of interest from the list of securities from the user

        period: look-back window string passed to ``Ticker.history``
            (for example "1mo").

        Returns one long frame holding the daily history of every ticker,
        with a leading "Ticker" column and the derived "vwap" and "returns"
        columns. Tickers whose history comes back empty are skipped; when
        nothing at all is available an empty frame with the columns
        "Ticker", "vwap" and "returns" is returned.
        """
        frames = []

        for ticker in self.ticker_list:
            df = yf.Ticker(ticker).history(period=period, interval="1d")
            if df.empty:
                # history() returned no rows for this symbol; nothing to add.
                continue

            # NOTE(review): labelled volume-weighted average price, but
            # (Volume * Close) / Volume reduces algebraically to Close (and is
            # NaN on zero-volume days). Kept as-is so downstream numbers do
            # not change - confirm the intended formula.
            df["vwap"] = (df["Volume"] * df["Close"]) / df["Volume"]
            # Simple day-over-day return of the close.
            df["returns"] = df["Close"] / df["Close"].shift(1) - 1
            df.insert(0, "Ticker", ticker)
            frames.append(df)

        if not frames:
            return pd.DataFrame(columns=["Ticker", "vwap", "returns"])

        # One concat at the end instead of re-copying the result every loop.
        return pd.concat(frames)

    def clean_frame(self, period) -> pd.DataFrame:
        """
        Clean up the data

        Returns a wide frame indexed by calendar date ("date") with one
        column per ticker holding that ticker's "vwap" value. Rows without a
        "vwap" value are discarded before pivoting. Returns an empty frame
        when there is nothing to pivot.
        """
        raw = self.build_frame(period)

        # "Ticker" is always populated, so the missing-value test has to look
        # at "vwap" alone for any row to be dropped.
        vwap = raw.loc[:, ["Ticker", "vwap"]].dropna(subset=["vwap"])

        if vwap.empty:
            return pd.DataFrame(
                index=pd.Index([], name="date"),
                columns=pd.Index([], name="Ticker"),
            )

        # Take each timestamp's own wall-clock date. Converting to UTC first
        # moves a bar stamped at local midnight east of UTC onto the previous
        # calendar day. NOTE(review): assumes history() stamps daily bars at
        # exchange-local midnight - confirm against the installed yfinance.
        local_dates = [pd.Timestamp(stamp).date() for stamp in vwap.index]

        tidy = vwap.reset_index(drop=True).assign(date=local_dates)

        return tidy.pivot(index="date", columns="Ticker", values="vwap")

    def period_correlations(self, period) -> pd.DataFrame:
        """
        Get the correlations for a given time period

        Returns a frame with the columns "name" (ticker), "Correlation" (the
        mean of that ticker's correlations with every other ticker, rounded
        to 3 decimals) and "period". Tickers with no computable correlation
        are left out. Returns an empty frame with those columns when fewer
        than two tickers have data.
        """
        prices = self.clean_frame(period)

        if prices.shape[1] < 2:
            return pd.DataFrame(columns=["name", "Correlation", "period"])

        corr = prices.corr()

        # Blank out each ticker's correlation with itself by position. Testing
        # the value against 1 would also discard genuinely perfectly
        # correlated pairs and can miss a self-correlation that is not
        # exactly 1.0 after floating-point rounding.
        is_self_pair = np.eye(len(corr), dtype=bool)
        mean_corr = corr.mask(is_self_pair).mean(axis=1).dropna().round(3)

        result = (
            mean_corr.sort_index()
            .rename_axis("name")
            .reset_index(name="Correlation")
        )
        result["period"] = period

        return result

    def get_correlations(self, periods=None) -> pd.DataFrame:
        """
        loop through all time period of interest and get the correlations into one usable dataframe

        periods: optional iterable of look-back window strings; defaults to
            DEFAULT_PERIODS ("1mo", "3mo", "6mo", "1y", "2y").

        Returns the per-period summaries stacked into one frame with a fresh
        0..n-1 index.
        """
        if periods is None:
            periods = self.DEFAULT_PERIODS

        frames = [self.period_correlations(period) for period in periods]
        frames = [frame for frame in frames if not frame.empty]

        if not frames:
            return pd.DataFrame(columns=["name", "Correlation", "period"])

        # ignore_index renumbers the rows so the index is unique across periods.
        return pd.concat(frames, ignore_index=True)

    def get_securities(self) -> list:
        """
        shows the current securities list
        """
        return self.ticker_list

    def remove_security(self, security_to_remove) -> list:
        """
        removes a security from the list

        Prints a message when the security is not present. Always returns
        the current securities list.
        """
        if security_to_remove in self.ticker_list:
            self.ticker_list.remove(security_to_remove)
        else:
            print("The Security isn't in the current list")

        return self.ticker_list

    def add_security(self, security_to_add) -> list:
        """
        adds a security to the list

        Prints a message when the security is already present. Always
        returns the current securities list.
        """
        if security_to_add in self.ticker_list:
            print("The Security is already in the current list")
        else:
            self.ticker_list.append(security_to_add)

        return self.ticker_list
            

In [4]:
# Symbols to analyse. In the run recorded below, yfinance reported no data for
# POLY.L and SKF.AX (every period) and for PSDN (1mo, 3mo and 6mo).
all_tickers = ['SSSS', 'GBDC', 'POLY.L', 'SKF.AX', 'PSDN', 'VRNOF', 'LFMD', 'CLRB', 'GLATF', 'URNM', 'AAPL', 'GOOGL', 'CVX']
ticks = correlationDataHandler(all_tickers)
# Calls yfinance for every ticker in each look-back period and stacks the
# per-period average correlations into one frame.
check = ticks.get_correlations()
# Bare last expression so the notebook renders the frame.
check

POLY.L: No data found, symbol may be delisted
SKF.AX: No data found, symbol may be delisted
PSDN: No price data found, symbol may be delisted (period=1mo)
POLY.L: No data found, symbol may be delisted
SKF.AX: No data found, symbol may be delisted
PSDN: No price data found, symbol may be delisted (period=3mo)
POLY.L: No data found, symbol may be delisted
SKF.AX: No data found, symbol may be delisted
PSDN: No price data found, symbol may be delisted (period=6mo)
POLY.L: No data found, symbol may be delisted
SKF.AX: No data found, symbol may be delisted
POLY.L: No data found, symbol may be delisted
SKF.AX: No data found, symbol may be delisted


Unnamed: 0,name,Correlation,period
0,AAPL,-0.047,1mo
1,CLRB,0.113,1mo
2,CVX,0.04,1mo
3,GBDC,-0.103,1mo
4,GLATF,0.022,1mo
5,GOOGL,0.056,1mo
6,LFMD,-0.142,1mo
7,SSSS,0.064,1mo
8,URNM,0.149,1mo
9,VRNOF,0.073,1mo


In [5]:
# Exercise the list-management helpers: show the list, remove 'SSSS', then add
# it back. add_security appends, so 'SSSS' ends up last rather than first.
print(ticks.get_securities())
print(ticks.remove_security("SSSS"))
print(ticks.get_securities())
print(ticks.add_security("SSSS"))
print(ticks.get_securities())

['SSSS', 'GBDC', 'POLY.L', 'SKF.AX', 'PSDN', 'VRNOF', 'LFMD', 'CLRB', 'GLATF', 'URNM', 'AAPL', 'GOOGL', 'CVX']
['GBDC', 'POLY.L', 'SKF.AX', 'PSDN', 'VRNOF', 'LFMD', 'CLRB', 'GLATF', 'URNM', 'AAPL', 'GOOGL', 'CVX']
['GBDC', 'POLY.L', 'SKF.AX', 'PSDN', 'VRNOF', 'LFMD', 'CLRB', 'GLATF', 'URNM', 'AAPL', 'GOOGL', 'CVX']
['GBDC', 'POLY.L', 'SKF.AX', 'PSDN', 'VRNOF', 'LFMD', 'CLRB', 'GLATF', 'URNM', 'AAPL', 'GOOGL', 'CVX', 'SSSS']
['GBDC', 'POLY.L', 'SKF.AX', 'PSDN', 'VRNOF', 'LFMD', 'CLRB', 'GLATF', 'URNM', 'AAPL', 'GOOGL', 'CVX', 'SSSS']


### Next Steps
Xavier, this is excellent work! This is super slick and well written, I am really excited to use this to pull in pricing data. It is much cooler than reading in csvs.

I am sure you have a few next steps in mind, so feel free to follow up on those ideas. Additionally, there are three items that'd be really helpful:

1. **store as JSON**: I think it would likely be easiest to integrate this code with the React app / JavaScript code if we are returning the same object type that we're currently creating by reading in two .csvs. So ideally, this would create a set of {nodes, links} separately for each time period.

2. **create 'links'**: this goes hand-in-hand with step one, but the idea is that along with the average correlation per security, it'd be great to have the pairwise values.

3. **data sets as class properties**: I think it'd be best to save each of the data sets as its own class property (so for example, we'd have: ticks.3mos, ticks.6mos, ticks.12mos, etc.), and then have each property update when a security is added/removed. Check out class properties and spend a few minutes playing around with this.

This is great, thank you for your efforts! PS, please create a separate folder for these files and move them there. The 'test code' is meant to be a 'junk section' in disguise and this code is certainly not junk!