In [None]:
from datetime import datetime
import yfinance as yf
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import pandas as pd
import numpy as np
import itertools
import pickle
import seaborn
import json  
import os

In [None]:
# Notebook-wide switch: plotting cells call savefig only when this is True.
SAVE_FIGS = False

# Apply seaborn's default theme to every matplotlib figure drawn below.
seaborn.set()

# Load Model-Dataset Data

In [None]:
# Load the results of every model/data combination.
# NOTE(review): unpickling can execute arbitrary code - only load files that
# were produced by this project's own pipeline.
# os.path.join keeps the path valid on non-Windows systems as well.
raw = pd.read_pickle(os.path.join('Output', 'combination_results.pkl'))

# Base model = the part of the model name before the first underscore.
# The vectorised .str accessor replaces a row-wise apply(axis=1).
raw['Base_model'] = raw['Model'].str.split('_', n=1).str[0]
raw

# Coverage vs Correlation

In [None]:
# Plot every model-data combination as correlation vs coverage
fig, ax = plt.subplots()
ax.scatter(raw['Coverage'], raw['Avg-R'])
ax.set_ylabel('Correlation')
ax.set_xlabel('Coverage')
if SAVE_FIGS:
    # Build the path with os.path.join: the previous literal relied on the
    # invalid escape sequences "\S" and "\A" (a SyntaxWarning on recent
    # Python versions) and only worked with Windows path separators.
    fig.savefig(os.path.join('Figures', 'Summary', 'All_Corr_Cov.png'))
plt.show()

In [None]:
# Plot the effect of each model's parameter as correlation vs coverage,
# restricted to the 'Daily' data set.
data = raw[raw['Data'] == 'Daily']


def _rows_for(base_model):
    """Return the Coverage / Avg-R columns of `data` for one base model."""
    return data.loc[data['Base_model'] == base_model, ['Coverage', 'Avg-R']]


AgglomerativeClustering = _rows_for('AgglomerativeClustering')
DBSCAN = _rows_for('DBSCAN')
KMeans = _rows_for('KMeans')
AffinityPropagation = _rows_for('AffinityPropagation')

fig, ax = plt.subplots()
# Labels are attached to each artist so the legend cannot drift out of sync
# with the plotting order.
ax.plot(DBSCAN['Coverage'], DBSCAN['Avg-R'], label='DBSCAN')
ax.plot(AgglomerativeClustering['Coverage'], AgglomerativeClustering['Avg-R'],
        label='AgglomerativeClustering')
ax.plot(KMeans['Coverage'], KMeans['Avg-R'], label='KMeans')
# AffinityPropagation is drawn as points rather than as a line.
ax.scatter(AffinityPropagation['Coverage'], AffinityPropagation['Avg-R'],
           label='AffinityPropagation')
ax.legend()
ax.set_ylabel('Correlation')
ax.set_xlabel('Coverage')
if SAVE_FIGS:
    # os.path.join replaces a literal that depended on the invalid escape
    # sequences "\S" and "\M" and on Windows path separators.
    fig.savefig(os.path.join('Figures', 'Summary', 'Model_Corr_Cov.png'))
plt.show()

# Model Analysis

In [None]:
# Plot all combinations, coloured by base model
colors = {'AgglomerativeClustering':'green', 'DBSCAN':'blue','AffinityPropagation':'red','KMeans':'purple'}

# One proxy marker per base model for the legend; each proxy carries its own
# model name as label instead of a shared placeholder.
legend_elements = [
    Line2D([0], [0], marker='o', color='w', label=model,
           markerfacecolor=color, markersize=10)
    for model, color in colors.items()
]

fig, ax = plt.subplots()
# A base model missing from `colors` raises KeyError here rather than being
# silently drawn in a default colour.
ax.scatter(raw['Coverage'], raw['Avg-R'],
           c=[colors[model] for model in raw['Base_model']])
ax.set_ylabel('Correlation')
ax.set_xlabel('Coverage')
ax.legend(handles=legend_elements)
if SAVE_FIGS:
    # os.path.join replaces a literal that depended on the invalid escape
    # sequences "\S" and "\A" and on Windows path separators.
    fig.savefig(os.path.join('Figures', 'Summary', 'All_models_Corr_Cov.png'))
plt.show()

# Data Analysis - Best Correlation Per Data Set

In [None]:
# Get the best model for each data set, considering only combinations whose
# coverage is above 0.5.
covered = raw[raw['Coverage'] > .5]

# Highest average correlation per data set (indexed by 'Data').
maxs = covered[['Data','Avg-R']].groupby('Data').agg('max')

# Merge against the *filtered* rows: merging against the full `raw` frame
# could also pick up a low-coverage combination that happens to have the same
# Avg-R as the per-data-set maximum.
bests = pd.merge(covered, maxs, on=['Avg-R','Data'])
bests = bests.sort_values('Avg-R', ascending=False)[['Avg-R','Coverage','Data','Model']]
bests

# Load all clusters from all combinations

In [None]:
# Load all clusters from every model/data combination
def load_all_clusters(corr_dir=os.path.join('Output', 'Correlations'),
                      clusters_dir=os.path.join('Output', 'Clusters')):
    """Stack the per-combination correlation and cluster JSON files.

    Each file found under `corr_dir` maps a cluster key to its correlation;
    the file with the same name in `clusters_dir` maps the same keys to the
    cluster contents.

    Returns a DataFrame with the columns 'Corr', 'Cluster', 'Cluster_len'
    and 'File', one row per cluster. The frame is empty (but has these
    columns) when no file is found.

    Raises KeyError when a correlation key has no entry in the cluster file.
    """
    columns = ['Corr', 'Cluster', 'Cluster_len', 'File']
    frames = []
    for subdir, _dirs, files in os.walk(corr_dir):
        for file in files:
            # Open the correlation file where os.walk actually found it.
            with open(os.path.join(subdir, file), 'r') as fp:
                correlations = json.load(fp)
            with open(os.path.join(clusters_dir, file), 'r') as fp:
                clusters = json.load(fp)

            clusters_df = pd.DataFrame.from_dict(correlations, orient='index')
            clusters_df = clusters_df.reset_index().rename(columns={'index': 'key', 0: 'Corr'})
            # Plain lookups instead of a row-wise apply(axis=1).
            clusters_df['Cluster'] = [clusters[key] for key in clusters_df['key']]
            clusters_df['Cluster_len'] = clusters_df['Cluster'].map(len)
            clusters_df['File'] = file
            frames.append(clusters_df.drop('key', axis=1))

    if not frames:
        return pd.DataFrame(columns=columns)
    # Concatenate once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame inside a loop copies it on every iteration.
    return pd.concat(frames)


all_clusters = load_all_clusters()
all_clusters
    

# Notable clusters

In [None]:
# Highly correlated clusters across all models and data sets:
# more than three members and a correlation above 0.95.
has_enough_members = all_clusters['Cluster_len'] > 3
is_highly_correlated = all_clusters['Corr'] > .95
notable_clusters = all_clusters[has_enough_members & is_highly_correlated]
notable_clusters

In [None]:
# [ALK, DAL, LUV, UAL]	all airlines - without using GICS
# [CCL, MGM, NCLH, RCL] cruises and resorts

# Best Model-Data Analysis

In [None]:
# Top clusters of the top model-data combination
best_combo_file = 'AgglomerativeClustering_250_Daily+Weekly.json'
from_best_combo = all_clusters['File'] == best_combo_file
is_highly_correlated = all_clusters['Corr'] > .95
# at least three members, for nice visualizations
is_plottable = all_clusters['Cluster_len'] >= 3
notable_clusters = all_clusters[from_best_combo & is_highly_correlated & is_plottable]
notable_clusters

In [None]:
# Download the daily closing prices for every notable cluster
ClustersVis = notable_clusters['Cluster'].tolist()
start = datetime(2019, 11, 1)
end = datetime(2020, 8, 1)

results = []
for tickers in ClustersVis:
    # One download per cluster; only the 'Close' prices are kept.
    history = yf.download(
        tickers,
        start=start,
        end=end,
        interval='1d',
        prepost=True,
        threads=False,
    )
    results.append(history['Close'])

In [None]:
# Plot the daily close time series of each cluster, one figure per cluster
for result in results:
    fig, ax = plt.subplots()
    ax.set_title(', '.join(result.columns) + ' Daily Close')
    ax.plot(result)
    ax.tick_params(axis='x', labelrotation=25)
    ax.legend(result.columns)
    if SAVE_FIGS:
        # os.path.join replaces a literal that depended on the invalid escape
        # sequence "\T" and on Windows path separators.
        fig.savefig(os.path.join('Figures', 'Top_Model',
                                 '_'.join(result.columns) + '_DailyClose.png'))
    plt.show()

In [None]:
# View the GICS sector / sub-industry of each cluster's members
# 'ANSI' is a codec alias that only exists on Windows, so it is kept there and
# replaced elsewhere.
# NOTE(review): cp1252 is assumed to be the code page the CSV was saved with -
# confirm against the data file.
gics_encoding = 'ANSI' if os.name == 'nt' else 'cp1252'
# os.path.join replaces a literal that depended on the invalid escape
# sequence "\G" and on Windows path separators.
GICS = pd.read_csv(os.path.join('Data', 'GICS-wiki.csv'),
                   encoding=gics_encoding).set_index('Stock', drop=True)
for result in results:
    print(GICS.loc[GICS.index.isin(result.columns), ['GICS Sector', 'GICS Sub Industry']])