In [1]:
import stock_data

import pandas
import matplotlib
import matplotlib.pyplot 

In [2]:
%matplotlib inline
matplotlib.style.use('ggplot')

### get data

In [3]:
# Load the price data through the project-local `stock_data` helper.
# NOTE(review): the cells below treat `data` as a DataFrame whose columns are a
# two-level index with the field name (e.g. "close") on level 1 — confirm
# against stock_data.get_all_closing_prices.
data = stock_data.get_all_closing_prices("../stock_csvs/")

### retain just closing prices

In [4]:
# Keep every row, but only the columns whose second-level label is "close".
# `.loc` with a boolean column mask replaces `.ix`, which was deprecated in
# pandas 0.20 and removed in pandas 1.0.
closing_prices = data.loc[:, data.columns.get_level_values(1).isin({"close"})]

In [5]:
# Preview the top-left 5x5 corner. The slice is positional, so use `.iloc`
# (`.ix` was removed in pandas 1.0 and fell back to positional slicing here
# because the row index is not integer-based).
closing_prices.iloc[:5, :5]

Unnamed: 0_level_0,a,aa,aapl,abbv,abc
Unnamed: 0_level_1,close,close,close,close,close
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1998-01-02,,13.3511,3.95098,,6.50799
1998-01-05,,13.5853,3.8902,,6.40419
1998-01-06,,13.2817,4.60502,,6.28477
1998-01-07,,13.3042,4.24032,,6.34839
1998-01-08,,12.7533,4.39107,,6.38299


### get returns

In [6]:
# Simple one-step forward return: (next row's price - this row's price) / this
# row's price. The value stored on a given row therefore describes the move
# *into the following row*, and the final row is NaN.
returns = (
    closing_prices
    .shift(-1)
    .sub(closing_prices)
    .div(closing_prices)
)

### correlation matrix

In [7]:
# Pairwise correlation between every pair of return columns
# (Pearson is pandas' default method; spelled out here for clarity).
corr_mat = returns.corr(method='pearson')

### for each stock, get most-correlated other stock

#### get most-correlated other stock for each stock

In [8]:
# For every stock, record (stock, most-correlated other stock, correlation).
max_corrs = []
for col in corr_mat.columns:
    ticker = col[0]
    # Drop the stock's own row explicitly instead of assuming it sorts to
    # position 0: a tie at 1.0 or a NaN self-correlation would otherwise make
    # "row 1" the wrong partner. `.iloc` replaces `.ix`, removed in pandas 1.0.
    ranked = (corr_mat[ticker]
              .drop(ticker, level=0)
              .sort_values(by='close', ascending=False))
    # Highest remaining correlation; NaNs are sorted last by sort_values.
    maximum = ranked.iloc[0]
    max_corrs.append((ticker,
                      maximum.name[0],
                      maximum.values[0]))

#### retain as DataFrame

In [9]:
# One row per stock: the stock, its most-correlated counterpart, and the
# correlation between the two.
max_corrs_df = pandas.DataFrame(
    data=max_corrs,
    columns=[
        'stock',
        'most correlated other stock',
        'correlation',
    ],
)

In [10]:
# Show the first five rows as a sanity check.
max_corrs_df.head(n=5)

Unnamed: 0,stock,most correlated other stock,correlation
0,a,tel,0.621594
1,aa,x,0.650853
2,aapl,tel,0.468218
3,abbv,jnj,0.459304
4,abc,sni,0.457664


### aggregate across 'most correlated other stock' to see if we have any repeat high-correlators

#### count number of occurrences of each most-correlated stock

In [None]:
# Count how many stocks name each ticker as their most-correlated partner.
# The resulting 'stock' column holds that count, indexed by partner ticker.
most_corr = (
    max_corrs_df
    .groupby('most correlated other stock')[['stock']]
    .count()
)

#### check out top ten

In [None]:
# Ten most frequently named partners. `.head(10)` takes the first ten rows
# positionally, which is what the removed `.ix[:10, :]` did on this
# string-labelled index.
most_corr.sort_values('stock', ascending=False).head(10)

### additional analysis

#### retain top ten

In [None]:
# Keep the ten most frequently named partners for the follow-up analysis.
# `.head(10)` replaces `.ix[:10, :]` (removed in pandas 1.0), which sliced
# positionally here because the index holds ticker strings.
top_ten = most_corr.sort_values('stock', ascending=False).head(10)

#### check out each occurrence's correlation for each top ten stock

In [None]:
# Attach every individual correlation to its top-ten partner: top_ten's index
# (the partner ticker) is matched against the partner column of max_corrs_df.
pandas.merge(
    top_ten,
    max_corrs_df[['most correlated other stock', 'correlation']],
    left_index=True,
    right_on='most correlated other stock',
)

#### average across all most-correlated occurrences for each top ten stock

In [None]:
# Mean correlation per top-ten partner, averaged over every stock that named
# that partner as its most-correlated counterpart.
(
    top_ten
    .merge(
        max_corrs_df[['most correlated other stock', 'correlation']],
        left_index=True,
        right_on='most correlated other stock',
    )
    .groupby('most correlated other stock')[['correlation']]
    .mean()
)

#### retain the above-described average correlations

In [None]:
# Store the per-partner mean correlation (one 'correlation' column, indexed by
# partner ticker) so it can be plotted below.
top_ten_avg_corr = (
    top_ten
    .merge(
        max_corrs_df[['most correlated other stock', 'correlation']],
        left_index=True,
        right_on='most correlated other stock',
    )
    .groupby('most correlated other stock')[['correlation']]
    .mean()
)

#### plot

In [None]:
# Bar chart of the average correlation for each top-ten partner.
plot = top_ten_avg_corr.plot.bar(color='Maroon', figsize=(12, 8))

### removing NaNs

#### drop all columns for which any row contains NaN

In [None]:
# Discard every column (stock) that has at least one missing price, leaving
# only stocks with a complete price history over the full date range.
closing_no_nans = closing_prices.dropna(
    axis='columns',
    how='any',
)

#### check results

In [None]:
# (rows, columns) remaining after the NaN-containing columns were dropped.
closing_no_nans.shape

In [None]:
# Preview the top-left 5x5 corner of the NaN-free frame. Positional slicing
# uses `.iloc`; `.ix` was removed in pandas 1.0.
closing_no_nans.iloc[:5, :5]