# Statistical Inference of Stock Returns with Linear Regression

### Loading Libraries

In [2]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# SciPy
from scipy.stats import norm, pearsonr, spearmanr

# StatsModels
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.api import OLS, add_constant, graphics

# Warnings
import warnings

In [3]:
# Notebook-wide configuration.

# Alias for pandas' IndexSlice helper.
idx = pd.IndexSlice

# Apply seaborn's 'whitegrid' style to all subsequent figures.
sns.set_style(style='whitegrid')

# Silence every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

### Loading Data

In [10]:
# Load the 'model_data' table from the local HDF5 store.
# The store is opened read-only (mode='r'): the default mode 'a' opens the
# file for writing and creates an empty 'data.h5' when it does not exist,
# which would hide a wrong path behind a later KeyError.
with pd.HDFStore('data.h5', mode='r') as store:
    data = (store['model_data']
            .dropna()  # keep only rows without missing values
            .drop(['open', 'close', 'low', 'high'], axis=1))  # drop the four price columns

#### Select Investment Universe

In [11]:
# Keep only the rows whose dollar-volume rank is strictly below 100.
data = data.loc[data['dollar_vol_rank'] < 100]

In [12]:
# Print the column dtypes together with the per-column non-null counts.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2
# and removed in pandas 2.0.
data.info(show_counts=True)

### Creating Model Data

In [13]:
# Targets: every column whose name contains 'target'.
y = data.filter(like='target')

# Features: all remaining columns, minus the volume-related columns and
# 'consumer_durables' (presumably the omitted reference sector - TODO confirm).
X = (data
     .drop(columns=y.columns)
     .drop(columns=['dollar_vol', 'dollar_vol_rank', 'volume', 'consumer_durables']))

### Data Exploration

In [14]:
# Clustered heat map of the pairwise correlations between the target columns,
# with each cell annotated as a percentage.
sns.clustermap(
    y.corr(),
    cmap=sns.diverging_palette(h_neg=20, h_pos=220),
    center=0,
    annot=True,
    fmt='.2%',
)
plt.show()

In [15]:
# Clustered heat map of the pairwise correlations between the feature columns.
sns.clustermap(
    X.corr(),
    cmap=sns.diverging_palette(h_neg=20, h_pos=220),
    center=0,
)
# Enlarge the figure that clustermap just created so the labels stay legible.
plt.gcf().set_size_inches((14, 14))
plt.show()

In [16]:
# Reshape the feature correlation matrix into long format:
# one row per (var1, var2) pair with its correlation.
corr_mat = X.corr().stack().reset_index()
corr_mat.columns = ['var1', 'var2', 'corr']

# Discard each variable's pairing with itself, then order from the most
# positive to the most negative correlation.
corr_mat = (corr_mat
            .loc[corr_mat['var1'] != corr_mat['var2']]
            .sort_values('corr', ascending=False))

In [17]:
# Show the first five and last five rows of corr_mat in one table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
pd.concat([corr_mat.head(), corr_mat.tail()])

In [18]:
# Box plot of every target column, drawn on a single set of axes.
y.boxplot()
plt.show()

### Linear Regression for Statistical Inference: OLS with `statsmodels`

#### Ticker-wise Standardization

In [19]:
# The last ten columns are set aside and left unscaled
# (presumably the sector indicator columns - TODO confirm the count).
sectors = X.iloc[:, -10:]


def _standardize(values):
    """Return *values* centred on their mean and divided by their standard deviation."""
    return (values - values.mean()) / values.std()


# Standardize the remaining columns separately within each ticker, re-attach
# the untouched columns, and replace any resulting missing values with 0.
X = (X.drop(columns=sectors.columns)
     .groupby(level='ticker')
     .transform(_standardize)
     .join(sectors)
     .fillna(0))

#### 1-Day Returns

In [20]:
# Fit an OLS regression of the 'target_1d' column on all features
# plus a constant column, then print the fit summary.
target = 'target_1d'
model = OLS(y[target], add_constant(X))
trained_model = model.fit()

print(trained_model.summary())

#### 5-Day Returns

In [21]:
# Fit an OLS regression of the 'target_5d' column on all features
# plus a constant column, then print the fit summary.
target = 'target_5d'
model = OLS(y[target], add_constant(X))
trained_model = model.fit()

print(trained_model.summary())

#### Obtaining The Residuals

In [22]:
# In-sample predictions of the most recently fitted model, using the same
# design matrix (features plus constant) it was fitted on.
preds = trained_model.predict(exog=add_constant(X))

# Residual = observed target minus prediction.
residuals = y[target].sub(preds)

In [23]:
# Residual diagnostics in two side-by-side panels.
fig, axes = plt.subplots(ncols=2, figsize=(14, 4))
dist_ax, acf_ax = axes

# Left panel: residual distribution with a fitted normal density overlaid.
# NOTE(review): seaborn has deprecated distplot (since 0.11) - consider
# migrating to histplot/displot; that would change the rendered figure.
sns.distplot(residuals, fit=norm, ax=dist_ax, axlabel='Residuals', label='Residuals')
dist_ax.set_title('Residual Distribution')
dist_ax.legend()

# Right panel: residual autocorrelation for lags 1 through 10 (lag 0 omitted).
plot_acf(residuals, lags=10, zero=False, ax=acf_ax, title='Residual Autocorrelation')
acf_ax.set_xlabel('Lags')

sns.despine()
fig.tight_layout()
plt.show()

#### 10-Day Returns

In [24]:
# Fit an OLS regression of the 'target_10d' column on all features
# plus a constant column, then print the fit summary.
target = 'target_10d'
model = OLS(y[target], add_constant(X))
trained_model = model.fit()

print(trained_model.summary())

#### Monthly Returns

In [25]:
# Fit an OLS regression of the 'target_21d' column on all features
# plus a constant column, then print the fit summary.
target = 'target_21d'
model = OLS(y[target], add_constant(X))
trained_model = model.fit()

print(trained_model.summary())