# Sample Selection

### Loading Libraries

In [18]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Warnings
import warnings

# Path
from pathlib import Path

In [19]:
# Shorthand used to slice MultiIndex levels inside .loc
idx = pd.IndexSlice

# Decile break points 0.1, 0.2, ..., 0.9; rounding removes the floating-point
# noise that np.arange accumulates
deciles = np.round(np.arange(.1, 1, .1), 1)

# Figure styling for every plot in this notebook
sns.set_style('whitegrid')

# Silence all warnings for the remainder of the session
warnings.filterwarnings('ignore')

In [22]:
# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

### Loading Data

In [25]:
# HDF5 file holding the source tables, located in the sibling `data` directory
DATA_STORE = Path('..') / 'data' / 'assets.h5'

In [29]:
# This cell only reads from the store, so open it read-only. The default
# append mode opens the file for writing and silently creates an empty file
# when DATA_STORE does not exist; mode='r' fails fast instead.
with pd.HDFStore(DATA_STORE, mode='r') as store:
    # Adjusted OHLCV columns for 2007-2016. The date slice is applied to the
    # first index level; rows with any missing value are dropped, the index
    # levels are swapped and sorted, and the 'adj_' prefix is stripped so the
    # columns become open/high/low/close/volume.
    data = (store['quandl/wiki/prices']
            .loc[idx['2007':'2016', :],
                 ['adj_open', 'adj_high', 'adj_low', 'adj_close', 'adj_volume']]
            .dropna()
            .swaplevel()
            .sort_index()
            .rename(columns=lambda x: x.replace('adj_', '')))
    # Per-stock attributes that are joined onto the price data later on
    metadata = store['us_equities/stocks'].loc[:, ['marketcap', 'sector']]

In [31]:
# Column dtypes and non-null counts of the loaded price data.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2
# and removed in pandas 2.0.
data.info(show_counts=True)

In [33]:
# Replace each sector label by an integer code (codes follow order of first
# appearance).
# NOTE(review): pd.factorize encodes missing labels as -1 rather than NaN, so
# a later dropna on 'sector' will not remove them -- confirm this is intended.
metadata['sector'] = pd.factorize(metadata['sector'])[0]

metadata.info()

In [35]:
# Attach marketcap and sector to every price row with a left join on the index
# (presumably matched on the ticker level -- confirm metadata is indexed by
# ticker), then discard the rows that ended up without a sector.
data = data.join(metadata, how='left')
data = data.dropna(subset=['sector'])

In [37]:
# Column dtypes and non-null counts after joining the metadata.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2
# and removed in pandas 2.0.
data.info(show_counts=True)

In [39]:
# Size of the universe once the metadata has been merged in
n_tickers = len(data.index.unique('ticker'))
n_dates = len(data.index.unique('date'))
print(f"# Tickers: {n_tickers:,.0f} | # Dates: {n_dates:,.0f}")

### 500 Most-Traded Stocks Selection

In [55]:
# Dollar volume per (ticker, date) row: closing price times volume
dv = data['close'] * data['volume']

In [57]:
# Rank tickers by dollar volume within each date (rank 1 = largest value) and
# lay the ranks out as a date x ticker table.
daily_rank = dv.groupby(level='date').rank(ascending=False).unstack('ticker')

# Keep tickers with at least 8 * 252 non-missing daily ranks (presumably about
# eight years of trading days), average each ticker's rank over time and pick
# the 500 tickers with the lowest, i.e. best, average rank.
top500 = daily_rank.dropna(thresh=8 * 252, axis=1).mean().nsmallest(500)

### Visualizing The 200 Most Liquid Stocks

In [115]:
# Average dollar volume per ticker, in millions, restricted to tickers with at
# least 8 * 252 non-missing observations; keep the 200 largest averages.
dollar_volume = data['close'] * data['volume']
top200 = (dollar_volume
          .unstack('ticker')
          .dropna(thresh=8 * 252, axis=1)
          .mean()
          .div(1e6)
          .nlargest(200))

# Four side-by-side panels, each showing one consecutive block of 50 tickers
cutoffs = [0, 50, 100, 150, 200]
fig, axes = plt.subplots(ncols=4, figsize=(20, 10), sharex=True)
axes = axes.flatten()

# Pair every panel with the [start, stop) slice of the ranking it displays
for ax, start, stop in zip(axes, cutoffs, cutoffs[1:]):
    top200.iloc[start:stop].sort_values().plot.barh(logx=True, ax=ax)

fig.tight_layout()
plt.show()

In [75]:
# Tickers present in the price data that did not make it into top500
current_tickers = data.index.unique('ticker')
to_drop = current_tickers.difference(top500.index)

In [77]:
# Number of tickers about to be removed
to_drop.size

In [79]:
# Remove every row whose 'ticker' index level is one of the rejected tickers
data = data.drop(index=to_drop, level='ticker')

In [81]:
# Column dtypes and non-null counts after restricting to the top-500 tickers.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2
# and removed in pandas 2.0.
data.info(show_counts=True)

In [117]:
# Size of the universe after dropping the tickers outside top500
n_tickers = len(data.index.unique('ticker'))
n_dates = len(data.index.unique('date'))
print(f"# Tickers: {n_tickers:,.0f} | # Dates: {n_dates:,.0f}")

#### Removing Outlier Observations Based on Daily Returns

In [84]:
# Row count before filtering, used to report how many rows were removed
before = data.shape[0]

# One-period percentage change of the close, computed separately per ticker
data['ret'] = data.groupby('ticker')['close'].pct_change()

# Keep rows whose return lies in [-1, 1] (bounds inclusive) and discard the
# helper column again.
# NOTE(review): Series.between evaluates to False for NaN, so each ticker's
# first row (which has no prior close) is removed as well -- confirm intended.
within_bounds = data['ret'].between(-1, 1)
data = data[within_bounds].drop(columns='ret')

n_dropped = before - len(data)
print(f'Dropped {n_dropped:,.0f}')

In [86]:
# Tickers that remain after the return filter
tickers = data.index.unique('ticker')

n_dates = len(data.index.unique('date'))
print(f"# Tickers: {len(tickers):,.0f} | # Dates: {n_dates:,.0f}")

#### Sample Price Data for Illustration

In [89]:
# Ticker used for the illustration; to sample one at random instead, use
#   ticker = np.random.choice(tickers)
ticker = 'AAPL'

# Select all rows of that ticker, then drop the now-constant 'ticker' level so
# the sample is indexed by the remaining level only
single_ticker = data.loc[idx[ticker, :], :]
price_sample = single_ticker.reset_index(level='ticker', drop=True)

In [91]:
# Summarise index, columns, dtypes and memory usage of the single-ticker sample
price_sample.info()

In [93]:
# Persist the single-ticker sample. `key` is passed by keyword because passing
# it positionally is deprecated since pandas 2.1 (keyword-only afterwards).
price_sample.to_hdf('data.h5', key='data/sample')

#### Computing Returns

In [96]:
# Group the rows by the 'ticker' index level; the grouped object is reused by
# the return calculations that follow.
by_ticker = data.groupby(level='ticker')

#### Historical Returns

In [101]:
# Lag lengths, in observations per ticker, for the historical returns:
# every lag from 1 to 5, followed by a handful of longer horizons
T = [*range(1, 6), 10, 21, 42, 63, 126, 252]

In [103]:
# Select the per-ticker close once; every lag below works on the same series
close_by_ticker = by_ticker['close']

# One column per lag, zero-padded to two digits (ret_01, ret_02, ..., ret_252),
# holding the percentage change of the close over that many observations
for lag in T:
    data[f'ret_{lag:02}'] = close_by_ticker.pct_change(periods=lag)

#### Forward Returns

In [106]:
# Forward return: within each ticker, pull the next observation's ret_01 back
# onto the current row. The rows are grouped afresh here so that the lookup of
# 'ret_01' does not depend on a groupby object built before that column existed.
data['ret_fwd'] = data.groupby(level='ticker')['ret_01'].shift(-1)

# Rows without a forward return (e.g. each ticker's final observation) cannot
# be used downstream and are removed
data = data.dropna(subset=['ret_fwd'])

### Persist Results

In [109]:
# Final check of column dtypes and non-null counts before saving.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2
# and removed in pandas 2.0.
data.info(show_counts=True)

In [111]:
# Persist the final dataset. `key` is passed by keyword because passing it
# positionally is deprecated since pandas 2.1 (keyword-only afterwards).
data.to_hdf('data.h5', key='data/top500')