We will use *pandas*, *numpy*, *statsmodels*, and *scipy* to hold our data and do various calculations.

In [None]:
import pandas
import numpy
import scipy.stats
from statsmodels.formula.api import ols

*requests_cache* lets us download Shiller's data from the web and cache the data locally so we're not downloading it **every** time we run this.
*xlrd* is an Excel engine that lets pandas parse the Excel file and build a DataFrame from it.

In [None]:
import requests_cache
import xlrd

We will use *matplotlib* to graph a few things.
Configure it to use a different default style.
*seaborn-poster* generates bigger (and more legible) charts.

In [None]:
from matplotlib import pyplot as plt
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so prefer the new name when this
# installation offers it and fall back to the legacy one otherwise.
if 'seaborn-v0_8-poster' in plt.style.available:
    plt.style.use('seaborn-v0_8-poster')
else:
    plt.style.use('seaborn-poster')

Other assorted modules we use.

In [None]:
import datetime

We use *requests_cache* to download Shiller's excel file from his website and cache it for three days.

We do a bit of data munging on the spreadsheet -- renaming columns and deleting a few unused columns.

In [None]:
def get_shiller(url="http://www.econ.yale.edu/~shiller/data/ie_data.xls"):
    """Download Shiller's spreadsheet and return its 'Data' sheet as a DataFrame.

    The HTTP response is cached in a local sqlite cache named ``data-cache``
    for three days, so repeated runs do not download the file again.

    Parameters
    ----------
    url : str
        Location of the Excel workbook to download.

    Returns
    -------
    pandas.DataFrame
        The sheet indexed by its first column, with the price, dividend and
        earnings columns renamed to descriptive labels and two unnamed
        filler columns removed.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    import io  # local import: only this function needs it

    expire_after = datetime.timedelta(days=3)
    # The context manager closes the session once the download is finished.
    with requests_cache.CachedSession(cache_name='data-cache',
                                      backend='sqlite',
                                      expire_after=expire_after) as session:
        excel = session.get(url)
        # Fail here rather than handing an error page to the Excel parser.
        excel.raise_for_status()
        payload = excel.content

    # read_excel wants a path or file-like object, so wrap the raw bytes.
    df = pandas.read_excel(io.BytesIO(payload),
                           sheet_name='Data',
                           engine='xlrd',
                           skiprows=7,
                           skipfooter=11, # WARN: is this always 11?
                           index_col=0,
                           parse_dates=True)
    # The sheet reuses the same header for nominal and real series, so pandas
    # suffixes the repeats with '.1'. Give every series an unambiguous name.
    df = df.rename(columns={'P' : 'Price',
                            'D' : 'Dividend',
                            'E' : 'Earnings',
                            'Price' : 'Real Price',
                            'Dividend' : 'Real Dividend',
                            'Price.1' : 'Real Total Return Price',
                            'Earnings' : 'Real Earnings',
                            'Earnings.1' : 'Real Total Return Scaled Earnings'})
    # Garbage columns from Shiller's spreadsheet. errors='ignore' keeps the
    # download usable if a later revision of the sheet no longer has them.
    df = df.drop(columns=['Unnamed: 13', 'Unnamed: 15'], errors='ignore')
    return df

shiller = get_shiller()
shiller.head()

Now we can generate some derived columns.
* Dividend yield (which we use to calculate total return)
* month over month price increase
* month over month total return (which is just the price increase plus dividend yield)
* the month over month change in CPI (Shiller only provides absolute numbers)
* the real (inflation-adjusted) month over month total return
* and the average CAPE *up to a given month*. That is, it is based only on the months up to and including the current one, never on later months.

In [None]:
# Monthly dividend yield.
# NOTE(review): the /12 assumes 'Dividend' is an annualized figure -- confirm.
shiller['Div Yield'] = shiller['Dividend'].div(shiller['Price']).div(12)
# Gross month-over-month price change (this month / previous month); the
# first row has no predecessor and is therefore NaN.
shiller['MoM Price'] = shiller['Price'].div(shiller['Price'].shift())
# Gross total return: price change plus the month's dividend yield.
shiller['MoM TR'] = shiller['MoM Price'].add(shiller['Div Yield'])
# Gross month-over-month change in the CPI level.
shiller['CPI Change'] = shiller['CPI'].div(shiller['CPI'].shift())
# Deflate the gross total return by inflation, then convert to a net rate.
shiller['Real MoM TR'] = shiller['MoM TR'].div(shiller['CPI Change']).sub(1)
# Running mean of CAPE over every row up to and including the current one.
shiller['Mean CAPE'] = shiller['CAPE'].expanding().mean()
shiller.head()

We also want to be able to calculate the actual realized real returns from a given point in time.
That is, what is the actual forward 1-year (or 3- or 5- or 10-year) return from a given month.

In [None]:
def calc_returns(df, years):
    """Add a 'Returns' column with the annualized forward real return.

    For every row, 'Returns' is the compounded 'Real MoM TR' of the
    following ``years * 12`` rows, annualized by taking the ``years``-th
    root. Rows without a complete forward window are NaN.

    The column is written into ``df`` itself, and ``df`` is also returned.
    """
    window = years * 12

    def annualized_growth(monthly):
        # Compound the monthly returns, then take the years-th root.
        compounded = numpy.prod(monthly + 1)
        return numpy.power(compounded, 1/years)

    trailing = df['Real MoM TR'].rolling(window).apply(annualized_growth, raw=True) - 1
    # The trailing window that ends `window` rows ahead of a row is exactly
    # that row's forward window, so pull the values back by `window` rows.
    df['Returns'] = trailing.shift(-window)
    return df

In [None]:
# Regress the forward 10-year annualized real return on CAPE.
# calc_returns writes the 'Returns' column into `shiller` and hands it back.
model = ols(formula="Returns ~ CAPE", data=calc_returns(shiller, 10)).fit()
model.summary()

In [None]:
# Repeat the regression for every horizon from 1 to 50 years and print the
# R-squared of each fit. Each pass overwrites shiller['Returns'].
for years in range(1, 51):
    model = ols(formula="Returns ~ CAPE", data=calc_returns(shiller, years)).fit()
    print(years, model.rsquared)

In [None]:
# Recompute the 10-year forward returns (the horizon loop left a different
# horizon in the 'Returns' column), then keep only fully populated rows.
ten_year = calc_returns(shiller, 10)
nanfree = ten_year.dropna(axis=0, how='any')
# Degree-1 least-squares fit; numpy returns the coefficients as [slope, intercept].
numpy.polyfit(x=nanfree['CAPE'], y=nanfree['Returns'], deg=1)

In [None]:
def guess(intercept, slope, cape):
    """Evaluate the fitted line at ``cape``: ``intercept + slope * cape``."""
    scaled = slope * cape
    return intercept + scaled

In [None]:
# Walk forward through the sample: fit a line to every row before position i,
# then use that line to predict row i's return from row i's CAPE.
# NOTE(review): 'Returns' appears to be a forward 10-year return, so the rows
# just before i may contain returns that were not yet realized at row i's
# date (look-ahead) -- confirm whether the training window should end earlier.
data = []
# Start at 2 rows: a straight line needs at least two points, and fitting on a
# single row is rank-deficient and yields a meaningless first prediction.
for i in range(2, len(nanfree)):
    chunk = nanfree.iloc[:i]
    slope, intercept = numpy.polyfit(chunk['CAPE'], chunk['Returns'], 1)
    current = nanfree.iloc[i]
    prediction = guess(intercept, slope, current['CAPE'])
    data.append((current.name.date(), current['CAPE'], prediction, current['Returns']))
df_p = pandas.DataFrame.from_records(data, columns=['Date', 'CAPE', 'Expected', 'Actual'], index='Date')
df_p[["Expected", "Actual"]].plot()

In [None]:
# EY10 is the reciprocal of CAPE (1 / CAPE); plot it next to the forward returns.
nanfree['EY10'] = nanfree['CAPE'].rdiv(1)
nanfree.loc[:, ['EY10', 'Returns']].plot()

In [None]:
# Regress forward returns on EY10 using only the rows from 1985 onwards.
model = ols(formula="Returns ~ EY10", data=nanfree.loc['1985':]).fit()
model.summary()

In [None]:
# Take an explicit copy of the two columns from 1985 onwards, so the column
# assignments below act on an independent frame rather than on a chained
# slice of `nanfree` (which is ambiguous and can raise SettingWithCopyWarning).
post_1985 = nanfree.loc['1985':, ["EY10", "Returns"]].copy()
# Look-back distance in rows.
# NOTE(review): 10 * 12 equals ten years only if the rows are consecutive months -- confirm.
time = 10 * 12
# Change of each series relative to its value `time` rows earlier.
post_1985['EY10_delta'] = post_1985['EY10'].diff(time)
post_1985['Returns_delta'] = post_1985['Returns'].diff(time)
# The first `time` rows have no earlier value to compare against; drop them.
post_1985 = post_1985.dropna().copy()
# Reduce each change to its direction: True means the series fell.
post_1985['EY10_delta'] = post_1985['EY10_delta'] < 0
post_1985['Returns_delta'] = post_1985['Returns_delta'] < 0
# True where both series moved the same way (both fell, or neither fell).
post_1985['SameDirection'] = post_1985['EY10_delta'] == post_1985['Returns_delta']
post_1985

In [None]:
# Share of rows in which EY10 and the returns moved in the same direction.
len(post_1985.loc[post_1985['SameDirection']]) / len(post_1985)