## Team Members:
#### Binlin Chi  
#### Hanyuan Hu

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = 20, 10

In [2]:
from sklearn.naive_bayes import *
from sklearn.ensemble import *

Intel(R) Data Analytics Acceleration Library (Intel(R) DAAL) solvers for sklearn enabled: https://intelpython.github.io/daal4py/sklearn.html


In [3]:
from naive_bayes_classifier.load_data import str2word_bag
from naive_bayes_classifier.configure import *
from back_test import *
from utils import *

In [4]:
import pickle
import pandas as pd
import numpy as np
import datetime as dt

## 1. Accquiring Speeches From the Fed's Website

Using web crawler, can be found in *data_crawler.py*

In [5]:
rate_se, rst_lst = load_data()

rst_lst[:1]

## 2. Five-Year Real Treasury Yield (from 2004 til Now)  

def display_impactful_speech(rst_lst_=rst_lst, rates = rate_se, time_window=10, show_ratio=0.2):
    speech_dates = pd.to_datetime([rst_lst[i][1] for i in range(len(rst_lst))])
    speech_dates = pd.Series(speech_dates)
    speech_dates.sort_values(inplace=True)
    speech_dates = speech_dates.reset_index(drop=True).reindex(index=speech_dates)
    
    vol_after_speech = speech_dates.copy()
    for date in speech_dates.index:
        vol = np.std(rates[date:date+dt.timedelta(days=time_window)])
        vol_after_speech[date] = vol
        
    vol_after_speech = vol_after_speech.sort_values(ascending=False)
    dates_to_display = vol_after_speech[:int(len(vol_after_speech)*show_ratio)].index
    plt.scatter(dates_to_display, rates[dates_to_display], color='red', marker='^')
    
    list_y = []
    for date in dates_to_display:
        list_y.append(min(rate_se)-0.2)
    plt.scatter(dates_to_display, list_y, color='red', marker='^')

rate_se.plot();
display_impactful_speech(show_ratio=0.2)

## 3. Overall Reval Profile 

Reval/preval analysis is a widely used tools in market impact analysis. It is defined as the price movement before and after a certain incident happens in the market. In this case study, we construct revals of 5-year treasury yield movement before and after Fed speeches. 

In [6]:
rev_lst = get_reval(rst_lst, rate_se)

  tt = np.log(np.hstack((prev, rev)) / this)
  tt = np.log(np.hstack((prev, rev)) / this)
  tt = np.log(np.hstack((prev, rev)) / this)
  tt = np.log(np.hstack((prev, rev)) / this)


In [7]:
rev_ar = np.vstack(rev_lst)

plt.plot(rev_ar.T);

As expected, the revals for most of the speeches are quite flat, while on the other hand there are several speeches that have long lasting market impact.

## 4. A Quick Look at the Important Speeches

most_impact_article_lst = [rst_lst[i][:2] for i in np.argsort(-rev_ar[:, 12])[:10]]

most_impact_article_lst

Not surprisingly, the speech before the rate cut in August, is very important.

## 5. Feature Construction 

Here we would like to first use word bag as representation of each of the article. We also notice the titles may also be important so we should think about a method to generate features from them as well. However, intuitively, it might not be the best idea of treating the words in title and those in the passage alike.   
Here we construct features in title and passage seperately and merge the embedded vector together.

In [99]:
X_df = pd.DataFrame([str2word_bag(itm[2], STOP_CHARS, STOP_WORDS, to_lower=True) for itm in rst_lst]).fillna(0)
X = X_df.values

In [100]:
X_title = pd.DataFrame([str2word_bag(itm[1], STOP_CHARS, STOP_WORDS, to_lower=True) for itm in rst_lst]).fillna(0).values

## 6. Trading Strategy Assumptions

1. We have tradable securities for this predictive signal. 
2. We use the rate for backtesting. Here we assume the duration of such securities meerly change in the testing period (several days)

## 7. Systematic Parameter Search and Back Testing

We understand different combination of the hyperparameter, like how to label the sample, might have diverged result to the trading strategy. Here we use a systematic grid search algo to find some reasonable parameter sets by out of sample returns.

Parameters we would like to test on including: learning_algorithm, training_set_length, target_lag, quantile_for_label

Details of this algorithm can be found in *back_test_utils.py*

Results are listed here

with open("param_results.pkl", "rb") as fp:
    param_rst_lst = pickle.load(fp)

with open("params_str.pkl", "rb") as fp:
    params_lst = pickle.load(fp)

len(param_rst_lst)

for idx, prd_ar in param_rst_lst:
    plt.plot(np.cumsum((rev_ar[:, 11 + 1] - rev_ar[:, 10]) * prd_ar))

## Best Parameters Sets

rtn_lst = []

for idx, prd_ar in param_rst_lst:
    rtn_lst.append((idx, np.cumsum((rev_ar[:, 11 + 1] - rev_ar[:, 10]) * prd_ar)[-1]))

rtn_lst.sort(key=lambda x: x[1], reverse=True)

rtn_lst[:5]

[params for idx, params in enumerate(params_lst) if idx in [i[0] for i in rtn_lst[:5]]]

Counter intuitively, the titles are more noise than helpful features and the Random Forest models are uniformly better.

y_prd = param_rst_lst[20][1]

rtn_shift = 1

plt.plot(np.cumsum((rev_ar[:, 9 + 1 + rtn_shift] - rev_ar[:, 9 + rtn_shift]) * y_prd));

rtn_shift = 2

plt.plot(np.cumsum((rev_ar[:, 9 + 1 + rtn_shift] - rev_ar[:, 9 + rtn_shift]) * y_prd));

rtn_shift = 3

plt.plot(np.cumsum((rev_ar[:, 9 + 1 + rtn_shift] - rev_ar[:, 9 + rtn_shift]) * y_prd));

rtn_shift = 4

plt.plot(np.cumsum((rev_ar[:, 9 + 1 + rtn_shift] - rev_ar[:, 9 + rtn_shift]) * y_prd));

## 8. Alpha Decay

mean_decay = get_alpha_decay(rev_ar, y_prd)

plt.plot(range(-9, 11), mean_decay, label="mean")
plt.legend();

# Stage two, classification

In [101]:
from sklearn.cluster import *

In [102]:
from sklearn.decomposition import PCA

In [103]:
X.shape

(774, 33188)

In [104]:
X_trans = PCA(500).fit_transform(X)

In [105]:
clst_model = KMeans(n_clusters=2, n_jobs=-1)

In [106]:
clst_model.fit(X_trans)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [107]:
y_clst = clst_model.predict(X_trans)

In [108]:
X_trans.shape

(774, 500)

In [109]:
y_clst

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,

#### 1. 检查聚类是否有意义
#### 2. 如果有意义，在每个类别中分别建立NB模型，看单词权重，是否相同？
#### 3. 编写算法，筛选特征（信息熵 e.g.)

In [110]:
ztype_title_lst = [rst_lst[i][:2] for i in np.where(y_clst == 0)[0]]

In [111]:
ztype_title_lst[-20:]

[('Frameworks for the Countercyclical Capital Buffer ', '20190329'),
 ('Agriculture and Community Banking', '20190328'),
 ('Global Shocks and the U.S. Economy', '20190328'),
 ('Welcoming Remarks', '20190325'),
 ('Brief Remarks', '20190311'),
 ('Monetary Policy: Normalization and the Road Ahead', '20190308'),
 ('Navigating Cautiously', '20190307'),
 ('Recent Economic Developments and Longer-Term Challenges', '20190228'),
 ('U.S. Economic Outlook and Monetary Policy', '20190228'),
 ('Is Economics for Me? Increasing the Participation of Black Women in Economics',
  '20190223'),
 ("The Future of the Federal Reserve's Balance Sheet", '20190222'),
 ("The Federal Reserve's Review of Its Monetary Policy Strategy, Tools, and Communication Practices",
  '20190222'),
 ('Encouraging Economic Development in High-Poverty Rural Communities',
  '20190212'),
 ('A Conversation on Community Banking', '20190211'),
 ('Ideas of Order: Charting a Course for the Financial Stability Board',
  '20190210'),
 ('W

In [112]:
one_title_lst = [rst_lst[i][:2] for i in np.where(y_clst == 1)[0]]

In [113]:
one_title_lst[-20:]

[('Where Do Banks Fit in the Fintech Stack?', '20170428'),
 ('Departing Thoughts', '20170404'),
 ("America's Central Bank: The History and Structure of the Federal Reserve",
  '20170328'),
 ('Assessing Financial Stability over the Cycle', '20181207'),
 ("The Federal Reserve's Framework for Monitoring Financial Stability",
  '20181128'),
 ('A New Chapter in Stress Testing', '20181109'),
 ('FinTech and the Search for Full Stack Financial Inclusion', '20181017'),
 ('Trends in Urban and Rural Community Banks', '20181004'),
 ('Getting It Right: Factors for Tailoring Supervision and Regulation of Large Financial Institutions',
  '20180718'),
 ("America's Vital Interest in Global Efforts to Promote Financial Stability",
  '20180627'),
 ("Liquidity Regulation and the Size of the Fed's Balance Sheet", '20180504'),
 ('Safeguarding Financial Resilience through the Cycle', '20180419'),
 ("An Update on the Federal Reserve's Financial Stability Agenda", '20180403'),
 ('The Roles of Consumer Protecti

In [114]:
y = (rev_ar[:, 11] - rev_ar[:, 9])

In [115]:
cls_model_0 = GaussianNB()

In [116]:
cls_model_1 = GaussianNB()

In [117]:
X_0 = X[np.where(y_clst == 0)[0], :]
y_0 = np.sign(y[np.where(y_clst == 0)[0]])

In [118]:
X_1 = X[np.where(y_clst == 1)[0], :]
y_1 = np.sign(y[np.where(y_clst == 1)[0]])

In [119]:
fit_rst0 = cls_model_0.fit(X_0, y_0)

In [120]:
fit_rst1 = cls_model_1.fit(X_1, y_1)

In [121]:
dir(fit_rst0)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_cache',
 '_abc_negative_cache',
 '_abc_negative_cache_version',
 '_abc_registry',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_joint_log_likelihood',
 '_partial_fit',
 '_update_mean_variance',
 'class_count_',
 'class_prior_',
 'classes_',
 'epsilon_',
 'fit',
 'get_params',
 'partial_fit',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'priors',
 'score',
 'set_params',
 'sigma_',
 'theta_',
 'var_smoothing']

In [122]:
fit_rst0.class_count_

array([255.,  44., 254.])

In [123]:
fit_rst0.class_prior_

array([0.46112116, 0.079566  , 0.45931284])

In [124]:
fit_rst0.sigma_.shape

(3, 33188)

In [126]:
X_df.columns

Index(['-', 'ability', 'abroad', 'absence', 'abundant', 'accelerated',
       'access', 'accession', 'accomplishments', 'account',
       ...
       'previewing', 'products--taking', 'regulators--state', 'respectful',
       'retelling', 'standards-development', 'state-based', 'usa--together',
       'usas', 'veer'],
      dtype='object', length=33188)

In [129]:
X_df.columns[np.argsort(-fit_rst0.theta_[0])[:100]]

Index(['inflation', 'policy', 'financial', 'federal', 'economic', 'market',
       'rate', 'monetary', 'percent', 'economy', 'growth', 'reserve', 'rates',
       'prices', 'credit', 'banks', 'markets', 'years', 'recent', 'time',
       'interest', 'important', 'year', 'bank', 'central', 'business', 'labor',
       'past', 'risk', 'funds', 'price', 'housing', 'capital', 'expectations',
       'low', 'crisis', 'demand', 'data', 'fomc', 'mortgage', 'conditions',
       'unemployment', 'real', 'community', 'small', 'system', 'employment',
       'level', 'current', 'recovery', 'stability', 'consumer', 'public',
       'balance', 'information', 'spending', 'states', 'global', 'future',
       'continue', 'consumers', 'risks', 'today', 'increase', 'firms',
       'investment', 'united', 'businesses', 'effects', 'lower', 'part',
       'make', 'development', 'large', 'work', 'high', 'provide', 'loans',
       'potential', 'securities', 'outlook', 'economies', 'asset', 'costs',
       'househo

In [130]:
X_df.columns[np.argsort(-fit_rst0.theta_[1])[:100]]

Index(['financial', 'policy', 'federal', 'inflation', 'rate', 'market',
       'economic', 'percent', 'reserve', 'economy', 'growth', 'monetary',
       'banks', 'credit', 'rates', 'interest', 'years', 'work', 'time', 'bank',
       'community', 'recent', 'important', 'unemployment', 'labor', 'low',
       'employment', 'risk', 'central', 'year', 'prices', 'stability',
       'markets', 'recovery', 'business', 'housing', 'institutions',
       'conditions', 'today', 'workers', 'crisis', 'past', 'high', 'consumer',
       'level', 'price', 'communities', 'mortgage', 'united', 'lower',
       'capital', 'future', 'states', 'investment', 'development', 'firms',
       'information', 'make', 'global', 'large', 'securities', 'productivity',
       'expectations', 'households', 'risks', 'data', 'increase', 'funds',
       'real', 'committee', 'job', 'cra', 'current', 'outlook', 'banking',
       'system', 'demand', 'including', 'support', 'significant', 'recession',
       'potential', 'peop

In [131]:
X_df.columns[np.argsort(-fit_rst0.theta_[2])[:100]]

Index(['inflation', 'policy', 'financial', 'federal', 'rate', 'economic',
       'market', 'monetary', 'growth', 'economy', 'percent', 'reserve',
       'rates', 'prices', 'banks', 'years', 'time', 'interest', 'important',
       'markets', 'recent', 'labor', 'bank', 'central', 'credit', 'capital',
       'price', 'year', 'expectations', 'system', 'crisis', 'low', 'risk',
       'unemployment', 'community', 'employment', 'states', 'past', 'united',
       'work', 'demand', 'real', 'current', 'funds', 'economies', 'data',
       'level', 'global', 'increase', 'potential', 'business', 'conditions',
       'fomc', 'stability', 'firms', 'lower', 'effects', 'today', 'housing',
       'public', 'mortgage', 'part', 'productivity', 'higher', 'future',
       'make', 'information', 'large', 'policies', 'high', 'continue',
       'institutions', 'risks', 'costs', 'countries', 'banking', 'investment',
       'output', 'significant', 'households', 'activity', 'including',
       'consumer', 'incre