In [1]:
import numpy as np
import pandas as pd
from scipy.stats import linregress
from scipy.signal import savgol_filter


Read in imputed series:

In [2]:
# Load the imputed yield series from disk.
yield_data_imputed = pd.read_csv('Imputed Data.csv')

# Parse the timestamp column once and derive every calendar field from that
# single parse (vectorised `.dt.strftime` instead of a per-row `apply`).
time_period = pd.to_datetime(yield_data_imputed['Time Period'])
yield_data_imputed['Time Period'] = time_period.dt.strftime('%Y-%m-%d')
yield_data_imputed['Year'] = time_period.dt.strftime('%Y')
yield_data_imputed['Month'] = time_period.dt.strftime('%m')

# Index the frame by the 'YYYY-MM-DD' date string (the index stays a string
# index, not a DatetimeIndex) and clear the index name.
yield_data_imputed = yield_data_imputed.set_index('Time Period')
yield_data_imputed.index.name = None
yield_data_imputed.head()

Unnamed: 0,RIFLGFCY01_N.B,RIFLGFCY03_N.B,RIFLGFCY05_N.B,RIFLGFCY10_N.B,IS_WEEKEND,Day of week,Year,Month
1962-01-02,3.22,3.7,3.88,4.06,0,1,1962,1
1962-01-03,3.24,3.7,3.87,4.03,0,2,1962,1
1962-01-04,3.24,3.69,3.86,3.99,0,3,1962,1
1962-01-05,3.26,3.71,3.89,4.02,0,4,1962,1
1962-01-06,3.26,3.71,3.89,4.02,1,5,1962,1


Use a [Savitzky–Golay filter](https://en.wikipedia.org/wiki/Savitzky%E2%80%93Golay_filter) to smooth the series (removing the peaks on Mondays and Fridays). I use a window length of 15 and a polynomial order of 3.

In [3]:
# Savitzky–Golay settings: window length (number of rows) and the order of
# the polynomial fitted inside each window.
SAVGOL_WINDOW = 15
SAVGOL_POLYORDER = 3

# Smooth the first four columns of the imputed frame. Filtering along axis=0
# processes each column independently, so no transposes are needed.
series_to_smooth = yield_data_imputed.iloc[:, :4]
data_filtered = pd.DataFrame(
    savgol_filter(np.asarray(series_to_smooth), SAVGOL_WINDOW, SAVGOL_POLYORDER, axis=0),
    columns=series_to_smooth.columns,
    index=series_to_smooth.index,
)
data_filtered.head()

Unnamed: 0,RIFLGFCY01_N.B,RIFLGFCY03_N.B,RIFLGFCY05_N.B,RIFLGFCY10_N.B
1962-01-02,3.218271,3.69666,3.871033,4.046082
1962-01-03,3.232123,3.697662,3.872517,4.029583
1962-01-04,3.245957,3.700262,3.876488,4.01987
1962-01-05,3.259459,3.704306,3.882534,4.016058
1962-01-06,3.272312,3.709638,3.890241,4.017264


**Define a rolling-window function that calculates the following statistics for one column:**  
1. Mean  
2. Standard Deviation  
3. Median  
4. Min  
5. Max  
6. Root Mean Square  
7. [Crest Factor](https://en.wikipedia.org/wiki/Crest_factor)  
8. [Zero-crossing Rate](https://en.wikipedia.org/wiki/Zero-crossing_rate) (here, the mean-crossing rate)  
9. Trend (slope of a linear regression)

In [5]:
def rolling_method(df, col, window, extended=False):
    """Compute rolling-window summary statistics for one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the series to summarise.
    col : str
        Name of the column in ``df`` to roll over.
    window : int
        Number of consecutive rows in each rolling window.
    extended : bool, default False
        When False (the default) only the five basic statistics are returned:
        mean, std, median, min and max. When True, four more columns are
        appended: ``RMS`` (root mean square), ``crest`` (half the
        peak-to-peak range divided by the RMS), ``crossing`` (number of times
        the window crosses its own mean, divided by ``window``) and ``slope``
        (least-squares slope of the window against 0..window-1).

    Returns
    -------
    pandas.DataFrame
        One column per statistic, named ``<col>_<statistic>_<window>day`` and
        indexed like ``df``. Rows without a full window are NaN.
    """
    basic_stats = ['mean', 'std', 'median', 'min', 'max']
    roller = df[col].rolling(window=window)
    stats = pd.concat([getattr(roller, name)() for name in basic_stats], axis=1)
    stats.columns = basic_stats

    if extended:
        # Centred time axis: the least-squares slope reduces to
        # sum(t_c * y) / sum(t_c ** 2) once t has zero mean.
        steps = np.arange(window, dtype=float)
        steps -= steps.mean()
        denom = np.dot(steps, steps)
        if denom == 0:
            # A one-row window has no defined slope.
            denom = np.nan

        def _mean_crossings(values):
            # Count sign changes of the mean-centred window (not merely
            # non-zero differences), normalised by the window length.
            below = np.signbit(values - values.mean())
            return np.count_nonzero(below[1:] != below[:-1]) / window

        stats['RMS'] = roller.apply(
            lambda values: np.sqrt(np.mean(values ** 2)), raw=True)
        stats['crest'] = 0.5 * (stats['max'] - stats['min']) / stats['RMS']
        stats['crossing'] = roller.apply(_mean_crossings, raw=True)
        stats['slope'] = roller.apply(
            lambda values: np.dot(steps, values) / denom, raw=True)

    # Rename once, from whatever statistics were actually computed.
    stats.columns = [col + '_' + name + '_' + str(window) + 'day'
                     for name in stats.columns]
    return stats

Apply the function to the yield data with rolling windows of 3, 7, 15 and 30 days, then shift the result by 1 row so that each feature row lines up with the time stamp of the target variable.

In [6]:
# Inspect the index of the smoothed frame. It holds 'YYYY-MM-DD' strings rather
# than datetimes, so the disabled line below would need a DatetimeIndex to work.
#data_filtered['Date']=data_filtered.index.date
data_filtered.index

Index(['1962-01-02', '1962-01-03', '1962-01-04', '1962-01-05', '1962-01-06',
       '1962-01-07', '1962-01-08', '1962-01-09', '1962-01-10', '1962-01-11',
       ...
       '2017-09-26', '2017-09-27', '2017-09-28', '2017-09-29', '2017-09-30',
       '2017-10-01', '2017-10-02', '2017-10-03', '2017-10-04', '2017-10-05'],
      dtype='object', length=20366)

In [9]:
# Rolling-window lengths, in rows, applied to each of the first four columns.
rolling_windows = [3, 7, 15, 30]

# One block of statistics per (series, window) pair, series-major order.
feature_frames = [
    rolling_method(data_filtered, series_name, n_rows)
    for series_name in yield_data_imputed.columns[:4]
    for n_rows in rolling_windows
]

# Join the blocks side by side, then move everything down one row so that each
# row only carries statistics computed from earlier rows.
yield_data_features = pd.concat(feature_frames, axis=1).shift(1)
yield_data_features.head(10)

Unnamed: 0,RIFLGFCY01_N.B_mean_3day,RIFLGFCY01_N.B_std_3day,RIFLGFCY01_N.B_median_3day,RIFLGFCY01_N.B_min_3day,RIFLGFCY01_N.B_max_3day,RIFLGFCY01_N.B_mean_7day,RIFLGFCY01_N.B_std_7day,RIFLGFCY01_N.B_median_7day,RIFLGFCY01_N.B_min_7day,RIFLGFCY01_N.B_max_7day,...,RIFLGFCY10_N.B_mean_15day,RIFLGFCY10_N.B_std_15day,RIFLGFCY10_N.B_median_15day,RIFLGFCY10_N.B_min_15day,RIFLGFCY10_N.B_max_15day,RIFLGFCY10_N.B_mean_30day,RIFLGFCY10_N.B_std_30day,RIFLGFCY10_N.B_median_30day,RIFLGFCY10_N.B_min_30day,RIFLGFCY10_N.B_max_30day
1962-01-02,,,,,,,,,,,...,,,,,,,,,,
1962-01-03,,,,,,,,,,,...,,,,,,,,,,
1962-01-04,,,,,,,,,,,...,,,,,,,,,,
1962-01-05,3.232117,0.013843,3.232123,3.218271,3.245957,,,,,,...,,,,,,,,,,
1962-01-06,3.245846,0.013668,3.245957,3.232123,3.259459,,,,,,...,,,,,,,,,,
1962-01-07,3.259243,0.013179,3.259459,3.245957,3.272312,,,,,,...,,,,,,,,,,
1962-01-08,3.271991,0.012375,3.272312,3.259459,3.284202,,,,,,...,,,,,,,,,,
1962-01-09,3.283775,0.011256,3.284202,3.272312,3.294812,3.258162,0.027813,3.259459,3.218271,3.294812,...,,,,,,,,,,
1962-01-10,3.294281,0.009824,3.294812,3.284202,3.303828,3.270385,0.026108,3.272312,3.232123,3.303828,...,,,,,,,,,,
1962-01-11,3.302817,0.007550,3.303828,3.294812,3.309810,3.281483,0.023516,3.284202,3.245957,3.309810,...,,,,,,,,,,


In [10]:
# List the generated feature names (pattern: <series>_<statistic>_<window>day).
yield_data_features.columns

Index(['RIFLGFCY01_N.B_mean_3day', 'RIFLGFCY01_N.B_std_3day',
       'RIFLGFCY01_N.B_median_3day', 'RIFLGFCY01_N.B_min_3day',
       'RIFLGFCY01_N.B_max_3day', 'RIFLGFCY01_N.B_mean_7day',
       'RIFLGFCY01_N.B_std_7day', 'RIFLGFCY01_N.B_median_7day',
       'RIFLGFCY01_N.B_min_7day', 'RIFLGFCY01_N.B_max_7day',
       'RIFLGFCY01_N.B_mean_15day', 'RIFLGFCY01_N.B_std_15day',
       'RIFLGFCY01_N.B_median_15day', 'RIFLGFCY01_N.B_min_15day',
       'RIFLGFCY01_N.B_max_15day', 'RIFLGFCY01_N.B_mean_30day',
       'RIFLGFCY01_N.B_std_30day', 'RIFLGFCY01_N.B_median_30day',
       'RIFLGFCY01_N.B_min_30day', 'RIFLGFCY01_N.B_max_30day',
       'RIFLGFCY03_N.B_mean_3day', 'RIFLGFCY03_N.B_std_3day',
       'RIFLGFCY03_N.B_median_3day', 'RIFLGFCY03_N.B_min_3day',
       'RIFLGFCY03_N.B_max_3day', 'RIFLGFCY03_N.B_mean_7day',
       'RIFLGFCY03_N.B_std_7day', 'RIFLGFCY03_N.B_median_7day',
       'RIFLGFCY03_N.B_min_7day', 'RIFLGFCY03_N.B_max_7day',
       'RIFLGFCY03_N.B_mean_15day', 'RIFLGFCY03_N.B

In [17]:
# Keep only the rows where every feature is present (no NaN in any column),
# and also expose the row labels as a regular 'Time Period' column.
yield_data_features = (
    yield_data_features
    .dropna(axis=0, how='any')
    .assign(**{'Time Period': lambda complete_rows: complete_rows.index})
)


Unnamed: 0,RIFLGFCY01_N.B_mean_3day,RIFLGFCY01_N.B_std_3day,RIFLGFCY01_N.B_median_3day,RIFLGFCY01_N.B_min_3day,RIFLGFCY01_N.B_max_3day,RIFLGFCY01_N.B_mean_7day,RIFLGFCY01_N.B_std_7day,RIFLGFCY01_N.B_median_7day,RIFLGFCY01_N.B_min_7day,RIFLGFCY01_N.B_max_7day,...,RIFLGFCY10_N.B_std_15day,RIFLGFCY10_N.B_median_15day,RIFLGFCY10_N.B_min_15day,RIFLGFCY10_N.B_max_15day,RIFLGFCY10_N.B_mean_30day,RIFLGFCY10_N.B_std_30day,RIFLGFCY10_N.B_median_30day,RIFLGFCY10_N.B_min_30day,RIFLGFCY10_N.B_max_30day,Time Period
1962-02-01,3.289511,0.004171,3.290054,3.285095,3.293385,3.278233,0.011647,3.277457,3.263448,3.293385,...,0.003285,4.109421,4.099819,4.111493,4.081659,0.035600,4.104299,4.016058,4.111493,1962-02-01
1962-02-02,3.292024,0.001747,3.292633,3.290054,3.293385,3.282402,0.010654,3.285095,3.266715,3.293385,...,0.005018,4.108679,4.093158,4.111493,4.083228,0.035010,4.104299,4.016058,4.111493,1962-02-02
1962-02-03,3.293644,0.001162,3.293385,3.292633,3.294914,3.286431,0.008925,3.290054,3.271475,3.294914,...,0.006812,4.108643,4.088434,4.111493,4.085190,0.033517,4.104299,4.016058,4.111493,1962-02-03
1962-02-04,3.294727,0.002007,3.294914,3.292633,3.296633,3.290025,0.006682,3.292633,3.277457,3.296633,...,0.008766,4.106208,4.082715,4.111321,4.087285,0.031176,4.104299,4.016058,4.111493,1962-02-04
1962-02-05,3.296211,0.001146,3.296633,3.294914,3.297086,3.292829,0.004179,3.293385,3.285095,3.297086,...,0.010902,4.105792,4.076543,4.110389,4.089301,0.028227,4.104299,4.017264,4.111493,1962-02-05
1962-02-06,3.296664,0.000408,3.296633,3.296271,3.297086,3.294425,0.002550,3.294914,3.290054,3.297086,...,0.013189,4.104643,4.070462,4.110389,4.091074,0.025037,4.104299,4.022604,4.111493,1962-02-06
1962-02-07,3.295849,0.001493,3.296271,3.294190,3.297086,3.295016,0.001708,3.294914,3.292633,3.297086,...,0.015204,4.103955,4.067810,4.110389,4.092581,0.021943,4.104299,4.031196,4.111493,1962-02-07
1962-02-08,3.293062,0.003898,3.294190,3.288724,3.296271,3.294350,0.002925,3.294914,3.288724,3.297086,...,0.017041,4.099819,4.064543,4.110389,4.093693,0.019426,4.104299,4.042154,4.111493,1962-02-08
1962-02-09,3.288881,0.005233,3.288724,3.283729,3.294190,3.293078,0.004998,3.294914,3.283729,3.297086,...,0.018508,4.093158,4.061412,4.110389,4.094335,0.017925,4.104299,4.055747,4.111493,1962-02-09
1962-02-10,3.284914,0.003377,3.283729,3.282290,3.288724,3.291275,0.006326,3.294190,3.282290,3.297086,...,0.019644,4.088434,4.057077,4.110389,4.094379,0.017827,4.104299,4.057077,4.111493,1962-02-10


In [19]:
# Parse the 'Time Period' values once and derive both calendar fields from that
# single parse; vectorised `.dt.strftime` replaces the per-row `apply`.
feature_dates = pd.to_datetime(yield_data_features['Time Period'])
yield_data_features['Year'] = feature_dates.dt.strftime('%Y')    # four-digit year as a string
yield_data_features['Month'] = feature_dates.dt.strftime('%m')   # zero-padded month as a string
yield_data_features.columns


Index(['RIFLGFCY01_N.B_mean_3day', 'RIFLGFCY01_N.B_std_3day',
       'RIFLGFCY01_N.B_median_3day', 'RIFLGFCY01_N.B_min_3day',
       'RIFLGFCY01_N.B_max_3day', 'RIFLGFCY01_N.B_mean_7day',
       'RIFLGFCY01_N.B_std_7day', 'RIFLGFCY01_N.B_median_7day',
       'RIFLGFCY01_N.B_min_7day', 'RIFLGFCY01_N.B_max_7day',
       'RIFLGFCY01_N.B_mean_15day', 'RIFLGFCY01_N.B_std_15day',
       'RIFLGFCY01_N.B_median_15day', 'RIFLGFCY01_N.B_min_15day',
       'RIFLGFCY01_N.B_max_15day', 'RIFLGFCY01_N.B_mean_30day',
       'RIFLGFCY01_N.B_std_30day', 'RIFLGFCY01_N.B_median_30day',
       'RIFLGFCY01_N.B_min_30day', 'RIFLGFCY01_N.B_max_30day',
       'RIFLGFCY03_N.B_mean_3day', 'RIFLGFCY03_N.B_std_3day',
       'RIFLGFCY03_N.B_median_3day', 'RIFLGFCY03_N.B_min_3day',
       'RIFLGFCY03_N.B_max_3day', 'RIFLGFCY03_N.B_mean_7day',
       'RIFLGFCY03_N.B_std_7day', 'RIFLGFCY03_N.B_median_7day',
       'RIFLGFCY03_N.B_min_7day', 'RIFLGFCY03_N.B_max_7day',
       'RIFLGFCY03_N.B_mean_15day', 'RIFLGFCY03_N.B

In [None]:
### Random-forest regression on the rolling-window features.
# X and y were not defined anywhere above, so this cell could not run; they are
# now built from the frames created earlier in the notebook.
# NOTE(review): the target is ASSUMED to be the first four columns of
# `yield_data_imputed` on the same date label as each (already shifted)
# feature row -- confirm this is the intended y before trusting the results.

import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Features: numeric columns only, which leaves out the string helper columns
# ('Time Period', 'Year', 'Month') added to the feature frame.
feature_frame = yield_data_features.select_dtypes(include='number')

# Targets: pick the rows with the same labels as the feature rows. Plain NumPy
# arrays are used so that the positional [:, k] indexing below works.
target_columns = list(yield_data_imputed.columns[:4])
y = yield_data_imputed.loc[feature_frame.index, target_columns].values
X = feature_frame.values

# NOTE(review): this is a random (shuffled) split with only 400 training rows;
# confirm that is appropriate for time-ordered data.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=400,
                                                    random_state=4)

max_depth = 30

regr_rf = RandomForestRegressor(max_depth=max_depth, random_state=2)
regr_rf.fit(X_train, y_train)

# Predict on new data
y_rf = regr_rf.predict(X_test)

# Plot the results: first target column against the second one, observed
# values versus random-forest predictions.
plt.figure()
s = 50
a = 0.4
plt.scatter(y_test[:, 0], y_test[:, 1], edgecolor='k',
            c="navy", s=s, marker="s", alpha=a, label="Data")
plt.scatter(y_rf[:, 0], y_rf[:, 1], edgecolor='k',
            c="c", s=s, marker="^", alpha=a,
            label="RF score=%.2f" % regr_rf.score(X_test, y_test))
# NOTE(review): fixed axis limits; check they cover the range of the targets.
plt.xlim([-6, 6])
plt.ylim([-6, 6])
plt.xlabel("target 1")
plt.ylabel("target 2")
plt.legend()
plt.show()