In [14]:
import numpy as np
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [15]:
# Load the table stored at data/sedols.csv. Another cell in this notebook
# (FACTOR-RETURN CONSISTENCY) writes that path from `common_sedols`.
theSedols = pd.read_csv("data/sedols.csv")

In [21]:
# Model predictions keyed by the first three CSV columns, sorted by index.
# Judging by the lookups further down, the levels appear to be
# (date, model name, SEDOL) — TODO confirm against the file header.
predictions = (
    pd.read_csv("output/predictions.csv", index_col=[0, 1, 2])
    .sort_index()
)

In [26]:
# Return table indexed by DATE (first CSV column, parsed as datetimes), with
# one column per security.
#
# Missing values are forward-filled down each column and any gaps that remain
# (i.e. before a column's first observation) are set to 0.
# `.ffill()` is used instead of `fillna(method='ffill')`, which is deprecated
# since pandas 2.1; the result is the same.
#
# NOTE(review): forward-filling carries the last observed return into later
# missing months — confirm that this is the intended treatment of gaps.
stock_returns = (
    pd.read_csv(
        "./data/cleaned_return_data.csv",
        # nrows=3,
        parse_dates=['DATE'],
        index_col=0,
    )
    .ffill()
    .fillna(0)
)

In [31]:
# Spot check: the value stored in column "BZBY21" on the 2006-01-01 row.
stock_returns.loc["2006-01-01", "BZBY21"]

0.0

In [33]:
# Show the rows of `predictions` stored under the ("2006-01-01", "AdaBoost")
# index prefix; the saved output is a RETURN column indexed by SEDOL.
predictions.loc["2006-01-01", "AdaBoost"]

Unnamed: 0_level_0,RETURN
SEDOL,Unnamed: 1_level_1
200001,0.139706
200169,0.793067
200230,0.845063
200247,0.758403
200418,0.139706
...,...
BZ12WP,0.502101
BZ2Y3W,0.502101
BZ8FLW,0.502101
BZB2K9,0.502101


In [34]:
# Multiply the 2006-01-01 row of `stock_returns` by the AdaBoost RETURN
# predictions for the same date (pandas aligns the two Series by label), then
# replace every NaN in the product with 0.
(
    stock_returns.loc["2006-01-01"]
    .multiply(predictions.loc["2006-01-01", "AdaBoost"]["RETURN"])
    .fillna(0)
)

200001   -0.148507
200169   -0.382258
200230    0.152956
200247   -0.019718
200291    0.000000
            ...   
BZB2K9    1.483206
BZBY20   -0.095279
BZBY21    0.000000
BZBYZY    0.000000
BZBZ02    0.000000
Length: 1994, dtype: float64

In [23]:
# NOTE(review): `.prod` is referenced here but never called, so this cell
# evaluates to the bound method rather than to a product. The table saved
# below looks like the plain lookup without `.prod`, so the source and the
# output appear to be out of sync — confirm whether `.prod()` was intended
# or whether the suffix should be dropped.
predictions.loc["2006-01-01", "AdaBoost"].prod

Unnamed: 0_level_0,RETURN
SEDOL,Unnamed: 1_level_1
200001,0.139706
200169,0.793067
200230,0.845063
200247,0.758403
200418,0.139706
...,...
BZ12WP,0.502101
BZ2Y3W,0.502101
BZ8FLW,0.502101
BZB2K9,0.502101


# FACTORS

In [4]:
# Factor table read from the Russell 1000 factor CSV.
#
# Read options:
#   * header=2            -> column names are taken from the third line.
#   * on_bad_lines='skip' -> malformed rows are dropped without an error.
#   * converters          -> every SEDOL is cut down to its first six characters.
#   * index_col=[3, 2]    -> the columns at positions 3 and 2 form the row
#                            MultiIndex (a later cell reads a level named
#                            "SEDOL" from it).
#
# After loading, values are forward-filled within each (Symbol, DATE) group
# and the frame is sorted by its index. `.ffill()` replaces the original
# `.fillna(method='ffill')`, which is deprecated on groupby objects since
# pandas 2.1; the result is the same.
#
# NOTE(review): grouping by both Symbol and DATE means a forward fill only
# acts across rows sharing the same symbol *and* date. If the goal is to
# carry values forward through time per symbol, the grouping key would need
# to be Symbol alone — confirm the intent before changing it.
factors = (
    pd.read_csv(
        './data/rus1000_stocks_factors.csv',
        on_bad_lines='skip',
        header=2,
        # nrows = 10000,
        low_memory=False,
        converters={'SEDOL': (lambda x: x[:6])},
        parse_dates=['DATE'],
        index_col=[3, 2],
    )
    .groupby(['Symbol', 'DATE'])
    .ffill()
    .sort_index()
)

In [4]:
# Preview the first rows of the factor table. The cell that assigns `factors`
# must have been run first; the saved NameError below comes from a run where
# it had not.
factors.head()

NameError: name 'factors' is not defined

In [6]:
# Distinct values of the "SEDOL" index level of the factor table.
factor_sedols = (
    factors.index
    .get_level_values("SEDOL")
    .unique()
)

# STOCK RETURNS

In [24]:
# Return table read from cleaned_return_data_sc.csv, indexed by DATE (first
# CSV column, parsed as datetimes), with one column per security.
#
# Missing values are forward-filled down each column and any gaps that remain
# (i.e. before a column's first observation) are set to 0.
# `.ffill()` is used instead of `fillna(method='ffill')`, which is deprecated
# since pandas 2.1; the result is the same.
#
# NOTE(review): forward-filling carries the last observed return into later
# missing months — confirm that this is the intended treatment of gaps.
stock_returns = (
    pd.read_csv(
        "./data/cleaned_return_data_sc.csv",
        # nrows=3,
        parse_dates=['DATE'],
        index_col=0,
    )
    .ffill()
    .fillna(0)
)

In [8]:
# Column labels of the return table; the consistency check below compares
# them against the SEDOLs found in the factor table.
return_sedols = stock_returns.columns

In [18]:
# Preview the first five rows of the return table.
stock_returns.head()

Unnamed: 0_level_0,000124,000163,000191,000312,000371,000415,000424,000432,000445,000495,...,BZCRNM,BZCTKP,BZHJN8,BZHJN9,BZHJNF,BZHJNG,BZHJNJ,BZHJNL,BZHJNM,BZHJVR
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1995-01-01,-0.119,0.153,,0.484,,0.535,,0.173,0.062,0.565,...,,,,,,,,,,
1995-02-01,3.014,-0.719,,-0.196,,0.562,,-0.143,0.096,0.554,...,,,,,,,,,,
1995-03-01,0.372,0.695,,0.602,,0.476,,0.084,0.942,0.855,...,,,,,,,,,,
1995-04-01,-0.215,0.403,,0.899,,-0.112,,0.461,-0.078,0.86,...,,,,,,,,,,
1995-05-01,-0.127,1.186,,-0.962,,0.127,,-0.33,-0.101,-0.45,...,,,,,,,,,,


In [3]:
# Inspect the row index of the return table. A cell that assigns
# `stock_returns` must have been run first; the saved NameError below comes
# from a run where it had not.
stock_returns.index

NameError: name 'stock_returns' is not defined

# FACTOR-RETURN CONSISTENCY

In [9]:
# SEDOLs that appear both as return-table columns and in the factor table,
# kept in the order of the return-table columns.
common_sedols = [sedol for sedol in return_sedols if sedol in factor_sedols]

In [None]:
# Write the shared SEDOL list to disk as a one-column CSV with the header
# "SEDOLS" and no index column.
sedols = pd.Series(data=common_sedols, name="SEDOLS")
sedols.to_csv(path_or_buf="data/sedols.csv", index=False)

In [13]:
# Export the factor rows for the first 300 entries of `common_sedols`
# (selected through the first index level) to a separate subset file.
# NOTE(review): 300 is an unexplained cut-off — confirm it is intentional.
factors.loc[(common_sedols[:300],), :].to_csv("data/rus1000_stocks_factors_subset.csv")

In [14]:
# Save only the return columns whose SEDOL is listed in `common_sedols`.
stock_returns.loc[:, common_sedols].to_csv("data/cleaned_return_data.csv")

# BENCHMARK RETURNS

In [25]:
# Benchmark return series, one column per benchmark. The first CSV column
# ("Date") is parsed as datetimes and used as the row index, which is then
# renamed to "DATE".
benchmark_returns = (
    pd.read_csv(
        './data/Benchmark Returns.csv',
        index_col=[0],
        parse_dates=['Date'],
        on_bad_lines='skip',
        low_memory=False,
        # nrows = 100,
    )
    .rename_axis("DATE")
)

In [26]:
# Display the benchmark return table.
benchmark_returns

Unnamed: 0_level_0,MSCI EM Bench Return,Russell 1000 Bench Return,MSCI ACWIXUS Bench Return
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2002-12-31,-0.033226,-0.056599,-0.032306
2003-01-31,-0.004354,-0.024234,-0.035107
2003-02-28,-0.026990,-0.015472,-0.020262
2003-03-31,-0.028355,0.010351,-0.019398
2003-04-30,0.089072,0.080728,0.096360
...,...,...,...
2019-04-30,0.021241,0.040384,0.027170
2019-05-31,-0.072246,-0.063724,-0.052596
2019-06-28,0.063223,0.070204,0.060711
2019-07-31,-0.011400,0.015530,-0.011777


# MISC

In [10]:
# Re-read the return file written earlier in this notebook and rebind
# `stock_returns` to it. Unlike the other loaders, no fill is applied here.
#
# Each DATE string is parsed with pd.to_datetime and then moved forward by one
# MonthBegin offset, i.e. to the start of the following month; the saved
# output below starts at 1995-02-01.
# NOTE(review): presumably this labels every return with the following
# month — confirm that the one-month shift is intended.
stock_returns = pd.read_csv(
        "data/cleaned_return_data.csv", 
        # parse_dates=["DATE"],
        index_col=[0], 
        converters={"DATE": lambda x: pd.to_datetime(x) + pd.offsets.MonthBegin(1)}
    )

In [12]:
# Display the re-loaded return table.
stock_returns

Unnamed: 0_level_0,200001,200169,200230,200247,200291,200369,200418,200582,200597,200784,...,BZ6VT8,BZ8FLW,BZ8VC5,BZ8VJQ,BZ9NZY,BZB2K9,BZBY20,BZBY21,BZBYZY,BZBZ02
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1995-02-01,0.000,0.332,0.482,-0.304,0.432,0.000,0.000,0.000,0.037,0.848,...,0.000,0.023,0.000,0.000,0.000,0.0,0.000,0.000,0.000,0.000
1995-03-01,0.000,0.346,0.251,0.436,0.078,0.000,0.000,0.000,0.295,-0.040,...,0.000,-0.833,0.000,0.000,0.000,0.0,0.000,0.000,0.000,0.000
1995-04-01,0.000,0.307,-0.149,-0.564,-0.358,0.000,0.000,0.000,0.586,0.794,...,0.000,0.753,0.000,0.000,0.000,0.0,0.000,0.000,0.000,0.000
1995-05-01,0.000,0.033,0.186,-0.351,0.011,0.000,0.000,0.000,0.556,0.044,...,0.000,1.151,0.000,0.000,0.000,0.0,0.000,0.000,0.000,0.000
1995-06-01,0.000,-0.093,-0.036,-0.048,-0.014,0.000,0.000,0.000,-0.298,-0.889,...,0.000,-0.088,0.000,0.000,0.000,0.0,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-09-01,-0.158,5.981,0.078,-0.425,0.456,-2.051,-1.376,-0.232,-0.315,-0.213,...,0.636,1.111,-0.585,1.432,-0.066,-4.4,-0.435,-2.483,-1.591,-0.464
2019-10-01,-0.398,5.981,-0.368,0.384,-0.172,-0.891,0.516,-0.232,0.447,-0.888,...,-0.143,-1.590,-0.585,0.477,-0.066,-4.4,0.488,0.172,1.063,0.921
2019-11-01,-0.023,5.981,-0.198,0.222,1.212,1.220,0.101,-0.232,-0.923,1.250,...,0.945,0.246,-0.585,-0.129,-0.066,-4.4,-0.473,-4.143,-0.433,-0.646
2019-12-01,-0.004,5.981,-0.002,0.082,-0.257,0.032,-0.102,-0.232,-0.009,0.146,...,-0.012,0.041,-0.585,-0.041,-0.066,-4.4,0.106,-0.121,0.085,-0.002
