## Portfolio Factor Analysis

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import Isomap
from sklearn.linear_model import LinearRegression

### Import Data

Read in Excel file and specifically import net returns data

In [62]:
xls=pd.ExcelFile('Simple_Strategies_Returns.xlsx')
netreturns = pd.read_excel(xls, 1)

In [63]:
#Keep all data, but get rid of factors with missing data
netreturns = netreturns.dropna(axis=1).copy()
netreturns['Month'] = pd.to_datetime(netreturns['Month'], format = '%Y%m')
netreturns['Month'] = netreturns['Month'].dt.to_period('M')
netreturns['Month'] = netreturns['Month'].astype('str')

In [64]:
netreturns.shape

(606, 25)

Split data into 'new' and 'old' sections:  
'Old' section has 120 months of 26 factors  
'New' section has 486 months of 34 factors

In [109]:
# oldnetreturns = netreturns[netreturns.Month < 197307]
# oldnetreturns = oldnetreturns.reset_index(drop=True)
# oldnetreturns = oldnetreturns.dropna(axis=1)
# oldnetreturns['Month'] = pd.to_datetime(oldnetreturns['Month'], format = '%Y%m')
# oldnetreturns['Month'] = oldnetreturns['Month'].dt.to_period('M')
# oldnetreturns['Month'] = oldnetreturns['Month'].astype('str')

In [110]:
# newnetreturns = netreturns[netreturns.Month >= 197307]
# newnetreturns = newnetreturns.reset_index(drop=True)
# newnetreturns['Month'] = pd.to_datetime(newnetreturns['Month'], format = '%Y%m')
# newnetreturns['Month'] = newnetreturns['Month'].dt.to_period('M')
# newnetreturns['Month'] = newnetreturns['Month'].astype('str')

Fama-French Data

In [7]:
ff_3 = pd.read_csv('FF_3_Factors')

In [56]:
ff_3_monthly = ff_3.iloc[:1110,].copy()

In [58]:
# ff_3_yearly = ff_3.loc[1112:,]
# ff_3_yearly.drop(1204, inplace=True)
# ff_3_yearly.reset_index(inplace=True, drop=True)
# ff_3_yearly.rename(columns={"Month":"Year"}, inplace=True)

In [59]:
ff_3_monthly['Month'] = pd.to_datetime(ff_3_monthly['Month'], format = '%Y%m')
ff_3_monthly['Month'] = ff_3_monthly['Month'].dt.to_period('M')
ff_3_monthly['Month'] = ff_3_monthly['Month'].astype('str')

In [60]:
ff_3_monthly.head()

Unnamed: 0,Month,Mkt-RF,SMB,HML,RF
0,1926-07,2.96,-2.3,-2.87,0.22
1,1926-08,2.64,-1.4,4.19,0.25
2,1926-09,0.36,-1.32,0.01,0.23
3,1926-10,-3.24,0.04,0.51,0.32
4,1926-11,2.53,-0.2,-0.35,0.31


In [13]:
ff_5 = pd.read_csv('FF_5_Factors')

In [53]:
ff_5_monthly = ff_5.loc[:665,].copy()

In [61]:
# ff_5_yearly = ff_5.loc[668:,]
# ff_5_yearly.reset_index(inplace=True, drop=True)
# ff_5_yearly.rename(columns={"Month":"Year"}, inplace=True)

In [54]:
ff_5_monthly['Month'] = pd.to_datetime(ff_5_monthly['Month'], format = '%Y%m')
ff_5_monthly['Month'] = ff_5_monthly['Month'].dt.to_period('M')
ff_5_monthly['Month'] = ff_5_monthly['Month'].astype('str')

TODO:  
- Import S&P 500 returns [__DONE__]  
- Import Fama-French data [__DONE__]
- Build dimensionality reduction pipeline (to try different methods)
- Build supervised learning pipeline (i.e., timeseries regression of reduced data on S&P returns)
- Train/dev/test split

In [68]:
spx = pd.read_csv('CRSP SPX Index File.csv')
spx['caldt'] = pd.to_datetime(spx['caldt'], format = '%Y%m%d')
spx['caldt'] = spx.caldt.dt.to_period('M')
spx['caldt'] = spx['caldt'].astype('str')
spx = spx[(spx['caldt'] <= '2013-12')]

In [70]:
spx_tr = spx.iloc[:,0:2] #SPX Total Return
spx_pr = spx.iloc[:,[0,2]] #SPX Price Return
spx_tr.reset_index(inplace = True, drop = True) 
spx_pr.reset_index(inplace = True, drop = True)

In [72]:
# collects net return data, FF 5 factors, and S&P returns in one dataframe
together_df = netreturns.merge(spx, left_on='Month',right_on='caldt').merge(ff_5_monthly, left_on='Month',right_on='Month')
together_df.shape

(606, 35)

### Rep. Learning Options

DROP MONTHS

Linear PCA

In [26]:
k = 5
pca = PCA(n_components=k)
linpca_newnet = pca.fit_transform(netreturns.drop("Month", axis=1))

In [27]:
linpca_newnet.shape

(606, 5)

Kernel PCA

In [139]:
k = 5
poly_kpca = KernelPCA(n_components=k, kernel='poly')
poly_pca_newnet = poly_kpca.fit_transform(newnetreturns.drop("Month", axis=1))

In [140]:
poly_pca_newnet.shape

(486, 5)

In [141]:
k = 5
rbf_kpca = KernelPCA(n_components=k, kernel='rbf')
rbf_pca_newnet = rbf_kpca.fit_transform(newnetreturns.drop("Month", axis=1))

In [142]:
rbf_pca_newnet.shape

(486, 5)

Isomap

In [162]:
k = 5
embedding = Isomap(n_components=k, n_neighbors=5)
isomap_newnet = embedding.fit_transform(newnetreturns.drop("Month", axis=1))

In [163]:
isomap_newnet.shape

(486, 5)

### Regression Pipeline

In [123]:
# FF factors with SPX Price Returns
ff_spx_pr = ff_5_monthly.merge(spx_pr, left_on='Month', right_on='caldt')

In [124]:
ff_spx_pr.head()

Unnamed: 0,Month,Mkt-RF,SMB,HML,RMW,CMA,RF,caldt,Value Weighted Return (ex dividends)
0,1973-07,5.05,7.25,-5.31,-0.1,-3.45,0.64,1973-07,0.037679
1,1973-08,-3.82,-1.75,1.14,-1.36,1.2,0.7,1973-08,-0.036505
2,1973-09,4.75,3.54,2.18,-2.3,1.82,0.68,1973-09,0.041038
3,1973-10,-0.83,-0.26,1.74,-1.94,2.59,0.65,1973-10,-0.00156
4,1973-11,-12.75,-7.25,4.04,-2.71,1.49,0.56,1973-11,-0.113046


#### FF 5 Factor Model

In [152]:
# what are the actual FF factors? There are six here - maybe Mkt-RF is wrong to include?
X = ff_spx_pr[['SMB','HML','RMW','CMA','RF']]
y = ff_spx_pr[['Value Weighted Return (ex dividends)']]

In [153]:
reg = LinearRegression().fit(X,y)
reg.score(X,y)
#Score gives R^2

0.1778487802172506

#### FF 3 Factor Model

In [154]:
X = ff_spx_pr[['SMB','HML','RF']]
y = ff_spx_pr[['Value Weighted Return (ex dividends)']]

In [155]:
reg = LinearRegression().fit(X,y)
reg.score(X,y)

0.07054084862918186

#### LinPCA Model

In [156]:
X = pd.DataFrame(linpca_newnet)
y = ff_spx_pr[['Value Weighted Return (ex dividends)']]

In [157]:
reg = LinearRegression().fit(X,y)
reg.score(X,y)

0.25255652076426816

#### PolyKPCA Model

In [158]:
X = pd.DataFrame(poly_pca_newnet)
y = ff_spx_pr[['Value Weighted Return (ex dividends)']]

In [159]:
reg = LinearRegression().fit(X,y)
reg.score(X,y)

0.07769071593258792

#### RBFKPCA Model

In [160]:
X = pd.DataFrame(rbf_pca_newnet)
y = ff_spx_pr[['Value Weighted Return (ex dividends)']]

In [161]:
reg = LinearRegression().fit(X,y)
reg.score(X,y)

0.004753269907649904

#### Isomap Model

In [164]:
X = pd.DataFrame(isomap_newnet)
y = ff_spx_pr[['Value Weighted Return (ex dividends)']]

In [165]:
reg = LinearRegression().fit(X,y)
reg.score(X,y)

0.19593151170180245