## Portfolio Factor Analysis

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import Isomap
from sklearn.linear_model import LinearRegression

In [32]:
# Function block

### REPRESENTATION LEARNING METHODS
def linPCA(train_data, tune_data, n_components):
    """
    Project a dataset onto the components of a linear PCA model.

    The model is fitted on ``train_data`` only; ``tune_data`` is then mapped
    into the fitted component space.

    Input:
        train_data (ndarray): dataset the linear PCA model is fitted on
        tune_data (ndarray): dataset projected with the fitted model
        n_components (int): number of principal components to keep
    Output:
        Z (ndarray): tune_data expressed in the PCA subspace
    """
    fitted_model = PCA(n_components=n_components).fit(train_data)
    return fitted_model.transform(tune_data)

def KPCA(train_data, tune_data, n_components, kernel='poly'):
    """
    Function to apply kernel PCA with specified number of components and kernel.

    The model is fitted on ``train_data`` only; ``tune_data`` is then mapped
    into the fitted kernel-PCA space.

    Input:
        train_data (ndarray): dataset the kernel PCA model is fitted on
        tune_data (ndarray): dataset projected with the fitted model
        n_components (int): number of components to keep
        kernel (str): kernel method, e.g. one of {"poly" | "rbf" | "sigmoid" | "cosine"};
            defaults to "poly"
    Output:
        Z (ndarray): tune_data expressed in the kernel PCA subspace
    """
    return (
        KernelPCA(n_components=n_components, kernel=kernel)
        .fit(train_data)
        .transform(tune_data)
    )

def isomap(train_data, tune_data, n_components, n_neighbors):
    """
    Function to apply an Isomap embedding with specified components and neighbors.

    The embedding is fitted on ``train_data`` only; ``tune_data`` is then
    mapped into the fitted embedding.

    Input:
        train_data (ndarray): dataset the Isomap model is fitted on
        tune_data (ndarray): dataset transformed by the fitted model
        n_components (int): number of embedding dimensions
        n_neighbors (int): number of neighbors passed to Isomap
    Output:
        Z (ndarray): tune_data transformed to the embedding subspace
    """
    iso_model = Isomap(n_neighbors=n_neighbors, n_components=n_components)
    iso_model.fit(train_data)
    return iso_model.transform(tune_data)
    
    
### REGRESSION TEST
def LinRegression(X,y):
    """
    Fit an ordinary linear regression of y on X and report its R^2.

    The score is computed on the same data the model was fitted on, so the
    returned value is an in-sample R^2.

    Input:
        X (ndarray): predictor data
        y (ndarray): outcome data
    Output:
        R^2 value for the fitted model (real)
    """
    model = LinearRegression()
    model.fit(X, y)
    return model.score(X, y)

### Import Data

Read the Excel workbook and load only the net-returns sheet (sheet index 1).

In [33]:
# Open the workbook in a context manager so the file handle is released as
# soon as the sheet has been read (the bare ExcelFile was never closed).
with pd.ExcelFile('Simple_Strategies_Returns.xlsx') as xls:
    # Sheet index 1 (the second sheet) is loaded as the net-returns data.
    netreturns = pd.read_excel(xls, sheet_name=1)

In [34]:
# Keep every month, but drop any factor column that contains missing values
netreturns = netreturns.dropna(axis=1).copy()
# Convert the YYYYMM month codes to 'YYYY-MM' strings via a monthly period
netreturns['Month'] = (
    pd.to_datetime(netreturns['Month'], format='%Y%m')
    .dt.to_period('M')
    .astype('str')
)

In [35]:
# Display (rows, columns) of the cleaned net-returns frame as the cell output
netreturns.shape

(606, 25)

Split data into 'new' and 'old' sections (currently disabled: the two cells below are commented out, so the full sample is used):  
'Old' section has 120 months of 26 factors  
'New' section has 486 months of 34 factors

In [36]:
# oldnetreturns = netreturns[netreturns.Month < 197307]
# oldnetreturns = oldnetreturns.reset_index(drop=True)
# oldnetreturns = oldnetreturns.dropna(axis=1)
# oldnetreturns['Month'] = pd.to_datetime(oldnetreturns['Month'], format = '%Y%m')
# oldnetreturns['Month'] = oldnetreturns['Month'].dt.to_period('M')
# oldnetreturns['Month'] = oldnetreturns['Month'].astype('str')

In [37]:
# newnetreturns = netreturns[netreturns.Month >= 197307]
# newnetreturns = newnetreturns.reset_index(drop=True)
# newnetreturns['Month'] = pd.to_datetime(newnetreturns['Month'], format = '%Y%m')
# newnetreturns['Month'] = newnetreturns['Month'].dt.to_period('M')
# newnetreturns['Month'] = newnetreturns['Month'].astype('str')

Fama-French Data

In [38]:
# Load the raw Fama-French factor file with read_csv's default settings.
# NOTE(review): presumably the file holds a monthly section followed by a yearly one — confirm against the file.
ff_5 = pd.read_csv('FF_5_Factors')

In [39]:
# .loc slicing is label-based and inclusive, so this keeps rows up to and including label 665.
# NOTE(review): 665 is a hard-coded boundary, presumably the last monthly row — confirm against the file.
ff_5_monthly = ff_5.loc[:665,].copy()

In [40]:
# ff_5_yearly = ff_5.loc[668:,]
# ff_5_yearly.reset_index(inplace=True, drop=True)
# ff_5_yearly.rename(columns={"Month":"Year"}, inplace=True)

In [41]:
# Convert the YYYYMM month codes to 'YYYY-MM' strings so they can be used as a merge key
ff_5_monthly['Month'] = (
    pd.to_datetime(ff_5_monthly['Month'], format='%Y%m')
    .dt.to_period('M')
    .astype('str')
)

TODO:  
- Import S&P 500 returns [__DONE__]  
- Import Fama-French data [__DONE__]
- Build dimensionality reduction pipeline (to try different methods) [__DONE__]
- Build supervised learning pipeline [__DONE__]
- Train/dev/test split
- Proper timeseries regression

In [42]:
# Load the CRSP S&P 500 index file
spx = pd.read_csv('CRSP SPX Index File.csv')
# Convert the YYYYMMDD 'caldt' dates to 'YYYY-MM' month strings
spx['caldt'] = (
    pd.to_datetime(spx['caldt'], format='%Y%m%d')
    .dt.to_period('M')
    .astype('str')
)
# Zero-padded 'YYYY-MM' strings sort chronologically, so a plain string
# comparison works as the date cut-off (keep months through 2013-12)
is_in_sample = spx['caldt'] <= '2013-12'
spx = spx[is_in_sample]

In [43]:
# Columns are picked by position, so both frames depend on the CSV column order:
# column 0 plus column 1 (total return) or column 2 (price return).
# reset_index(drop=True) is chained instead of applied in place, so each
# result is a fresh frame rather than a mutated slice of spx.
spx_tr = spx.iloc[:, 0:2].reset_index(drop=True)    # SPX Total Return
spx_pr = spx.iloc[:, [0, 2]].reset_index(drop=True)  # SPX Price Return

In [71]:
# Collect net return data, S&P returns, and the FF 5 factors in one dataframe.
# Both merges use the default inner join, so only months present in all
# three sources are kept.
together_df = (
    netreturns
    .merge(spx, left_on='Month', right_on='caldt')
    .merge(ff_5_monthly, left_on='Month', right_on='Month')
)

### Representation Learning and Evaluation

In [82]:
# Net returns dataset, restricted to the months that survived the inner
# merges behind together_df so its rows correspond to the rows of y below.
# (When the merges drop nothing, this is identical to using all of netreturns.)
X_train = netreturns[netreturns["Month"].isin(together_df["Month"])].drop("Month", axis=1)

# Fama-French 3 and 5 factor dataset
X_ff5 = together_df[['SMB','HML','RMW','CMA',"Mkt-RF"]]
X_ff3 = together_df[['SMB','HML','Mkt-RF']]

# S&P 500 Returns
y = together_df[['Value Weighted Return (ex dividends)']]

# Fail fast if predictors and outcome do not cover the same number of months
assert len(X_train) == len(y), "X_train and y must have one row per merged month"

Fama-French 3 Factors

In [83]:
# In-sample R^2 from regressing y on the three Fama-French factors in X_ff3
LinRegression(X_ff3, y)

0.9904807362190499

Fama-French 5 Factors

In [84]:
# In-sample R^2 from regressing y on the five Fama-French factors in X_ff5
LinRegression(X_ff5, y)

0.9912334497797756

Linear PCA

In [89]:
# Number of principal components to keep
k = 5
# The PCA is fitted on and applied to the same data, so the R^2 below is in-sample
linpcaZ = linPCA(train_data=X_train, tune_data=X_train, n_components=k)
LinRegression(X=linpcaZ, y=y)

0.4205120182932881

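Kernel PCA

*Note: the Kernel PCA and Isomap outputs below carry execution counts (60, 61, 70) earlier than the cells that define `together_df` (71) and `X_train`/`y` (82). Re-run the notebook top to bottom before comparing these R² values with the ones above.*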

In [60]:
# Polynomial kernel
# Number of kernel-PCA components to keep
k = 5
# Fitted on and applied to the same data, so the R^2 below is in-sample
poly_kpcaZ = KPCA(train_data=X_train, tune_data=X_train, n_components=k, kernel='poly')
LinRegression(X=poly_kpcaZ, y=y)

0.054728376862785866

In [61]:
# RBF Kernel
# Number of kernel-PCA components to keep
k = 5
# Fitted on and applied to the same data, so the R^2 below is in-sample
rbf_kpcaZ = KPCA(train_data=X_train, tune_data=X_train, n_components=k, kernel='rbf')
LinRegression(X=rbf_kpcaZ, y=y)

0.0034608272157408138

Isomap

In [70]:
# Embedding dimensions and neighborhood size for Isomap
k_comp = 5
k_nn = 9
# Fitted on and applied to the same data, so the R^2 below is in-sample
isomapZ = isomap(
    train_data=X_train,
    tune_data=X_train,
    n_components=k_comp,
    n_neighbors=k_nn,
)
LinRegression(X=isomapZ, y=y)

0.21745146537819238