## Portfolio Factor Analysis

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import Isomap
from sklearn.linear_model import LinearRegression

In [2]:
# Function block

### REPRESENTATION LEARNING METHODS
def linPCA(train_data, tune_data, n_components):
    """
    Project data onto the leading principal components of a training set.

    A linear PCA model is fitted on train_data only; tune_data is then mapped
    into the fitted component space.
    Input:
        train_data (array-like): samples used to fit the linear PCA model
        tune_data (array-like): samples to transform with the fitted model
        n_components (int): number of principal components to keep
    Output:
        Z (ndarray): tune_data expressed in the PCA subspace
    """
    # fit() hands back the fitted estimator, so fitting and projecting chain.
    projector = PCA(n_components=n_components).fit(train_data)
    return projector.transform(tune_data)

def KPCA(train_data, tune_data, n_components, kernel='poly'):
    """
    Project data with a kernel PCA model fitted on a training set.

    The kernel PCA model is fitted on train_data only; tune_data is then
    mapped into the fitted component space.
    Input:
        train_data (array-like): samples used to fit the kernel PCA model
        tune_data (array-like): samples to transform with the fitted model
        n_components (int): number of components to keep
        kernel (str): kernel passed through to KernelPCA;
            choose from {"poly" | "rbf" | "sigmoid" | "cosine"}
    Output:
        Z (ndarray): tune_data expressed in the kernel PCA subspace
    """
    # All other KernelPCA settings are left at the library defaults.
    projector = KernelPCA(n_components=n_components, kernel=kernel).fit(train_data)
    return projector.transform(tune_data)

def isomap(train_data, tune_data, n_components, n_neighbors):
    """
    Embed data with an Isomap model fitted on a training set.

    The Isomap model is fitted on train_data only; tune_data is then mapped
    into the fitted embedding.
    Input:
        train_data (array-like): samples used to fit the Isomap model
        tune_data (array-like): samples to transform with the fitted model
        n_components (int): number of embedding dimensions
        n_neighbors (int): number of neighbors used to construct the initial mapping
    Output:
        Z (ndarray): tune_data expressed in the Isomap embedding
    """
    # fit() hands back the fitted estimator, so fitting and embedding chain.
    mapper = Isomap(n_components=n_components, n_neighbors=n_neighbors).fit(train_data)
    return mapper.transform(tune_data)
    
    
### REGRESSION TEST
def LinRegression(X,y):
    """
    Fit a linear regression of y on X and report the model's R^2.

    The score is evaluated on the same (X, y) that the model was fitted on,
    so it is an in-sample measure of fit.
    Input:
        X (array-like): predictor data
        y (array-like): outcome data
    Output:
        R^2 of the fitted model on (X, y) (real)
    """
    model = LinearRegression()
    model.fit(X, y)
    return model.score(X, y)

### Import Data

Read in Excel file and specifically import net returns data

In [3]:
# Open the workbook, then read the sheet at position 1 (positions are 0-based),
# which holds the net returns data.
xls = pd.ExcelFile('Simple_Strategies_Returns.xlsx')
netreturns = pd.read_excel(xls, sheet_name=1)

In [4]:
# Keep every month, but drop any factor (column) that has missing data.
netreturns = netreturns.dropna(axis=1).copy()

# Normalise Month from YYYYMM to a 'YYYY-MM' string so it can be used as a
# merge key against the other tables.
# NOTE: not idempotent -- re-running this cell would try to parse the
# already-converted strings with the '%Y%m' format.
netreturns['Month'] = (
    pd.to_datetime(netreturns['Month'], format='%Y%m')
    .dt.to_period('M')
    .astype('str')
)

In [5]:
# Sanity check: (rows, columns) left after dropping factors with missing data.
netreturns.shape

(606, 25)

Split data into 'new' and 'old' sections (currently disabled: the two cells below are commented out, so the analysis uses the full sample):  
'Old' section has 120 months of 26 factors  
'New' section has 486 months of 34 factors

In [6]:
# NOTE: disabled. By this point Month has already been converted to a 'YYYY-MM'
# string, so the integer comparison and the '%Y%m' parse below would have to be
# updated before this cell could be re-enabled.
# oldnetreturns = netreturns[netreturns.Month < 197307]
# oldnetreturns = oldnetreturns.reset_index(drop=True)
# oldnetreturns = oldnetreturns.dropna(axis=1)
# oldnetreturns['Month'] = pd.to_datetime(oldnetreturns['Month'], format = '%Y%m')
# oldnetreturns['Month'] = oldnetreturns['Month'].dt.to_period('M')
# oldnetreturns['Month'] = oldnetreturns['Month'].astype('str')

In [7]:
# NOTE: disabled. By this point Month has already been converted to a 'YYYY-MM'
# string, so the integer comparison and the '%Y%m' parse below would have to be
# updated before this cell could be re-enabled.
# newnetreturns = netreturns[netreturns.Month >= 197307]
# newnetreturns = newnetreturns.reset_index(drop=True)
# newnetreturns['Month'] = pd.to_datetime(newnetreturns['Month'], format = '%Y%m')
# newnetreturns['Month'] = newnetreturns['Month'].dt.to_period('M')
# newnetreturns['Month'] = newnetreturns['Month'].astype('str')

Fama-French Data

In [31]:
# Fama-French portfolio return files: the 5x5 sort and the 2x3 sort.
portfolios25 = pd.read_csv('25_Portfolios_5x5.CSV')
portfolios6 = pd.read_csv('6_Portfolios_2x3.CSV')

In [26]:
# NOTE(review): mutates portfolios25 in place (the previous index becomes a column)
# and is not idempotent. Its execution count (26) predates the load cell (31), so the
# saved session's portfolios25 may never have had this applied -- confirm it is wanted.
portfolios25.reset_index(inplace=True)

In [32]:
# Normalise Month from YYYYMM to a 'YYYY-MM' string (the merge key format).
portfolios6['Month'] = (
    pd.to_datetime(portfolios6['Month'], format='%Y%m')
    .dt.to_period('M')
    .astype('str')
)

In [33]:
# Preview the 2x3 portfolios after the Month conversion.
portfolios6.head()

Unnamed: 0,Month,SMALL LoBM,ME1 BM2,SMALL HiBM,BIG LoBM,ME2 BM2,BIG HiBM
0,1926-07,1.3724,0.9081,-0.0695,5.7168,1.9642,1.4222
1,1926-08,0.6095,1.5075,5.3842,2.7154,2.6838,6.3154
2,1926-09,-2.6779,-0.1359,-0.4374,1.4287,0.0709,-0.7967
3,1926-10,-3.5365,-4.3572,-2.0112,-3.5898,-2.346,-4.097
4,1926-11,3.121,3.6608,2.0944,3.1292,2.8965,3.4614


In [37]:
# Fama-French 5-factor file.
# NOTE(review): the path has no file extension -- confirm it matches the file on disk.
ff_5 = pd.read_csv('FF_5_Factors')

In [38]:
# Keep rows up to and including index label 665 (.loc slicing is label-based
# and inclusive), all columns.
# NOTE(review): 665 is a hard-coded cut-off; presumably the monthly block of the
# file ends there (the commented-out yearly slice starts at 668) -- confirm.
ff_5_monthly = ff_5.loc[:665, :].copy()

In [39]:
# NOTE: disabled. Would take the rows of ff_5 from index label 668 onward as a
# separate frame and rename its Month column to Year.
# ff_5_yearly = ff_5.loc[668:,]
# ff_5_yearly.reset_index(inplace=True, drop=True)
# ff_5_yearly.rename(columns={"Month":"Year"}, inplace=True)

In [40]:
# Normalise Month from YYYYMM to a 'YYYY-MM' string (the merge key format).
ff_5_monthly['Month'] = (
    pd.to_datetime(ff_5_monthly['Month'], format='%Y%m')
    .dt.to_period('M')
    .astype('str')
)

Company Stock Data

In [57]:
# Company-level stock data; filtered by company name (COMNAM) in the next cell.
stocks = pd.read_csv('Company Stock Data.csv')

In [64]:
# Exxon Mobil rows only, copied so the date rewrite below does not touch `stocks`.
exxon = stocks.loc[stocks['COMNAM'] == 'EXXON MOBIL CORP'].copy()

# Reduce the m/d/Y date to a 'YYYY-MM' string, the same key format used for
# Month in the other tables.
exxon['date'] = (
    pd.to_datetime(exxon['date'], format='%m/%d/%Y')
    .dt.to_period('M')
    .astype('str')
)

In [65]:
# Preview the Exxon Mobil rows after the date conversion.
exxon.head()

Unnamed: 0,PERMNO,date,TICKER,COMNAM,PRC,vwretd
0,11850,1963-07,XOM,EXXON MOBIL CORP,70.125,-0.001718
1,11850,1963-08,XOM,EXXON MOBIL CORP,71.5,0.052684
2,11850,1963-09,XOM,EXXON MOBIL CORP,68.5,-0.011898
3,11850,1963-10,XOM,EXXON MOBIL CORP,71.0,0.027708
4,11850,1963-11,XOM,EXXON MOBIL CORP,71.0,-0.005549


TODO:  
- Import S&P 500 returns [__DONE__]  
- Import Fama-French data [__DONE__]
- Build dimensionality reduction pipeline (to try different methods) [__DONE__]
- Build supervised learning pipeline [__DONE__]
- Train/dev/test split
- Proper timeseries regression

In [41]:
# S&P 500 index file; caldt arrives as YYYYMMDD and is reduced to 'YYYY-MM'.
spx = pd.read_csv('CRSP SPX Index File.csv')
spx['caldt'] = (
    pd.to_datetime(spx['caldt'], format='%Y%m%d')
    .dt.to_period('M')
    .astype('str')
)

# Keep observations through 2013-12; zero-padded 'YYYY-MM' strings compare
# in chronological order, so a plain string comparison is enough.
spx = spx[spx['caldt'] <= '2013-12']

In [42]:
# Split the index file into two (date, return) frames by column position and
# renumber the rows from 0. Building each frame in one expression avoids
# calling an in-place method on a slice of `spx`.
spx_tr = spx.iloc[:, 0:2].reset_index(drop=True)     # SPX Total Return
spx_pr = spx.iloc[:, [0, 2]].reset_index(drop=True)  # SPX Price Return

In [43]:
# Collect the net return data, the 2x3 Fama-French portfolios and the monthly
# FF 5 factors in one dataframe, joined on Month. Both merges use the default
# inner join, so only months present in all three tables are kept.
together_df = (
    netreturns
    .merge(portfolios6, on='Month')
    .merge(ff_5_monthly, on='Month')
)

In [44]:
# Preview the merged frame (strategy returns, 2x3 portfolios, FF factors).
together_df.head()

Unnamed: 0,Month,Size,Gross Profitability,Value,ValProf,Accruals,Net Issuance (rebal.:A),Asset Growth,Investment,Piotroski's F-score,...,SMALL HiBM,BIG LoBM,ME2 BM2,BIG HiBM,Mkt-RF,SMB,HML,RMW,CMA,RF
0,1963-07,-0.63341,-1.635407,-2.726409,-1.690123,1.959792,2.992479,-1.38489,-3.240626,1.198153,...,-1.1591,-0.0296,0.4589,-1.5991,-0.39,-0.47,-0.83,0.66,-1.15,0.27
1,1963-08,-2.526682,2.418348,1.576539,0.455624,-1.563703,-1.844691,-1.91471,4.503954,-1.433401,...,5.7608,5.395,4.6761,7.4206,5.07,-0.79,1.67,0.39,-0.4,0.25
2,1963-09,0.819609,0.779151,-1.26868,2.072555,-2.173126,-0.756282,-2.203093,6.016384,-2.547664,...,-1.9386,-1.0517,-1.7246,-1.8235,-1.57,-0.48,0.18,-0.76,0.24,0.27
3,1963-10,-3.000215,1.445213,-3.422705,2.559109,10.87054,-0.660985,0.746974,3.599053,1.65989,...,2.6987,3.8542,1.2362,2.1259,2.53,-1.29,-0.1,2.75,-2.24,0.29
4,1963-11,-1.558811,-1.158534,0.779921,0.87766,-3.211279,-0.204558,3.671321,-1.514076,1.229205,...,-0.3295,-0.2497,-1.4575,0.6373,-0.85,-0.84,1.71,-0.45,2.22,0.27


### Rep. Learning and Evaluation

In [46]:
# Re-display the 2x3 portfolios; the regression target below is its
# 'SMALL LoBM' column, taken from the merged frame.
portfolios6.head()

Unnamed: 0,Month,SMALL LoBM,ME1 BM2,SMALL HiBM,BIG LoBM,ME2 BM2,BIG HiBM
0,1926-07,1.3724,0.9081,-0.0695,5.7168,1.9642,1.4222
1,1926-08,0.6095,1.5075,5.3842,2.7154,2.6838,6.3154
2,1926-09,-2.6779,-0.1359,-0.4374,1.4287,0.0709,-0.7967
3,1926-10,-3.5365,-4.3572,-2.0112,-3.5898,-2.346,-4.097
4,1926-11,3.121,3.6608,2.0944,3.1292,2.8965,3.4614


In [47]:
# Net returns dataset.
# y and the Fama-French factors come from together_df, which is built with
# inner merges and therefore only contains months present in every table.
# Restrict the net returns to those same months so that row i of X_train and
# row i of y always refer to the same month (the merge keeps the left frame's
# row order). When no month is dropped by the merge this is the full table.
X_train = (
    netreturns[netreturns['Month'].isin(together_df['Month'])]
    .drop(columns='Month')
)

# Fama-French 3 and 5 factor dataset
X_ff5 = together_df[['SMB','HML','RMW','CMA',"Mkt-RF"]]
X_ff3 = together_df[['SMB','HML','Mkt-RF']]

# Research portfolio returns: the 'SMALL LoBM' portfolio from the 2x3 sort
y = together_df[['SMALL LoBM']]

# The embeddings below are regressed against y row by row, so a length
# mismatch (e.g. duplicated months) must be caught here rather than downstream.
assert len(X_train) == len(y), "X_train and y must cover the same months"

Fama-French 3 Factors

In [49]:
# In-sample R^2 of 'SMALL LoBM' regressed on SMB, HML and Mkt-RF.
LinRegression(X_ff3, y)

0.982761176605342

Fama-French 5 Factors

In [50]:
# In-sample R^2 of 'SMALL LoBM' regressed on SMB, HML, RMW, CMA and Mkt-RF.
LinRegression(X_ff5, y)

0.9850459268587757

Linear PCA

In [51]:
# Reduce the strategy returns to k linear principal components (fitted and
# transformed on the same data), then score an in-sample regression of y on them.
k = 5
linpcaZ = linPCA(train_data=X_train, tune_data=X_train, n_components=k)
LinRegression(X=linpcaZ, y=y)

0.4670865005237783

Kernel PCA

In [52]:
# Polynomial kernel: k kernel-PCA components fitted and transformed on the same
# data, then an in-sample regression of y on them.
k = 5
poly_kpcaZ = KPCA(train_data=X_train, tune_data=X_train, n_components=k, kernel='poly')
LinRegression(X=poly_kpcaZ, y=y)

0.11158477200102346

In [53]:
# RBF kernel: k kernel-PCA components fitted and transformed on the same data,
# then an in-sample regression of y on them.
k = 5
rbf_kpcaZ = KPCA(train_data=X_train, tune_data=X_train, n_components=k, kernel='rbf')
LinRegression(X=rbf_kpcaZ, y=y)

0.004748239475597328

Isomap

In [54]:
# Isomap embedding with k_comp dimensions built from k_nn nearest neighbors
# (fitted and transformed on the same data), then an in-sample regression of y.
k_comp = 5
k_nn = 9
isomapZ = isomap(
    train_data=X_train,
    tune_data=X_train,
    n_components=k_comp,
    n_neighbors=k_nn,
)
LinRegression(X=isomapZ, y=y)

0.4743347076684491