In [1]:
import statsmodels.api as sm
import numpy as np
import pandas as pd
from cvxpy import * 

In [2]:
def train_test_split(data, start_year):
    '''
    Split the data up into training and test periods
    
    Args: 
        data: a n stocks by t periods data-frame with a date-time index
        period: an integer representing which period that should be used
        
    Returns:
        tuple of training and testing data as pandas dataframes
        
    Usage: 
        training1, testing1 = train_test_split(stocks, 1)
    '''
    
    training = data[str(start_year):str(start_year + 4)]
    testing = data[str(start_year + 5)]
    return(training, testing)

In [3]:
def read_stocks():
    '''
    Get stock data for performing optimization. 
    Assumes data is in a t-by-n format with t months of observations
    along the rows and n stocks to choose from in columns.
    Assumes no missing values.
    
    Returns:
         Monthly returns data in a t-by-n formated dataframe with 
         a time-formatted index
    '''
    
    df = pd.read_csv('../../data/monthly_return.csv', names = ['s_' + str(x+1) for x in range(556)])
    df['date'] = pd.date_range('1/1/1986', periods=360, freq='M')
    df.set_index('date', inplace=True)
    
    return(df)

In [4]:
def read_factors():
    '''
    Get fama and french factor data for performing optimization. 
    Assumes data is in a t-by-k format with t months of observations
    along the rows and k factors to choose from in columns.
    Assumes no missing values.
    
    Returns:
         Monthly returns data in a t-by-n formated dataframe with 
         a time-formatted index
    '''
    
    df = pd.read_csv('../../data/F-F_Research_Data_Factors.csv', skiprows=3, parse_dates=True, nrows=1088)
    keep_dates = (df.loc[:,'Unnamed: 0'] >= 198601) & (df.loc[:,'Unnamed: 0'] <= 201512)
    df = df.loc[keep_dates, ['Mkt-RF', 'SMB', 'HML']]
    df['date'] = pd.date_range('1/1/1986', periods=360, freq='M')
    df.set_index('date', inplace=True)
    
    return(df)

In [5]:
def fama_and_french(stocks, factors):
    '''
    Calculate factor loadings, factor covariance, and idiosyncratic risk 
    for a single training period
    
    Args:
        stocks: stocks for a single period as a t by n numpy array (this is training period)
        factors: factors for a single period as a t by 3 (three factors) numpy array (this is training period)
    
    Returns:
        dictionary including factor loadings (n by k), factor covariance (k by k) and idiosyncratic risk (n by n) 
    '''
    
    # Number of stocks
    n = stocks.shape[1]
    
    # Factor loadings
    F = np.zeros(shape = (n, 4))

    # Idiosyncratic risk
    D = np.diag(np.zeros(shape = n))

    # Define the input for the regression
    X = factors
    X = sm.add_constant(X)

    # Loop through the stocks and calculate coefficients
    for i in range(n):

        # Select new stock each time
        y_i = stocks.iloc[:,i] 
        model_i = sm.OLS(y_i, X).fit() 

        # Including the alpha term and all three betas
        F[i,:] = model_i.params.values

        # Numerator is: (60 months) - (3 factors) + (1 constant)
        D[i,i] = np.sum(y_i - model_i.predict(X)) / 58 

    # The Factor Covariance Matrix
    Sigma_tilde = np.cov(F, rowvar=False)
    
    return(F, Sigma_tilde, D)

In [7]:
def get_equal_weights(n):
    '''
    Get a vector of equal weights
    
    Args:
        n: How many stocks are in your portfolio
        
    Returns:
        a n-by-1 matrix of 1/n weights
    '''
    
    x = np.asmatrix(np.ones(shape = (n, 1))) / n
    return(x)

In [8]:
def ret_cov_est(training):
    '''
    Estimate the returns and covariance for the training period.
    
    Args:
        training: a n stocks by t periods data-frame with a date-time index
    
    Returns:
        A tuple with a n-by-1 matrix of estimated returns 
        and a n-by-n matrix of estimated covariance. n is the number of stocks.
    '''
    r_hat = np.asmatrix(np.mean(training)).T
    Sigma = np.asmatrix(np.cov(training, rowvar=False))
    return(r_hat, Sigma)

In [9]:
def min_variance(r_hat, Sigma, tau, mu = None):
    '''
    For a given estimated return floor, get the weights that 
    minimize the variance with an l1 norm of the weights.
    
    Args:
        r_hat: estimated returns as a n-by-1 matrix
        Sigma: estimated covariance as a n-by-n matrix
        tau: tuning parameter. (Larger values promote more sparsity.)
        mu: the minimum return that the portfolio must beat
        
    Returns:
        optimal weights as a n-by-1 matrix
    '''
    
    # Define the variables
    n = r_hat.shape[0]
    x = Variable(n)
    ret = r_hat.T*x 
    risk = quad_form(x, Sigma)
    
    # Define the problem: Minimize variance for given returns threshold
    objective = Minimize(risk + tau*norm(x, 1))
    constraints = [sum_entries(x) == 1, x >= 0, ret >= mu]
    
    # Solve the problem
    prob = Problem(objective, constraints)
    prob.solve()
    
    # Get the values of interest
    minimal_risk = risk.value
    optimal_x = x.value
    
    # Handling rounding of x's
    optimal_x = np.around(optimal_x, decimals = 4)
    optimal_x =  np.asmatrix(optimal_x / sum(optimal_x))
    
    return(optimal_x)

In [10]:
def min_variance_factor(r_hat, F, Sigma_tilde, D, mu):
    
    # Define the variables
    n = F.shape[0]
    x = Variable(n)   # The weights
    f = F.T*x         # The factor loadings
    ret = r_hat.T*x 
    risk = quad_form(f, Sigma_tilde) + quad_form(x, D)
    
    # Solve the problem
    # prob = Problem(Maximize(ret - 2*risk), [sum_entries(x) == 1, x >= 0, ret >= mu])
    prob = Problem(Minimize(risk), [sum_entries(x) == 1, x >= 0, ret >= mu])
    prob.solve()
    
    # Get the values of interest
    minimal_risk = risk.value
    optimal_x = x.value
    
    # Handling rounding of x's
    optimal_x = np.around(optimal_x, decimals = 4)
    optimal_x =  np.asmatrix(optimal_x / sum(optimal_x))
    
    return(optimal_x)

In [11]:
def format_results(results, test, strategy, num_stocks):
    '''
    Format the results of the risk, returns into a data-frame 
    for plotting later
    
    Args: 
        results: numpy array with returns, risk results
        test: 1 or 0 depending on whether we are talking about in or 
                out of sample performance 
        strategy: string indicating which type of optimization approach was used
        num_stocks: the number of stocks selected for the portfolio
        
    Returns: 
        Dataframe with the following columns:
            return: the return for the portfolio
            risk: the risk for the portfolio
            strategy: see above for definition
            year: if training, the start year of the training period, 
                    if testing the evaluation year
            test: binary, 1 for out-of-sample, 0 for in-sample
            period: one of 6 values indicating which period of six years 
                        its in
            num_stocks: the number of stocks selected for the portfolio
    '''
    years = np.arange(1986, 2010 + 1)
    df = pd.DataFrame(results, columns = ['Return', 'Risk'])
    df['Sharpe'] = df['Return'] / df['Risk']
    df['strategy'] = strategy
    df['year'] = years + 6 if test == 1 else years
    df['test'] = test
    df['period'] = np.repeat(np.array([1, 2, 3, 4, 5]), 5)
    df['num_stocks'] = num_stocks
    return(df)

In [35]:
def risk_return(x, data):
    '''
    Calculate the risk, returns for a portfolio with a given set of weights
    
    Args:
        x: wieghts to be applied for the portfolio as a n-by-1 matrix
        data: an t-by-n pandas dataframe or numpy matrix.
        
    Returns:
        A tuple of two floats: one the risk and one the return
    '''
    
    r_hat = np.asmatrix(np.mean(data)).T
    Sigma = np.asmatrix(np.cov(data, rowvar=False))
    ret = (r_hat.T * x)[0,0]
    risk = (x.T * Sigma * x)[0,0]
    return(ret, risk)

In [None]:
# Set constants
START_YEAR = 1986
TAU = 1

# Create empty matrices to store the results

# In sample results
results_train_equal = np.asmatrix(np.zeros(shape = (25, 2)))
results_train_min_var = np.asmatrix(np.zeros(shape = (25, 2)))
results_train_factor = np.asmatrix(np.zeros(shape = (25, 2)))

# Out of sample results
results_test_equal = np.asmatrix(np.zeros(shape = (25, 2)))
results_test_min_var = np.asmatrix(np.zeros(shape = (25, 2)))
results_test_factor = np.asmatrix(np.zeros(shape = (25, 2)))

# Monthly returns (out of sample)
monthly_returns_equal = np.asmatrix(np.zeros(shape = (12, 25)))
monthly_returns_min_var = np.asmatrix(np.zeros(shape = (12, 25)))
monthly_returns_factor = np.asmatrix(np.zeros(shape = (12, 25)))

num_min_var = np.asmatrix(np.zeros(shape = (25, 1)))
num_factor = np.asmatrix(np.zeros(shape = (25, 1)))

In [101]:
# Get data
factors = read_factors()
stocks = read_stocks()

# Number of stocks
n = stocks.shape[1]

# Calculate the equal weights vector
x_equal = get_equal_weights(n)

for i in range(25):
    
    # Set the period starting point
    start_year = 1986 + i
    
    # Get train, test split
    training, testing = train_test_split(stocks, start_year)
    train_factor, test_factor = train_test_split(factors, start_year)
    
    # 1. Calculate the risk, returns for naive approach (annual and monthly)
    results_train_equal[i,:] = risk_return(x_equal, training)
    results_test_equal[i,:] = risk_return(x_equal, testing)
    
    ret_equal = results_train_equal[i,0]
    
    # 2. Estimate the risk, returns from the training data
    r_hat, Sigma = ret_cov_est(training)
    
    # 3. Calulate the factor loadings, factor covariance matrix, and 
    # idiosyncratic risk from the training data
    F, Sigma_tilde, D = fama_and_french(stocks, factors)
    
    # 4. Get the optimized weights
    # Note: I am using the naive portfolio for theshholds
    x_min_var = min_variance(r_hat = r_hat, Sigma = Sigma, tau = TAU, mu = ret_equal)
    x_factor = min_variance_factor(r_hat, F, Sigma_tilde, D, mu = ret_equal)
    
    # 5. Calculate the risk, returns for the optimized approachs
    results_train_min_var[i,:] = risk_return(x_min_var, training)
    results_test_min_var[i,:] = risk_return(x_min_var, testing)
    results_train_factor[i,:] = risk_return(x_factor, training)
    results_test_factor[i,:] = risk_return(x_factor, testing)
    
    # Calculate the monthly returns for each approach
    monthly_returns_equal[:,i] = testing.values * x_equal
    monthly_returns_min_var[:,i] = testing.values * x_min_var
    monthly_returns_factor[:,i] = testing.values * x_factor
    
    # Store the number of stocks selected for record keeping purposes
    num_min_var[i,0] = sum(x_min_var > 0)
    num_factor[i,0] = sum(x_factor > 0)

In [110]:
# Write out monthly results
def write_monthly(m, file_name):
    df = pd.DataFrame(m, columns=np.arange(1986, 2011) + 5)
    df.to_csv(file_name, index = False)
    
write_monthly(monthly_returns_equal, 'monthly_equal.csv')
write_monthly(monthly_returns_min_var, 'monthly_min_var.csv')
write_monthly(monthly_returns_factor, 'monthly_factor.csv')

In [13]:
# Format and write the results
results = [format_results(results_train_equal, 0, 'Equal', stocks.shape[1])
          , format_results(results_train_min_var, 0, 'Mean-Variance', num_min_var)
           , format_results(results_train_factor, 0, 'Factor', num_factor)
          , format_results(results_test_equal, 1, 'Equal', stocks.shape[1])
          , format_results(results_test_min_var, 1, 'Mean-Variance', num_min_var)
          , format_results(results_test_factor, 1, 'Factor', num_factor)]
results = pd.concat(results)

In [14]:
results.head()

Unnamed: 0,Return,Risk,Sharpe,strategy,year,test,period,num_stocks
0,0.010598,0.003017,3.512793,Equal,1986,0,1,556.0
1,0.014342,0.00303,4.733486,Equal,1987,0,1,556.0
2,0.016844,0.001467,11.481667,Equal,1988,0,1,556.0
3,0.01687,0.001372,12.292484,Equal,1989,0,1,556.0
4,0.013523,0.001395,9.697108,Equal,1990,0,1,556.0


In [18]:
results.to_csv('../../../data/factor_results.csv', index=False)

In [20]:
# Make table with the summarized results
test_summary = results.loc[results.test == 1,:].groupby(['period', 'strategy'])
test_summary = test_summary.mean().loc[:,['Return', 'Risk', 'Sharpe']]
test_summary.unstack(level = 1)

Unnamed: 0_level_0,Return,Return,Return,Risk,Risk,Risk,Sharpe,Sharpe,Sharpe
strategy,Equal,Factor,Mean-Variance,Equal,Factor,Mean-Variance,Equal,Factor,Mean-Variance
period,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1,0.019642,0.008105,0.01174,0.000722,0.00037,0.000312,36.392484,25.995459,44.900731
2,0.015121,0.008974,0.015265,0.001595,0.000965,0.000947,13.404485,8.883195,17.569605
3,0.015015,0.011059,0.016316,0.001705,0.001131,0.00087,12.163081,17.808607,21.114091
4,0.010098,0.004215,0.00542,0.003549,0.001294,0.001211,6.893818,10.208853,11.299365
5,0.009719,0.003903,0.013156,0.001445,0.000739,0.000762,11.065556,6.71581,23.768179
