<Header> This is where we develop a baseline factor model that takes a set of factors and uses them in a multiple OLS regression to predict excess-return signals at the security level. These signals are absolute. </Header>

In [1]:
#First, import our packages for the database connection and dataframe access
import os
import warnings
from datetime import datetime, timedelta
from getpass import getpass

import mysql.connector
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [2]:
#Silence every Python warning and set pandas' min_rows display option (rows shown when a long frame is truncated) to 300
warnings.simplefilter("ignore")
pd.options.display.min_rows = 300

In [3]:
# Connect to MySQL database.
# The password is no longer stored in the notebook: it is read from the
# UBCTG_DB_PASSWORD environment variable, falling back to an interactive prompt.
# Host, user and schema can be overridden through the environment and otherwise
# keep the values this notebook has always used.
# NOTE(review): the password that used to be committed here should be rotated.
connection = mysql.connector.connect(
    host=os.environ.get("UBCTG_DB_HOST", "ubctg.con7266gcvin.us-east-2.rds.amazonaws.com"),
    user=os.environ.get("UBCTG_DB_USER", "admin"),
    password=os.environ.get("UBCTG_DB_PASSWORD") or getpass("MySQL password: "),
    database=os.environ.get("UBCTG_DB_NAME", "ubctg"),
)

In [4]:
#Define all params

# Sample window; both bounds are interpolated into the SQL BETWEEN filters of the queries in this notebook
start_date, end_date = '2017-12-01', '2021-01-31'

# Each pair is (factor table name, name of that table's date column).
# The two lists are consumed position by position, so they must stay aligned.
factor_list, date_col_name = map(list, zip(
    ("Consumer_Sentiment", "DATE"),
    ("Unemployment_Rate", "date"),
))

In [5]:
def execute_query(sql_query, params=None):
    """Run a SQL query on the shared ``connection`` and return the rows as a DataFrame.

    Parameters
    ----------
    sql_query : str
        The SQL statement to execute.
    params : sequence or mapping, optional
        Values bound to placeholders in ``sql_query`` by the database driver.
        When omitted, the statement is executed exactly as given.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record; column names come from the cursor description.

    Notes
    -----
    Only the cursor is closed. The module-level ``connection`` is left open so
    that later cells can keep issuing queries.
    """
    cursor = connection.cursor()
    try:
        # Execute the SQL query (only forward params when the caller supplied them)
        if params is None:
            cursor.execute(sql_query)
        else:
            cursor.execute(sql_query, params)

        # Fetch all rows from the result set
        rows = cursor.fetchall()

        # Extract column names from cursor description
        columns = [column[0] for column in cursor.description]

        return pd.DataFrame(rows, columns=columns)
    finally:
        # Always release the cursor, even when the query or the fetch fails
        cursor.close()


Here, we pull monthly returns across the stock universe for the window defined by `start_date` and `end_date` above (currently 2017-12-01 to 2021-01-31)

In [6]:
# Every monthly-return row whose Date lies inside the configured sample window
universe_df = execute_query(
    "SELECT * FROM `Monthly Returns` mr "
    f"WHERE Date BETWEEN '{start_date}' AND '{end_date}'"
)

Generate the number of observations per security in our table. This way we can remove those that do not have a full dataset

In [7]:
# Number of monthly-return rows per PERMNO inside the sample window
observation_df = execute_query(
    "SELECT COUNT(PERMNO), PERMNO FROM `Monthly Returns` mr "
    f"WHERE Date BETWEEN '{start_date}' AND '{end_date}' "
    "GROUP BY PERMNO"
)

Here, we remove securities that do not have enough observations (in our case, we keep only those with at least the median number of observations over the period)

In [8]:
#Keep only the securities whose observation count reaches the (integer-truncated) median count for the period.
#This is a blunt way to drop thin histories, so treat it with caution; better data-sufficiency filters exist.
observation_df_filtered = observation_df.loc[
    lambda counts: pd.to_numeric(counts['COUNT(PERMNO)'])
    >= int(pd.to_numeric(counts['COUNT(PERMNO)']).median())
]

#Inner join on PERMNO so that only the retained securities remain in the returns frame
universe_df_filtered = universe_df.merge(observation_df_filtered, on='PERMNO', how='inner')

In [9]:
# Display the securities that passed the observation-count filter, with their counts
observation_df_filtered

Unnamed: 0,COUNT(PERMNO),PERMNO
0,38,10026
1,38,10028
2,38,10032
3,38,10044
4,38,10051
5,38,10065
6,38,10066
7,38,10104
8,38,10107
9,38,10113


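We can now add our external factors (every table named in `factor_list`, e.g. consumer sentiment and the unemployment rate) to our model. We join each of them to our monthly returns dataframe on the date column for our OLS regression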

In [10]:
#The factor dates are at the beginning of month, so we convert the monthly-return dates to the beginning of the month as well, so that the two can be joined on 'date'
universe_df_filtered["date"] = pd.to_datetime(universe_df_filtered["date"]).dt.to_period('M').dt.to_timestamp()

#factor_list and date_col_name are consumed pairwise; fail early with a clear message if a factor has no date column name
if len(date_col_name) < len(factor_list):
    raise ValueError(
        f"date_col_name has {len(date_col_name)} entries but factor_list has {len(factor_list)}; "
        "every factor table needs the name of its date column"
    )

universe_df_with_external_factors = universe_df_filtered.copy(deep = True)
factor_col_names = []
for factor, date_col in zip(factor_list, date_col_name):

    #Pull the factor table for the sample window and normalise its date column to 'date'
    factor_df = execute_query(f"SELECT * FROM `{factor}` mr WHERE Date BETWEEN '{start_date}' AND '{end_date}'")
    factor_df = factor_df.rename(columns = {date_col : "date"})
    if "date" not in factor_df.columns:
        raise KeyError(f"Column '{date_col}' not found in factor table '{factor}'; columns are {list(factor_df.columns)}")
    factor_df["date"] = pd.to_datetime(factor_df["date"])

    #Every non-date column of the factor table is treated as a factor value column
    factor_col_names += [col for col in factor_df.columns if col != "date"]

    #Inner-join macro factors dataframe with universe dataframe on the 'date' column
    universe_df_with_external_factors = pd.merge(universe_df_with_external_factors, factor_df, on= 'date', how='inner')

display(universe_df_with_external_factors)

Unnamed: 0,PERMNO,date,COMNAM,NAICS,PERMCO,DCLRDT,DLPDT,PAYDT,RCRDDT,FACPR,...,ACCOMP,DLPRC,PRC,VOL,RET,BID,ASK,COUNT(PERMNO),UMCSENT,UNRATE
0,10026,2017-12-01,J & J SNACK FOODS CORP,311821,7976,2017-11-29,,2018-01-04,2017-12-13,0.0,...,,,151.83000,11819.0,0.007743,151.69000,151.83000,38,95.9,4.1
1,10026,2018-01-01,J & J SNACK FOODS CORP,311821,7976,,,,,,...,,,138.44000,17680.0,-0.088191,138.33000,138.44000,38,95.7,4.0
2,10026,2018-02-01,J & J SNACK FOODS CORP,311821,7976,,,,,,...,,,134.33000,16663.0,-0.029688,134.33000,134.53999,38,99.7,4.1
3,10026,2018-03-01,J & J SNACK FOODS CORP,311821,7976,2018-02-12,,2018-04-04,2018-03-15,0.0,...,,,136.56000,14156.0,0.019951,136.50000,136.59000,38,101.4,4.0
4,10026,2018-04-01,J & J SNACK FOODS CORP,311821,7976,,,,,,...,,,137.41000,12510.0,0.006224,137.44000,137.61000,38,98.8,4.0
5,10026,2018-05-01,J & J SNACK FOODS CORP,311821,7976,,,,,,...,,,141.62000,15898.0,0.030638,141.67000,141.91000,38,98,3.8
6,10026,2018-06-01,J & J SNACK FOODS CORP,311821,7976,2018-05-17,,2018-07-05,2018-06-14,0.0,...,,,152.47000,15552.0,0.079791,152.47000,152.78000,38,98.2,4.0
7,10026,2018-07-01,J & J SNACK FOODS CORP,311821,7976,,,,,,...,,,144.96001,15575.0,-0.049256,143.99001,144.97000,38,97.9,3.8
8,10026,2018-08-01,J & J SNACK FOODS CORP,311821,7976,,,,,,...,,,145.50000,13327.0,0.003725,145.44000,145.50000,38,96.2,3.8
9,10026,2018-09-01,J & J SNACK FOODS CORP,311821,7976,2018-08-29,,2018-10-04,2018-09-14,0.0,...,,,150.89000,13237.0,0.040137,150.78000,150.89000,38,100.1,3.7


In [11]:
# Show the factor value columns collected from the factor tables in the merge loop
factor_col_names

['UMCSENT', 'UNRATE']

In [12]:
#Returns have occasionally arrived as non-numeric values; coerce them so that anything unparseable becomes NaN
universe_df_with_external_factors["RET"] = pd.to_numeric(universe_df_with_external_factors["RET"], errors="coerce")

#Build the model input in one pass:
#  1. keep only identifier, date, returns and factor columns, in the order the factor model expects
#  2. drop duplicated rows to avoid faulty data
#  3. cast every factor column to float
universe_df_with_external_factors_filtered = (
    universe_df_with_external_factors[["PERMNO", "date", "RET", *factor_col_names]]
    .drop_duplicates()
    .astype({factor_col: float for factor_col in factor_col_names})
)

display(universe_df_with_external_factors_filtered)

Unnamed: 0,PERMNO,date,RET,UMCSENT,UNRATE
0,10026,2017-12-01,0.007743,95.9,4.1
1,10026,2018-01-01,-0.088191,95.7,4.0
2,10026,2018-02-01,-0.029688,99.7,4.1
3,10026,2018-03-01,0.019951,101.4,4.0
4,10026,2018-04-01,0.006224,98.8,4.0
5,10026,2018-05-01,0.030638,98.0,3.8
6,10026,2018-06-01,0.079791,98.2,4.0
7,10026,2018-07-01,-0.049256,97.9,3.8
8,10026,2018-08-01,0.003725,96.2,3.8
9,10026,2018-09-01,0.040137,100.1,3.7


In [13]:
#Defining the factor model that takes a dataframe of the required columns (unique identifier, date/index column, returns columns, and a set of factors)
def olsfactormodel(df_attached, lookbackwindow:int, returnsvec:bool):
    """Fit rolling per-security OLS regressions of next-period returns on the factor columns.

    The input frame is interpreted purely by column position: column 0 is the
    security identifier, column 1 the date/ordering column, column 2 the returns
    column, and every column from position 3 onward is a factor.

    For each identifier the rows are sorted by the date column and the returns
    column is shifted back by one row, so the factors at row t are regressed on
    the return at row t+1. Each regression uses ``lookbackwindow`` consecutive
    rows and its outputs are written to the row immediately after that window.

    Parameters
    ----------
    df_attached : pandas.DataFrame
        Frame laid out as described above. It is deep-copied and never modified.
    lookbackwindow : int
        Number of consecutive rows used in each regression.
    returnsvec : bool
        If truthy, only the last ``lookbackwindow + 1`` rows of each security are
        kept (so at most one regression is run per security) and only the row(s)
        carrying that security's latest date are returned. Otherwise every row
        of every security is returned.

    Returns
    -------
    pandas.DataFrame
        The input columns (with the returns column shifted) followed by
        ``Beta_<factor>``, ``T-Stat_<factor>``, ``R_squared`` and ``Constant B0``.
        Despite their name, the ``T-Stat_<factor>`` columns hold the regression
        p-values (``results.pvalues``); the name is kept so existing consumers of
        the output keep working. Rows that did not receive a regression keep 0.0
        in every output column. An empty DataFrame is returned when the input
        has no identifiers.
    """
    #Work on a private copy so the caller's frame is untouched
    df = df_attached.copy(deep=True)

    #Resolve the positional layout once, so no column name is hard-coded below
    date_col = df.columns[1]
    ret_col = df.columns[2]
    factorlist = list(df.columns[3:])

    #Output columns: one beta and one p-value column per factor, then the global statistics.
    #They are initialised as floats because they receive float regression outputs.
    beta_cols = ["Beta_" + factor for factor in factorlist]
    pvalue_cols = ["T-Stat_" + factor for factor in factorlist]
    for output_col in beta_cols + pvalue_cols + ["R_squared", "Constant B0"]:
        df[output_col] = 0.0

    #Integer positions of the output columns, used for positional writes
    beta_pos = [df.columns.get_loc(col) for col in beta_cols]
    pvalue_pos = [df.columns.get_loc(col) for col in pvalue_cols]
    rsquared_pos = df.columns.get_loc("R_squared")
    const_pos = df.columns.get_loc("Constant B0")

    identifiers = df.iloc[:, 0]

    #Per-security results are collected here and concatenated once at the end
    frames = []

    for ticker in identifiers.unique():

        #Observations of this security only, ascending by date
        ticker_df = df.loc[identifiers == ticker].sort_values(by=date_col).copy()

        #Shift returns back by one period: "t" factors are regressed on "t+1" returns, so the betas become forecasts
        ticker_df[ret_col] = ticker_df[ret_col].shift(-1)

        if returnsvec:
            #Only the most recent window plus the row that receives its results is needed
            ticker_df = ticker_df.tail(lookbackwindow + 1).copy()

        factor_values = ticker_df[factorlist]
        shifted_returns = ticker_df[ret_col]

        #One regression per full lookback window; results land on the row right after the window
        for start in range(len(ticker_df) - lookbackwindow):
            stop = start + lookbackwindow

            #has_constant="add" always prepends the intercept column, even when a factor is constant
            #inside the window, so params[0] is always the intercept and params[1:] line up with factorlist
            X = sm.add_constant(factor_values.iloc[start:stop], has_constant="add")
            Y = shifted_returns.iloc[start:stop]

            results = sm.OLS(Y, X).fit()

            #Plain arrays give unambiguous positional access
            params = np.asarray(results.params, dtype=float)
            pvalues = np.asarray(results.pvalues, dtype=float)

            ticker_df.iat[stop, const_pos] = params[0]
            ticker_df.iat[stop, rsquared_pos] = results.rsquared
            for k, (b_pos, p_pos) in enumerate(zip(beta_pos, pvalue_pos), start=1):
                ticker_df.iat[stop, b_pos] = params[k]
                ticker_df.iat[stop, p_pos] = pvalues[k]

        if returnsvec:
            #Keep only the row(s) at this security's latest date
            ticker_df = ticker_df[ticker_df[date_col] == ticker_df[date_col].max()]

        frames.append(ticker_df)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [18]:
# Fit the rolling factor model on the cleaned frame with a 34-period lookback, keeping every row (returnsvec=False)
# Takes 1 min 34 sec with look_back_window = 34
returnsdf = olsfactormodel(
    universe_df_with_external_factors_filtered,
    lookbackwindow=34,
    returnsvec=False,
)

KeyboardInterrupt: 

In [19]:
# Display the model output; rows that did not receive a regression show 0 in the regression columns
returnsdf

Unnamed: 0,PERMNO,date,RET,UMCSENT,UNRATE,Beta_UMCSENT,Beta_UNRATE,T-Stat_UMCSENT,T-Stat_UNRATE,R_squared,Constant B0
0,10026,2017-12-01,-0.088191,95.9,4.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,10026,2018-01-01,-0.029688,95.7,4.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,10026,2018-02-01,0.019951,99.7,4.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,10026,2018-03-01,0.006224,101.4,4.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,10026,2018-04-01,0.030638,98.8,4.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,10026,2018-05-01,0.079791,98.0,3.8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,10026,2018-06-01,-0.049256,98.2,4.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,10026,2018-07-01,0.003725,97.9,3.8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,10026,2018-08-01,0.040137,96.2,3.8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,10026,2018-09-01,0.034926,100.1,3.7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [21]:
# Inspect a single row by position (row 35) to look at its regression outputs
returnsdf.iloc[35, :]

PERMNO                          10026
date              2020-11-01 00:00:00
RET                          0.072598
UMCSENT                          76.9
UNRATE                            6.7
Beta_UMCSENT                 -0.00348
Beta_UNRATE                 -0.007922
T-Stat_UMCSENT                0.25249
T-Stat_UNRATE                0.407135
R_squared                    0.050323
Constant B0                  0.368231
Name: 35, dtype: object