In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import inv
import sklearn
import matplotlib.pyplot as plt

In [2]:
# linear regression methodology
class LR:
    
    def fit(self, X_train, y_train):
        # create vector of ones...
        ones = np.ones(shape=len(X_train))[..., None]
        #...and add to feature matrix
        X = np.concatenate((ones, X_train), 1)
        #calculate coefficients using closed-form solution
        self.coeffs = inv(X.transpose().dot(X)).dot(X.transpose()).dot(y_train)
        
    def predict(self, X_test):
        ones = np.ones(shape=len(X_test))[..., None]
        X_test = np.concatenate((ones, X_test), 1)
        y_hat = X_test.dot(self.coeffs)
        return y_hat


In [16]:
# cross validation methodology
def k_fold(k, df):
    n = len(df)
    cut = int(n/k)
    folds = []
    start = 0
    end = cut
    for i in range(0, k):
        fold = df[start: end]
        folds.append(fold)
        start += int(n/k)
        end += int(n/k)
    return folds

def mse(actual, predicted):
    return -(((actual - predicted)**2).mean())

def cv(folds, response):
    test_errors = []
    for i in range(0, len(folds)):
        X_train = pd.DataFrame()
        Y_train = pd.Series(dtype=float)
        for j in range(0, len(folds)):
            if i == j:
                X_test = folds[j].drop([response], axis=1)
                Y_test = folds[j][response]
            if i != j:
                X_train = X_train.append(folds[j].drop([response], axis=1))
                Y_train = Y_train.append(folds[j][response])
    
        model = LR()
        model.fit(X_train, Y_train.transpose())
        Y_pred = pd.DataFrame()
        Y_pred = model.predict(X_test)
        error = mse(Y_test, Y_pred) 
        test_errors.append(error)
        rmse = np.sqrt(-sum(test_errors)/len(folds))
        rss = (Y_test - Y_pred)**2
        rss = rss.sum()
        n = len(folds[j])
        p = len(X_train.columns)
        BIC = p * np.log(n) + n * np.log(rss/n)
    return rmse, BIC

In [8]:
# Standardizing the Dataframe

def standardize(df): 
    #standardize only quantitative variables
    df_st = ((df.select_dtypes(float) - df.select_dtypes(float).mean()) / df.select_dtypes(float).std()) 

    #join the standardized quantites back with original df 
    df_st = df.select_dtypes(exclude=float).join(df_st)
    return df_st 

In [9]:
df = pd.read_csv("iowa_month_county.csv") 
df.set_index(["County", "Month-Year"], inplace=True)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pack,State Bottle Retail,Population,Volume Sold (Gallons) Per Capita,Income Per Capita,Precincts,Votes,Republicans 2016,Democrats 2016,Green 2016,...,Black,Hispanic,Asian,Amerindian,Other,Median Age,Teen.births,Sexually.transmitted.infections,Unemployment,Violent.crime
County,Month-Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
adair,01-2012,354,369.0,7468,0.009092,42093,6.0,3759.0,65.336526,29.981378,0.345837,...,0.05,0.95,0.25,0.1,0.65,44.8,26.4,146.1,0.042,48.19
adair,02-2012,395,524.48,7468,0.009487,42093,6.0,3759.0,65.336526,29.981378,0.345837,...,0.05,0.95,0.25,0.1,0.65,44.8,26.4,146.1,0.042,48.19
adair,03-2012,320,284.23,7468,0.005108,42093,6.0,3759.0,65.336526,29.981378,0.345837,...,0.05,0.95,0.25,0.1,0.65,44.8,26.4,146.1,0.042,48.19
adair,04-2012,510,458.76,7468,0.009859,42093,6.0,3759.0,65.336526,29.981378,0.345837,...,0.05,0.95,0.25,0.1,0.65,44.8,26.4,146.1,0.042,48.19
adair,05-2012,430,587.6,7468,0.010932,42093,6.0,3759.0,65.336526,29.981378,0.345837,...,0.05,0.95,0.25,0.1,0.65,44.8,26.4,146.1,0.042,48.19


In [42]:
# 1 predictor: Pack
df_test = df[["Pack","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
PCK = out[1]

rmse: 0.9931250435879557
     
BIC: -1089.379654239777


In [43]:
# 1 predictor: State Bottle Retail
df_test = df[["State Bottle Retail","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
SBR = out[1]

rmse: 1.0147376266442851
     
BIC: -1100.550282332577


In [44]:
# 1 predictor: Population
df_test = df[["Population","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
PPLN = out[1]

rmse: 0.9803072999709112
     
BIC: -997.8025275470935


In [45]:
# 1 predictor: Income per capita
df_test = df[["Income Per Capita","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
IPC = out[1]

rmse: 1.0046163613540184
     
BIC: -747.322570246465


In [46]:
# 1 predictor: Precincts
df_test = df[["Precincts","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
PRCT = out[1]

rmse: 0.9791782383177384
     
BIC: -1007.5794563205553


In [47]:
# 1 predictor: Votes
df_test = df[["Votes","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
VOTE = out[1]

rmse: 0.9779765465219404
     
BIC: -990.0870929517489


In [48]:
# 1 predictor: Republicans 2016
df_test = df[["Republicans 2016","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
RPB = out[1]

rmse: 0.9695591820343095
     
BIC: -1010.8328624992433


In [49]:
# 1 predictor: Democrats 2016
df_test = df[["Democrats 2016","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
DMC = out[1]

rmse: 0.9656914257785936
     
BIC: -1024.9519346794254


In [50]:
# 1 predictor: Green 2016
df_test = df[["Green 2016","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
GRN = out[1]

rmse: 1.0295210353695152
     
BIC: -677.0933235600629


In [51]:
# 1 predictor: Libertarians 2016
df_test = df[["Libertarians 2016","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
LBR = out[1]

rmse: 1.0129082534079275
     
BIC: -633.7540720894933


In [52]:
# 1 predictor: At Least High School Diploma
df_test = df[["At Least High School Diploma","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
HSD = out[1] 

rmse: 0.9882379329866818
     
BIC: -773.86745627841


In [53]:
# 1 predictor: At Least Bachelors's Degree
df_test = df[["At Least Bachelors's Degree","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
BD = out[1]

rmse: 0.9636097965659541
     
BIC: -870.1873635624872


In [54]:
# 1 predictor: School Enrollment
df_test = df[["School Enrollment","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
SE = out[1]

rmse: 1.0088735821379569
     
BIC: -627.7869859947737


In [55]:
# 1 predictor: Median Earnings 2010
df_test = df[["Median Earnings 2010","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
ME = out[1]

rmse: 1.003198925749258
     
BIC: -683.8031488398211


In [56]:
# 1 predictor: Poverty.Rate.below.federal.poverty.threshold
df_test = df[["Poverty.Rate.below.federal.poverty.threshold","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
PR = out[1]

rmse: 1.0044321172083146
     
BIC: -700.024319293957


In [57]:
# 1 predictor: White
df_test = df[["White","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
WHT = out[1]

rmse: 0.9884284113760441
     
BIC: -824.7969153654225


In [58]:
# 1 predictor: Black
df_test = df[["Black","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
BLCK = out[1]

rmse: 0.9527654108797137
     
BIC: -1117.3197110580072


In [60]:
# 1 predictor: Hispanic
df_test = df[["Hispanic","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
HSP = out[1]

rmse: 1.0039054783656416
     
BIC: -702.0821818600314


In [61]:
# 1 predictor: Median age
df_test = df[["Median Age","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
MA = out[1]

rmse: 1.0000476606297906
     
BIC: -818.2722493807632


In [62]:
# 1 predictor: Unemployment
df_test = df[["Unemployment","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])
UE = out[1]

rmse: 1.0039551852338024
     
BIC: -670.6502939527218


In [65]:
data = {"Predictors": ["Pack","State Bottle Retail","Population","Income Per Capita","Precincts", "Votes",
                      "Republicans 2016", "Democrats 2016", "Green 2016", "Libertarian 2016",
                       "At least high school diploma", "At least bachelors degree","school enrollment",
                      "median earnings", "poverty rate","white", "black","hispanic", "median earnings","unemployment"],
       "BIC": [PCK,SBR,PPLN,IPC, PRCT, VOTE, RPB,DMC, GRN, LBR,HSD, BD, SE,ME,PR, WHT,BLCK,HSP,MA, UE],}
one_predictor = pd.DataFrame(data, columns = ["Predictors", "BIC"])
one_predictor

Unnamed: 0,Predictors,BIC
0,Pack,-1089.379654
1,State Bottle Retail,-1100.550282
2,Population,-997.802528
3,Income Per Capita,-747.32257
4,Precincts,-1007.579456
5,Votes,-990.087093
6,Republicans 2016,-1010.832862
7,Democrats 2016,-1024.951935
8,Green 2016,-677.093324
9,Libertarian 2016,-633.754072


In [66]:
# 2 predictors: Unemployment, School Enrollment
df_test = df[["Unemployment","School Enrollment","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])

rmse: 1.01082297886988
     
BIC: -600.9435936438922


In [69]:
# 2 predictors: Unemployment, Green
df_test = df[["Unemployment","Green 2016","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])

rmse: 1.0298012943431492
     
BIC: -663.7212789387083


In [70]:
# 2 predictor: Unemployment, Libertarians
df_test = df[["Unemployment","Libertarians 2016","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])

rmse: 1.0150510399398829
     
BIC: -619.119876679664


In [71]:
# 2 predictors: Unemployment, Income Per CApita
df_test = df[["Unemployment","Income Per Capita","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])

rmse: 1.0033596325643706
     
BIC: -788.101477812162


In [74]:
# 2 predictor: Unemployment, Poverty rate
df_test = df[["Unemployment","Poverty.Rate.below.federal.poverty.threshold","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])

rmse: 1.0070637622640102
     
BIC: -674.7842028741434


In [81]:
# 2 predictor: Unemployment, Votes
df_test = df[["Unemployment","Votes","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])

rmse: 0.9798149325549946
     
BIC: -973.4770752311853


# Best 2 predictor model: Unemployment, School Enrollment

In [77]:
# 3 predictors: Unemployment, School Enrollment, Libertarian
df_test = df[["Unemployment","School Enrollment","Libertarians 2016","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])

rmse: 1.0187539627257054
     
BIC: -567.0184725472167


In [78]:
# 3 predictors: Unemployment, School Enrollment, Green
df_test = df[["Unemployment","School Enrollment","Green 2016","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])

rmse: 1.035331992296831
     
BIC: -597.2161312407978


In [79]:
# 3 predictors: Unemployment, School Enrollment, Income Per Capita
df_test = df[["Unemployment","School Enrollment","Income Per Capita","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])

rmse: 1.0097936949532549
     
BIC: -714.7827962979886


In [80]:
# 3 predictors: Unemployment, School Enrollment, Poverty Rate
df_test = df[["Unemployment","School Enrollment","Poverty.Rate.below.federal.poverty.threshold","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])

rmse: 1.0153374849781034
     
BIC: -599.3055155552553


# Best 3 Predictor Model: Unemployment, School Enrollment, Libertarian

In [82]:
# 4 predictors: Unemployment, School Enrollment, Libertarian, Green
df_test = df[["Unemployment","School Enrollment","Libertarians 2016","Green 2016","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])

rmse: 1.0474813154414342
     
BIC: -563.9813885006845


In [83]:
# 4 predictors: Unemployment, School Enrollment, Libertarian, Income Per capita
df_test = df[["Unemployment","School Enrollment","Libertarians 2016","Income Per Capita",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])

rmse: 1.0231361457927983
     
BIC: -600.7886867209256


In [84]:
# 4 predictors: Unemployment, School Enrollment, Libertarian, Poverty Rate
df_test = df[["Unemployment","School Enrollment","Libertarians 2016","Poverty.Rate.below.federal.poverty.threshold",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])

rmse: 1.021836768434966
     
BIC: -575.9351005115527


# Best 4 predictor model: Unemployment, School Enrollment, Libertarian, Green

In [86]:
# 5 predictors: Unemployment, School Enrollment, Libertarian, Green, Income Per capita
df_test = df[["Unemployment","School Enrollment","Libertarians 2016","Green 2016","Income Per Capita",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])

rmse: 1.0683791022542632
     
BIC: -625.6250962756321


In [87]:
# 5 predictors: Unemployment, School Enrollment, Libertarian, Green, Poverty rate
df_test = df[["Unemployment","School Enrollment","Libertarians 2016","Green 2016","Poverty.Rate.below.federal.poverty.threshold",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])

rmse: 1.053142027711145
     
BIC: -557.2540862527572


In [88]:
# 6 predictors: Unemployment, School Enrollment, Libertarian, Green, Poverty rate
df_test = df[["Unemployment","School Enrollment","Libertarians 2016","Green 2016","Poverty.Rate.below.federal.poverty.threshold",
              "Income Per Capita","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("rmse:", out[0])
print("     ")
print("BIC:", out[1])

rmse: 1.0490313120698354
     
BIC: -702.6874145850519


# Most Interpretable model:  Unemployment, School Enrollment, Libertarian, Green, Poverty rate