In [2]:
import pandas as pd
import numpy as np
from numpy.linalg import inv
import sklearn
import matplotlib.pyplot as plt

In [3]:
# linear regression methodology
class LR:
    
    def fit(self, X_train, y_train):
        # create vector of ones...
        ones = np.ones(shape=len(X_train))[..., None]
        #...and add to feature matrix
        X = np.concatenate((ones, X_train), 1)
        #calculate coefficients using closed-form solution
        self.coeffs = inv(X.transpose().dot(X)).dot(X.transpose()).dot(y_train)
        
    def predict(self, X_test):
        ones = np.ones(shape=len(X_test))[..., None]
        X_test = np.concatenate((ones, X_test), 1)
        y_hat = X_test.dot(self.coeffs)
        return y_hat


In [4]:
# cross validation methodology
def k_fold(k, df):
    n = len(df)
    cut = int(n/k)
    folds = []
    start = 0
    end = cut
    for i in range(0, k):
        fold = df[start: end]
        folds.append(fold)
        start += int(n/k)
        end += int(n/k)
    return folds

def mse(actual, predicted):
    return -(((actual - predicted)**2).mean())

def cv(folds, response):
    test_errors = []
    for i in range(0, len(folds)):
        X_train = pd.DataFrame()
        Y_train = pd.Series(dtype=float)
        for j in range(0, len(folds)):
            if i == j:
                X_test = folds[j].drop([response], axis=1)
                Y_test = folds[j][response]
            if i != j:
                X_train = X_train.append(folds[j].drop([response], axis=1))
                Y_train = Y_train.append(folds[j][response])
    
        model = LR()
        model.fit(X_train, Y_train.transpose())
        Y_pred = pd.DataFrame()
        Y_pred = model.predict(X_test)
        error = mse(Y_test, Y_pred) 
        test_errors.append(error)
        rmse = np.sqrt(-sum(test_errors)/len(folds))
    return test_errors, rmse

In [5]:
def standardize(df): 
    #standardize only quantitative variables
    df_st = ((df.select_dtypes(float) - df.select_dtypes(float).mean()) / df.select_dtypes(float).std()) 

    #join the standardized quantites back with original df 
    df_st = df.select_dtypes(exclude=float).join(df_st)
    return df_st 

In [11]:
df = pd.read_csv("iowa_month_county.csv") 
df.set_index(["County", "Month-Year"], inplace=True)
df["Pack"] = df["Pack"].astype(float)
df["Population"] = df["Population"].astype(float)
df["Income Per Capita"] = df["Income Per Capita"].astype(float)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pack,State Bottle Retail,Population,Volume Sold (Gallons) Per Capita,Income Per Capita,Precincts,Votes,Republicans 2016,Democrats 2016,Green 2016,...,Black,Hispanic,Asian,Amerindian,Other,Median Age,Teen.births,Sexually.transmitted.infections,Unemployment,Violent.crime
County,Month-Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
adair,01-2012,354.0,369.0,7468.0,0.009092,42093.0,6.0,3759.0,65.336526,29.981378,0.345837,...,0.05,0.95,0.25,0.1,0.65,44.8,26.4,146.1,0.042,48.19
adair,02-2012,395.0,524.48,7468.0,0.009487,42093.0,6.0,3759.0,65.336526,29.981378,0.345837,...,0.05,0.95,0.25,0.1,0.65,44.8,26.4,146.1,0.042,48.19
adair,03-2012,320.0,284.23,7468.0,0.005108,42093.0,6.0,3759.0,65.336526,29.981378,0.345837,...,0.05,0.95,0.25,0.1,0.65,44.8,26.4,146.1,0.042,48.19
adair,04-2012,510.0,458.76,7468.0,0.009859,42093.0,6.0,3759.0,65.336526,29.981378,0.345837,...,0.05,0.95,0.25,0.1,0.65,44.8,26.4,146.1,0.042,48.19
adair,05-2012,430.0,587.6,7468.0,0.010932,42093.0,6.0,3759.0,65.336526,29.981378,0.345837,...,0.05,0.95,0.25,0.1,0.65,44.8,26.4,146.1,0.042,48.19


In [13]:
df = standardize(df)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df)
out = cv(folds, response)
print("rmse:", out[1])

rmse: 1.04757499860355


After performing Backwards stepwise, the rmse didn't get any lower.  So, our "most predictive model" is the model above with all features included. 