In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import inv
import sklearn
import matplotlib.pyplot as plt

In [2]:
# linear regression methodology
class LR:
    
    def fit(self, X_train, y_train):
        # create vector of ones...
        ones = np.ones(shape=len(X_train))[..., None]
        #...and add to feature matrix
        X = np.concatenate((ones, X_train), 1)
        #calculate coefficients using closed-form solution
        self.coeffs = inv(X.transpose().dot(X)).dot(X.transpose()).dot(y_train)
        
    def predict(self, X_test):
        ones = np.ones(shape=len(X_test))[..., None]
        X_test = np.concatenate((ones, X_test), 1)
        y_hat = X_test.dot(self.coeffs)
        return y_hat


In [3]:
# sklear linear regression example
from sklearn.linear_model import LinearRegression

data_dir = "https://dlsun.github.io/pods/data/"
bordeaux_df = pd.read_csv(data_dir + "bordeaux.csv",
                          index_col="year")

bordeaux_train = bordeaux_df.loc[:1980].copy()
bordeaux_test = bordeaux_df.loc[1980:].copy()

X_train = bordeaux_train[["age"]]
X_test = bordeaux_test[["age"]]
y_train = bordeaux_train["price"]

model = LinearRegression()
model.fit(X=X_train, y=y_train)
model.predict(X=X_test)

array([13.5724999 , 12.41648163, 11.26046336, 10.1044451 ,  8.94842683,
        7.79240856,  6.6363903 ,  5.48037203,  4.32435376,  3.1683355 ,
        2.01231723,  0.85629897])

In [4]:
# our linear regression example on same data
model = LR()
model.fit(X_train, y_train)
model.predict(X_test)

array([13.5724999 , 12.41648163, 11.26046336, 10.1044451 ,  8.94842683,
        7.79240856,  6.6363903 ,  5.48037203,  4.32435376,  3.1683355 ,
        2.01231723,  0.85629897])

In [5]:
# cross validation methodology
def k_fold(k, df):
    n = len(df)
    cut = int(n/k)
    folds = []
    start = 0
    end = cut
    for i in range(0, k):
        fold = df[start: end]
        folds.append(fold)
        start += int(n/k)
        end += int(n/k)
    return folds

def mse(actual, predicted):
    return -(((actual - predicted)**2).mean())

def cv(folds, response):
    test_errors = []
    for i in range(0, len(folds)):
        X_train = pd.DataFrame()
        Y_train = pd.Series(dtype=float)
        for j in range(0, len(folds)):
            if i == j:
                X_test = folds[j].drop([response], axis=1)
                Y_test = folds[j][response]
            if i != j:
                X_train = X_train.append(folds[j].drop([response], axis=1))
                Y_train = Y_train.append(folds[j][response])
    
        model = LR()
        model.fit(X_train, Y_train.transpose())
        Y_pred = pd.DataFrame()
        Y_pred = model.predict(X_test)
        error = mse(Y_test, Y_pred) 
        test_errors.append(error)
        rmse = np.sqrt(-sum(test_errors)/len(folds))
    return test_errors, rmse

In [6]:
# sklearn cross validation example
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

data_dir = "https://dlsun.github.io/pods/data/"
bordeaux_df = pd.read_csv(data_dir + "bordeaux.csv",
                          index_col="year")

bordeaux_train = bordeaux_df.loc[:1980].copy()
bordeaux_train["log(price)"] = np.log(bordeaux_train["price"])
bordeaux_train.drop(["price"], axis=1, inplace=True)

model = LinearRegression()
scores = cross_val_score(model, 
                         X=bordeaux_train[["win", "summer"]],
                         y=bordeaux_train["log(price)"],
                         scoring="neg_mean_squared_error",
                         cv=2)

print(scores)
print(np.sqrt(-scores).mean())

[-0.29898833 -0.27833086]
0.5371844859149083


In [7]:
# our cross validation on same data
bordeaux_train2 = bordeaux_train[["win", "summer", "log(price)"]]
folds = k_fold(2, bordeaux_train2)
errs, rmse = cv(folds, "log(price)")
print(errs)
print(rmse)


[-0.3261536583404905, -0.28766838705119696]
0.5539955078300217


In [8]:
# Standardizing the Dataframe

def standardize(df): 
    #standardize only quantitative variables
    df_st = ((df.select_dtypes(float) - df.select_dtypes(float).mean()) / df.select_dtypes(float).std()) 

    #join the standardized quantites back with original df 
    df_st = df.select_dtypes(exclude=float).join(df_st)
    return df_st 

In [10]:
df = pd.read_csv("iowa_month_county_main.csv") 
df.set_index(["County", "Month-Year"], inplace=True)

halfway = len(df)//2

df_train = df.iloc[:halfway].copy()
df_test = df.iloc[halfway:].copy()

In [11]:
df_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pack,State Bottle Cost,State Bottle Retail,Bottles Sold Per Capita,Alcohol Expense Per Capita,Volume Sold (Gallons) Per Capita,Consolidated_Brandy,Consolidated_Gin,Consolidated_Rum,Consolidated_Schnapps,Consolidated_Tequila,Consolidated_Vodka,Consolidated_Whiskey,Population,Income Per Capita
County,Month-Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Adair,01-2012,10.411765,7.212647,10.852941,0.000914,0.009719,0.000267,0.0,0.029412,0.088235,0.176471,0.0,0.235294,0.323529,7468,42093
Adair,02-2012,10.394737,9.180263,13.802105,0.000927,0.011195,0.00025,0.052632,0.105263,0.052632,0.105263,0.0,0.184211,0.394737,7468,42093
Adair,03-2012,12.307692,7.256923,10.931923,0.000917,0.009475,0.000196,0.076923,0.076923,0.153846,0.153846,0.0,0.153846,0.346154,7468,42093
Adair,04-2012,12.142857,7.265238,10.922857,0.000969,0.010872,0.000235,0.071429,0.166667,0.047619,0.047619,0.0,0.214286,0.309524,7468,42093
Adair,05-2012,10.238095,9.279286,13.990476,0.001011,0.013036,0.00026,0.02381,0.047619,0.0,0.047619,0.02381,0.285714,0.47619,7468,42093


In [12]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pack,State Bottle Cost,State Bottle Retail,Bottles Sold Per Capita,Alcohol Expense Per Capita,Volume Sold (Gallons) Per Capita,Consolidated_Brandy,Consolidated_Gin,Consolidated_Rum,Consolidated_Schnapps,Consolidated_Tequila,Consolidated_Vodka,Consolidated_Whiskey,Population,Income Per Capita
County,Month-Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Adair,01-2012,10.411765,7.212647,10.852941,0.000914,0.009719,0.000267,0.0,0.029412,0.088235,0.176471,0.0,0.235294,0.323529,7468,42093
Adair,02-2012,10.394737,9.180263,13.802105,0.000927,0.011195,0.00025,0.052632,0.105263,0.052632,0.105263,0.0,0.184211,0.394737,7468,42093
Adair,03-2012,12.307692,7.256923,10.931923,0.000917,0.009475,0.000196,0.076923,0.076923,0.153846,0.153846,0.0,0.153846,0.346154,7468,42093
Adair,04-2012,12.142857,7.265238,10.922857,0.000969,0.010872,0.000235,0.071429,0.166667,0.047619,0.047619,0.0,0.214286,0.309524,7468,42093
Adair,05-2012,10.238095,9.279286,13.990476,0.001011,0.013036,0.00026,0.02381,0.047619,0.0,0.047619,0.02381,0.285714,0.47619,7468,42093


In [13]:
y_train = df_train[["Volume Sold (Gallons) Per Capita"]]
x_train = df_train[["State Bottle Cost"]]
x_test = df_train[["State Bottle Cost"]]
model_sk = LinearRegression()
model.fit(x_train, y_train)
model.predict(y_train)[0:5]

array([[0.0001966],
       [0.0001966],
       [0.0001966],
       [0.0001966],
       [0.0001966]])

In [33]:
# 2 predictor: Alcohol Type , Pack
df_test = df[["Pack","Consolidated_Brandy","Consolidated_Gin","Consolidated_Rum",
              "Consolidated_Schnapps","Consolidated_Tequila","Consolidated_Vodka","Consolidated_Whiskey",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
two_pr1 = out[1]

test errors: [-0.9254867554495261, -1.2307674008360305, -0.8498307518312769, -1.0110126386638136, -0.7771119786875998]
     
rmse: 0.9792047309391685


In [15]:
# 2 predictor: Alcohol Type , Population
df_test = df[["Population","Consolidated_Brandy","Consolidated_Gin","Consolidated_Rum",
              "Consolidated_Schnapps","Consolidated_Tequila","Consolidated_Vodka","Consolidated_Whiskey",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test errors: [-0.8773694342182153, -0.8631081291213494, -0.7725189018451611, -1.6866140664201947, -0.6821979193933909]
     
rmse: 0.9881101609636763


In [34]:
# 2 predictor: Alcohol Type ,State Bottle Cost
df_test = df[["State Bottle Cost","Consolidated_Brandy","Consolidated_Gin","Consolidated_Rum",
              "Consolidated_Schnapps","Consolidated_Tequila","Consolidated_Vodka","Consolidated_Whiskey",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
two_pr2 = out[1]

test errors: [-0.9537573536958859, -0.9670576108582702, -0.9556266713260116, -1.1068673321201836, -0.8545164094117057]
     
rmse: 0.9836488578158425


In [18]:
# 2 predictor: Alcohol Type , Income Per Capita
df_test = df[["Income Per Capita","Consolidated_Brandy","Consolidated_Gin","Consolidated_Rum",
              "Consolidated_Schnapps","Consolidated_Tequila","Consolidated_Vodka","Consolidated_Whiskey",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test errors: [-0.9543057367481765, -1.0311798863230497, -0.9239821870772892, -1.1106637354843147, -0.8544814985412479]
     
rmse: 0.9873816935890678


# 3 predictors

In [19]:
# 3 predictor: Alcohol Type , pack, population
df_test = df[["Pack","Population","Consolidated_Brandy","Consolidated_Gin","Consolidated_Rum",
              "Consolidated_Schnapps","Consolidated_Tequila","Consolidated_Vodka","Consolidated_Whiskey",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test errors: [-0.8589121089433139, -1.0304628871696424, -0.7298311503668754, -1.5752743765236101, -0.643924348708762]
     
rmse: 0.9837077687720275


In [35]:
# 3 predictor: Alcohol Type , pack, state bottle cost
df_test = df[["Pack","State Bottle Cost","Consolidated_Brandy","Consolidated_Gin","Consolidated_Rum",
              "Consolidated_Schnapps","Consolidated_Tequila","Consolidated_Vodka","Consolidated_Whiskey",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
three_pr1 = out[1]

test errors: [-0.8997131749854642, -1.1926797978808583, -0.8629289520893049, -0.9731970048233415, -0.7646743190782398]
     
rmse: 0.968833654334655


In [22]:
# 3 predictor: Alcohol Type , pack, income per capita
df_test = df[["Pack","Income Per Capita","Consolidated_Brandy","Consolidated_Gin","Consolidated_Rum",
              "Consolidated_Schnapps","Consolidated_Tequila","Consolidated_Vodka","Consolidated_Whiskey",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test errors: [-0.9250963099851784, -1.2521439823587541, -0.8463275106634496, -1.0098546470821597, -0.7767475879032553]
     
rmse: 0.980833323046561


# 4 Predictors 

In [23]:
# 4 predictor: Alcohol Type , pack, state bottle cost, population
df_test = df[["Pack","State Bottle Cost","Population","Consolidated_Brandy","Consolidated_Gin","Consolidated_Rum",
              "Consolidated_Schnapps","Consolidated_Tequila","Consolidated_Vodka","Consolidated_Whiskey",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test errors: [-0.8461424802584689, -1.01506291889793, -0.7521404432129077, -1.5285517080241617, -0.6397305370500027]
     
rmse: 0.9779190239936506


In [36]:
# 4 predictor: Alcohol Type , pack, state bottle cost, income per capita
df_test = df[["Pack","State Bottle Cost","Income Per Capita","Consolidated_Brandy","Consolidated_Gin","Consolidated_Rum",
              "Consolidated_Schnapps","Consolidated_Tequila","Consolidated_Vodka","Consolidated_Whiskey",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
four_pr1 = out[1]

test errors: [-0.8927918873580926, -1.2110895070212322, -0.8556926371410782, -0.9701630277634348, -0.765980931612602]
     
rmse: 0.9690942153265016


# 5 predictors

In [37]:
# 5 predictor: Alcohol Type , pack, state bottle cost, income per capita
df_test = df[["Pack","State Bottle Cost","Income Per Capita","Consolidated_Brandy","Consolidated_Gin","Consolidated_Rum",
              "Consolidated_Schnapps","Consolidated_Tequila","Consolidated_Vodka","Consolidated_Whiskey",
              "Population","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
five_pr = out[1]

test errors: [-0.8313726962489184, -1.0384801081593373, -0.7369555498648784, -1.5480487509304177, -0.6382749340559335]
     
rmse: 0.9790946878887135


In [32]:
data = {"Predictors": ["State Bottle Cost", "Pack","Population","Income Per Capita", "Alcohol Type"],
       "RMSE": [SBC, PCK,PPLN,IPC,ALC],}
one_predictor = pd.DataFrame(data, columns = ["Predictors", "RMSE"])
one_predictor

Unnamed: 0,Predictors,RMSE
0,State Bottle Cost,1.004917
1,Pack,0.997771
2,Population,0.99858
3,Income Per Capita,1.006193
4,Alcohol Type,0.984455


In [39]:
data = {"Test":["Test 1", "Test 2", "Test 3", "Test 4", "Test 5"],
       "RMSE": [two_pr1, two_pr2, three_pr1, four_pr1, five_pr],
       "Predictors": ["ALC, PCK",
                     "ALC, SBC",
                     "ALC,PCK,SBC",
                     "ALC,PCK,SBC,IPC",
                     "ALC,PCK,SBC,IPC,PPLN"],}
models = pd.DataFrame(data, columns = ["Test", "RMSE", "Predictors"])
models

Unnamed: 0,Test,RMSE,Predictors
0,Test 1,0.979205,"ALC, PCK"
1,Test 2,0.983649,"ALC, SBC"
2,Test 3,0.968834,"ALC,PCK,SBC"
3,Test 4,0.969094,"ALC,PCK,SBC,IPC"
4,Test 5,0.979095,"ALC,PCK,SBC,IPC,PPLN"


In [40]:
data = {"Predictor": ["SBC","PCK","PPLN","IPC","ALC"],
       "Mapping": ["State Bottle Cost","Pack","Population","Income Per Capita","Alcohol Type"],}
mapping = pd.DataFrame(data, columns = ["Predictor", "Mapping"])
mapping

Unnamed: 0,Predictor,Mapping
0,SBC,State Bottle Cost
1,PCK,Pack
2,PPLN,Population
3,IPC,Income Per Capita
4,ALC,Alcohol Type


# Testing Models

In [31]:
# 1 predictor: State Bottle Cost
df_test = df[["State Bottle Cost","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
SBC = out[1]

#test_errs = []
#for i in range(5,50):
#    folds = k_fold(i, df_test)
#    out = cv(folds, response)
#    test_errs.append(out[1])
#plt.plot(test_errs)
#print("K-Folds = ", test_errs.index(min(test_errs)))
#print("Minimum RMSE = ", min(test_errs))

test errors: [-0.9657166812477781, -1.0177505840751349, -0.9654639247161498, -1.170793922382426, -0.9295642751924557]
     
rmse: 1.004916851049274


In [None]:
# 1 predictor: State Bottle Retail
df_test = df[["State Bottle Retail","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
SBR = out[1]

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [None]:
# 1 predictor: Bottles Sold Per Capita
df_test = df[["Bottles Sold Per Capita","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
BSPC = out[1]

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [27]:
# 1 predictor: Pack
df_test = df[["Pack","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
PCK = out[1]

#test_errs = []
#for i in range(5,50):
#    folds = k_fold(i, df_test)
#    out = cv(folds, response)
#    test_errs.append(out[1])
#plt.plot(test_errs)
#print("K-Folds = ", test_errs.index(min(test_errs)))
#print("Minimum RMSE = ", min(test_errs))

test errors: [-0.9402652751972571, -1.290238128623381, -0.8507876228891282, -1.0597884378553641, -0.8366571106387785]
     
rmse: 0.9977711736870242


In [None]:
# 1 predictor: Alcohol Expense Per Capita
df_test = df[["Alcohol Expense Per Capita","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
AEPC = out[1]

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [28]:
# 1 predictor: Population
df_test = df[["Population","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
PPLN = out[1]

#test_errs = []
#for i in range(5,50):
#    folds = k_fold(i, df_test)
#    out = cv(folds, response)
#    test_errs.append(out[1])
#plt.plot(test_errs)
#print("K-Folds = ", test_errs.index(min(test_errs)))
#print("Minimum RMSE = ", min(test_errs))

test errors: [-0.8811549388057985, -0.8898072085261667, -0.7538741809742675, -1.752540864838392, -0.7084337738505132]
     
rmse: 0.9985800886253579


In [29]:
# 1 predictor: Income Per Capita
df_test = df[["Income Per Capita","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
IPC = out[1]

#test_errs = []
#for i in range(5,50):
#    folds = k_fold(i, df_test)
#    out = cv(folds, response)
#    test_errs.append(out[1])
#plt.plot(test_errs)
#print("K-Folds = ", test_errs.index(min(test_errs)))
#print("Minimum RMSE = ", min(test_errs))

test errors: [-0.9547656982034647, -1.0848877499969352, -0.9307411163233393, -1.1729577895977723, -0.9187732404608902]
     
rmse: 1.0061933804773713


In [30]:
# 1 predictor: Alcohol Type
df_test = df[["Consolidated_Brandy","Consolidated_Gin","Consolidated_Rum",
              "Consolidated_Schnapps","Consolidated_Tequila","Consolidated_Vodka","Consolidated_Whiskey"
              ,"Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
ALC = out[1]

#test_errs = []
#for i in range(5,50):
#    folds = k_fold(i, df_test)
#    out = cv(folds, response)
#    test_errs.append(out[1])
#plt.plot(test_errs)
#print("K-Folds = ", test_errs.index(min(test_errs)))
#print("Minimum RMSE = ", min(test_errs))

test errors: [-0.962995199333141, -0.9794195993498142, -0.9315383647199771, -1.1144029128154342, -0.8574028764284324]
     
rmse: 0.9844550728851773


In [None]:
df.head()

In [None]:
data = {"Predictors": ["State Bottle Cost", "State Bottle Retail","Bottles Sold Per Capita",
                       "Pack", "Alcohol Expense Per Capita","Population","Income Per Capita","Alcohol Type"],
       "RMSE": [SBC, SBR, BSPC,PCK,AEPC,PPLN,IPC,ALC],}
one_predictor = pd.DataFrame(data, columns = ["Predictors", "RMSE"])
one_predictor

# Best individual predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita

In [None]:
# 2 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
two_pr = out[1] 

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

# Next 4: State Bottle Cost, State Bottle Retail, Income Per Capita, Population equal weight 

In [None]:
# 3 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, "State Bottle Cost"
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","State Bottle Cost",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
three_pr1 = out[1]

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [None]:
# 3 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, "State Bottle Retail"
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","State Bottle Retail",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
three_pr2 = out[1]

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [None]:
# 3 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Income Per Capita
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Income Per Capita",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [None]:
# 3 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Population
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Population",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [None]:
# 3 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Alcohol Type
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Consolidated_Brandy","Consolidated_Gin",
              "Consolidated_Rum","Consolidated_Schnapps","Consolidated_Tequila",
              "Consolidated_Vodka","Consolidated_Whiskey", "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [None]:
# 3 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Pack
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Pack", "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
three_pr3 = out[1]

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

# Best 3 Predictor Model:  Bottles Sold Per Capita * Alcohol Expense Per Capita * Pack

In [None]:
# 4 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Pack, Population
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita", "Pack","Population",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [None]:
# 4 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Pack, State Bottle Retail
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita", "Pack","State Bottle Retail",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
four_pr1 = out[1]

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [None]:
# 4 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Pack, Income Per Capita
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Pack", "Income Per Capita",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [None]:
# 4 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Pack, State Bottle Cost
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Pack", "State Bottle Cost",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
four_pr2 = out[1]

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [None]:
# 4 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Pack, Alcohol Type
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Pack", "Consolidated_Brandy",
              "Consolidated_Gin", "Consolidated_Rum","Consolidated_Schnapps","Consolidated_Tequila",
              "Consolidated_Vodka","Consolidated_Whiskey", "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

# Best 4 Predictor Model:  Bottles Sold Per Capita * Alcohol Expense Per Capita * Pack * State Bottle Cost

In [None]:
# 5 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Pack, State Bottle Cost, State Bottle Retail
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Pack",
              "State Bottle Retail","State Bottle Cost",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
five_pr1 = out[1]

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [None]:
# 5 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Pack, State Bottle Cost, Income Per capita
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Pack", 
              "State Bottle Cost","Income Per Capita",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
five_pr2 = out[1]

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [None]:
# 5 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Pack, State Bottle Cost, Alcohol Type
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Pack", 
              "State Bottle Cost","Consolidated_Brandy","Consolidated_Gin", "Consolidated_Rum",
              "Consolidated_Schnapps","Consolidated_Tequila","Consolidated_Vodka","Consolidated_Whiskey",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [None]:
# 5 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Pack, State Bottle Cost, Population
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Pack", 
              "State Bottle Cost","Population",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

# Best 5 Predictor Model:  Bottles Sold Per Capita * Alcohol Expense Per Capita * Income Per Capita * State Bottle Cost * Pack

In [None]:
# 6 predictors: State Bottle Cost, State Bottle Retail, Pack,
#             Bottles Sold Per Capita, Alcohol Expense Per Capita, Income Per Capita

df_test = df[["State Bottle Cost","State Bottle Retail","Pack","Bottles Sold Per Capita",
              "Alcohol Expense Per Capita","Income Per Capita","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
six_pr1 = out[1]

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [None]:
# 6 predictors: State Bottle Cost, Population, Pack,
#             Bottles Sold Per Capita, Alcohol Expense Per Capita, Income Per Capita

df_test = df[["State Bottle Cost","Income Per Capita","Pack","Bottles Sold Per Capita",
              "Alcohol Expense Per Capita","Population","Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [None]:
# 6 predictors: State Bottle Cost, Alcohol Type, Pack,
#             Bottles Sold Per Capita, Alcohol Expense Per Capita, Income Per Capita

df_test = df[["State Bottle Cost","Income Per Capita","Pack","Bottles Sold Per Capita",
              "Alcohol Expense Per Capita","Consolidated_Brandy","Consolidated_Gin", "Consolidated_Rum",
              "Consolidated_Schnapps","Consolidated_Tequila","Consolidated_Vodka","Consolidated_Whiskey"
              ,"Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
six_pr2 = out[1]

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

# Best 6 Predictor Model:  Bottles Sold Per Capita * Alcohol Expense Per Capita * Income Per Capita * State Bottle Cost * Pack * State Bottle Retail

In [None]:
# 7 predictors: State Bottle Cost, State Bottle Retail, Alcohol Type, Pack,
#             Bottles Sold Per Capita, Alcohol Expense Per Capita, Income Per Capita

df_test = df[["State Bottle Cost","State Bottle Retail","Pack","Bottles Sold Per Capita",
              "Alcohol Expense Per Capita","Income Per Capita","Consolidated_Brandy","Consolidated_Gin", "Consolidated_Rum",
              "Consolidated_Schnapps","Consolidated_Tequila","Consolidated_Vodka","Consolidated_Whiskey"
              ,"Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
seven_pr1 = out[1]

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

In [None]:
# 7 predictors: State Bottle Cost, State Bottle Retail, Population, Pack,
#             Bottles Sold Per Capita, Alcohol Expense Per Capita, Income Per Capita

df_test = df[["State Bottle Cost","State Bottle Retail","Pack","Bottles Sold Per Capita",
              "Alcohol Expense Per Capita","Income Per Capita","Population",
              "Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

#  Best 7 Predictor Model:  Bottles Sold Per Capita * Alcohol Expense Per Capita * Income Per Capita * State Bottle Cost * Pack * State Bottle Retail * Alcohol Type

In [None]:
# 8 predictors: State Bottle Cost, State Bottle Retail, Alcohol Type, Pack,
#             Bottles Sold Per Capita, Alcohol Expense Per Capita, Income Per Capita, Population

df_test = df[["State Bottle Cost","State Bottle Retail","Pack","Bottles Sold Per Capita","Population",
              "Alcohol Expense Per Capita","Income Per Capita","Consolidated_Brandy","Consolidated_Gin", "Consolidated_Rum",
              "Consolidated_Schnapps","Consolidated_Tequila","Consolidated_Vodka","Consolidated_Whiskey"
              ,"Volume Sold (Gallons) Per Capita"]]
df_test = standardize(df_test)
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
eight_pr1 = out[1]

test_errs = []
for i in range(5,50):
    folds = k_fold(i, df_test)
    out = cv(folds, response)
    test_errs.append(out[1])
plt.plot(test_errs)
print("K-Folds = ", test_errs.index(min(test_errs)))
print("Minimum RMSE = ", min(test_errs))

# Best Models 

In [None]:
data = {"Predictors": ["State Bottle Cost", "State Bottle Retail","Bottles Sold Per Capita",
                       "Pack", "Alcohol Expense Per Capita","Population","Income Per Capita", "Alcohol Type"],
       "RMSE": [SBC, SBR, BSPC,PCK,AEPC,PPLN,IPC,ALC],}
one_predictor = pd.DataFrame(data, columns = ["Predictors", "RMSE"])
one_predictor

In [None]:
data = {"Test":["Test 1", "Test 2", "Test 3", "Test 4", "Test 5", "Test 6", "Test 7", "Test 8",
               "Test 9","Test 10", "Test 11","Test 12"],
       "RMSE": [two_pr, three_pr1, three_pr2, three_pr3, four_pr1, four_pr2, five_pr1, 
                five_pr2, six_pr1, six_pr2, seven_pr1, eight_pr1],
       "Predictors": ["BSPC*AEPC",
                      "BSPC*AEPC*SBC",
                      "BSPC*AEPC*SBR",
                      "BSPC*AEPC*PCK",
                      "BSPC*AEPC*PCK*SBR",
                      "BSPC*AEPC*PCK*SBC",
                      "BSPC*AEPC*PCK*SBC*SBR",
                      "BSPC*AEPC*PCK*SBC*IPC",
                      "BSPC*AEPC*PCK*SBC*IPC*SBR",
                      "BSPC*AEPC*PCK*SBC*IPC*ALC",
                      "BSPC*AEPC*PCK*SBC*IPC*SBR*ALC",
                      "BSPC*AEPC*PCK*SBC*IPC*SBR*ALC*PPLN"],}
models = pd.DataFrame(data, columns = ["Test", "RMSE", "Predictors"])
models

In [None]:
data = {"Predictor": ["BSPC","AEPC","SBC","PCK","PPLN","SBR","IPC","ALC"],
       "Mapping": ["Bottles Sold Per Capita", "Alcohol Expense Per Capita", "State Bottle Cost","Pack",
                  "Population", "State Bottle Retail", "Income Per Capita","Alcohol Type"],}
mapping = pd.DataFrame(data, columns = ["Predictor", "Mapping"])
mapping