In [2]:
import pandas as pd
import numpy as np
from numpy.linalg import inv
import sklearn

In [3]:
# linear regression methodology
class LR:
    
    def fit(self, X_train, y_train):
        # create vector of ones...
        ones = np.ones(shape=len(X_train))[..., None]
        #...and add to feature matrix
        X = np.concatenate((ones, X_train), 1)
        #calculate coefficients using closed-form solution
        self.coeffs = inv(X.transpose().dot(X)).dot(X.transpose()).dot(y_train)
        
    def predict(self, X_test):
        ones = np.ones(shape=len(X_test))[..., None]
        X_test = np.concatenate((ones, X_test), 1)
        y_hat = X_test.dot(self.coeffs)
        return y_hat


In [4]:
# sklear linear regression example
from sklearn.linear_model import LinearRegression

data_dir = "https://dlsun.github.io/pods/data/"
bordeaux_df = pd.read_csv(data_dir + "bordeaux.csv",
                          index_col="year")

bordeaux_train = bordeaux_df.loc[:1980].copy()
bordeaux_test = bordeaux_df.loc[1980:].copy()

X_train = bordeaux_train[["age"]]
X_test = bordeaux_test[["age"]]
y_train = bordeaux_train["price"]

model = LinearRegression()
model.fit(X=X_train, y=y_train)
model.predict(X=X_test)

array([13.5724999 , 12.41648163, 11.26046336, 10.1044451 ,  8.94842683,
        7.79240856,  6.6363903 ,  5.48037203,  4.32435376,  3.1683355 ,
        2.01231723,  0.85629897])

In [5]:
# our linear regression example on same data
model = LR()
model.fit(X_train, y_train)
model.predict(X_test)

array([13.5724999 , 12.41648163, 11.26046336, 10.1044451 ,  8.94842683,
        7.79240856,  6.6363903 ,  5.48037203,  4.32435376,  3.1683355 ,
        2.01231723,  0.85629897])

In [6]:
# cross validation methodology
def k_fold(k, df):
    n = len(df)
    cut = int(n/k)
    folds = []
    start = 0
    end = cut
    for i in range(0, k):
        fold = df[start: end]
        folds.append(fold)
        start += int(n/k)
        end += int(n/k)
    return folds

def mse(actual, predicted):
    return -(((actual - predicted)**2).mean())

def cv(folds, response):
    test_errors = []
    for i in range(0, len(folds)):
        X_train = pd.DataFrame()
        Y_train = pd.Series(dtype=float)
        for j in range(0, len(folds)):
            if i == j:
                X_test = folds[j].drop([response], axis=1)
                Y_test = folds[j][response]
            if i != j:
                X_train = X_train.append(folds[j].drop([response], axis=1))
                Y_train = Y_train.append(folds[j][response])
    
        model = LR()
        model.fit(X_train, Y_train.transpose())
        Y_pred = pd.DataFrame()
        Y_pred = model.predict(X_test)
        error = mse(Y_test, Y_pred) 
        test_errors.append(error)
        rmse = np.sqrt(-sum(test_errors)/len(folds))
    return test_errors, rmse

In [7]:
# sklearn cross validation example
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

data_dir = "https://dlsun.github.io/pods/data/"
bordeaux_df = pd.read_csv(data_dir + "bordeaux.csv",
                          index_col="year")

bordeaux_train = bordeaux_df.loc[:1980].copy()
bordeaux_train["log(price)"] = np.log(bordeaux_train["price"])
bordeaux_train.drop(["price"], axis=1, inplace=True)

model = LinearRegression()
scores = cross_val_score(model, 
                         X=bordeaux_train[["win", "summer"]],
                         y=bordeaux_train["log(price)"],
                         scoring="neg_mean_squared_error",
                         cv=2)

print(scores)
print(np.sqrt(-scores).mean())

[-0.29898833 -0.27833086]
0.5371844859149082


In [8]:
# our cross validation on same data
bordeaux_train2 = bordeaux_train[["win", "summer", "log(price)"]]
folds = k_fold(2, bordeaux_train2)
errs, rmse = cv(folds, "log(price)")
print(errs)
print(rmse)


[-0.32615365834155635, -0.28766838705142556]
0.5539955078306059


In [9]:
# Standardizing the Dataframe

def standardize(df): 
    #standardize only quantitative variables
    df_st = ((df.select_dtypes(float) - df.select_dtypes(float).mean()) / df.select_dtypes(float).std()) 

    #join the standardized quantites back with original df 
    df_st = df.select_dtypes(exclude=float).join(df_st)
    return df_st 

In [10]:
df = pd.read_csv("iowa_month_county.csv") 
df.set_index(["County", "Month-Year"], inplace=True)

halfway = len(df)//2

df_train = df.iloc[:halfway].copy()
df_test = df.iloc[halfway:].copy()

In [11]:
df_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pack,State Bottle Cost,State Bottle Retail,Bottles Sold Per Capita,Alcohol Expense Per Capita,Volume Sold (Gallons) Per Capita,Category Name_100 PROOF VODKA,Category Name_AMERICAN ALCOHOL,Category Name_AMERICAN AMARETTO,Category Name_AMERICAN COCKTAILS,...,Vendor Name_Sazerac North America,Vendor Name_Shaw Ross International Importers LL,Vendor Name_Sidney Frank Importing Co.,Vendor Name_Stoli Group,Vendor Name_The Patron Spirits Company,"Vendor Name_WILLIAM GRANT AND SONS, INC.",Vendor Name_Western Spirits Beverage Co. LLC,Vendor Name_Wilson Daniels Ltd.,Population,Income Per Capita
County,Month-Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Adair,01-2013,12,11.0,16.49,0.000406,0.006697,8e-05,0,0,0,0,...,0,0,0,0,0,0,0,0,7387,45559
Adair,09-2012,24,4.3,6.45,0.000536,0.003455,5.4e-05,0,0,0,0,...,0,0,0,0,0,0,0,0,7468,42093
Adair,12-2012,6,10.9,16.35,0.000402,0.006568,0.000186,0,0,0,0,...,0,0,0,0,0,0,0,0,7468,42093
Allamakee,03-2015,12,15.24,22.86,0.000144,0.003295,3.8e-05,0,0,0,0,...,0,0,0,0,0,0,0,0,13874,43992
Allamakee,06-2012,24,4.28,6.42,0.000141,0.000907,1.4e-05,0,0,0,0,...,0,0,0,0,0,0,0,0,14149,38814


In [12]:
y_train = df_train[["Volume Sold (Gallons) Per Capita"]]
x_train = df_train[["State Bottle Cost"]]
x_test = df_train[["State Bottle Cost"]]
model_sk = LinearRegression()
model.fit(x_train, y_train)
model.predict(y_train)[0:5]

array([[0.00018781],
       [0.00018781],
       [0.00018781],
       [0.00018781],
       [0.00018781]])

# Testing Models

In [13]:
# 1 predictor: State Bottle Cost
df_test = df[["State Bottle Cost","Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
SBC = out[1]

test errors: [-2.4585635714417113e-08, -2.1904064573763683e-06, -1.783517370956287e-08, -3.635177631879165e-08, -1.3814078635914039e-08]
     
rmse: 0.0006757208183495687


In [14]:
# 1 predictor: State Bottle Retail
df_test = df[["State Bottle Retail","Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
SBR = out[1]

test errors: [-2.4548318282995075e-08, -2.1904298674775862e-06, -1.7831916405602842e-08, -3.6354815621143634e-08, -1.3793618063155915e-08]
     
rmse: 0.0006757156999582714


In [15]:
# 1 predictor: Bottles Sold Per Capita
df_test = df[["Bottles Sold Per Capita","Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
BSPC = out[1]

test errors: [-7.808330087960089e-09, -2.4471474734285886e-07, -4.828666584542065e-09, -5.786802086523e-09, -1.0402344440046882e-08]
     
rmse: 0.00023389779415032149


In [16]:
# 1 predictor: Pack
df_test = df[["Pack","Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
PCK = out[1]

test errors: [-2.3772477323122852e-08, -2.1857416129136227e-06, -1.8460084815705892e-08, -4.437326750101643e-08, -1.496111493035482e-08]
     
rmse: 0.0006763591586551959


In [17]:
# 1 predictor: Alcohol Expense Per Capita
df_test = df[["Alcohol Expense Per Capita","Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
AEPC = out[1]

test errors: [-2.0619749830832755e-08, -3.297525003983162e-07, -6.339368888568657e-09, -1.0441248576174266e-08, -5.581599163366202e-09]
     
rmse: 0.0002730327697758121


In [18]:
# 1 predictor: Population
df_test = df[["Population","Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
PPLN = out[1]

test errors: [-2.5370800392228448e-08, -2.182281218370422e-06, -1.654240595398054e-08, -6.375504864680642e-08, -1.30968654492098e-08]
     
rmse: 0.0006783872550118622


In [19]:
# 1 predictor: Income Per Capita
df_test = df[["Income Per Capita","Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
IPC = out[1]

test errors: [-2.5790768644107747e-08, -2.182694672763314e-06, -2.3484836683978343e-08, -3.977371271874168e-08, -1.315292251841389e-08]
     
rmse: 0.0006760025019670497


In [20]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pack,State Bottle Cost,State Bottle Retail,Bottles Sold Per Capita,Alcohol Expense Per Capita,Volume Sold (Gallons) Per Capita,Category Name_100 PROOF VODKA,Category Name_AMERICAN ALCOHOL,Category Name_AMERICAN AMARETTO,Category Name_AMERICAN COCKTAILS,...,Vendor Name_Sazerac North America,Vendor Name_Shaw Ross International Importers LL,Vendor Name_Sidney Frank Importing Co.,Vendor Name_Stoli Group,Vendor Name_The Patron Spirits Company,"Vendor Name_WILLIAM GRANT AND SONS, INC.",Vendor Name_Western Spirits Beverage Co. LLC,Vendor Name_Wilson Daniels Ltd.,Population,Income Per Capita
County,Month-Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Adair,01-2013,12,11.0,16.49,0.000406,0.006697,8e-05,0,0,0,0,...,0,0,0,0,0,0,0,0,7387,45559
Adair,09-2012,24,4.3,6.45,0.000536,0.003455,5.4e-05,0,0,0,0,...,0,0,0,0,0,0,0,0,7468,42093
Adair,12-2012,6,10.9,16.35,0.000402,0.006568,0.000186,0,0,0,0,...,0,0,0,0,0,0,0,0,7468,42093
Allamakee,03-2015,12,15.24,22.86,0.000144,0.003295,3.8e-05,0,0,0,0,...,0,0,0,0,0,0,0,0,13874,43992
Allamakee,06-2012,24,4.28,6.42,0.000141,0.000907,1.4e-05,0,0,0,0,...,0,0,0,0,0,0,0,0,14149,38814


In [21]:
data = {"Predictors": ["State Bottle Cost", "State Bottle Retail","Bottles Sold Per Capita",
                       "Pack", "Alcohol Expense Per Capita","Population","Income Per Capita"],
       "RMSE": [SBC, SBR, BSPC,PCK,AEPC,PPLN,IPC],}
one_predictor = pd.DataFrame(data, columns = ["Predictors", "RMSE"])
one_predictor

Unnamed: 0,Predictors,RMSE
0,State Bottle Cost,0.000676
1,State Bottle Retail,0.000676
2,Bottles Sold Per Capita,0.000234
3,Pack,0.000676
4,Alcohol Expense Per Capita,0.000273
5,Population,0.000678
6,Income Per Capita,0.000676


# Best individual predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita

In [22]:
# 2 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita",
              "Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
two_pr = out[1] 

test errors: [-8.134054386011844e-09, -1.877217503192073e-07, -3.4211795644363285e-09, -6.1266621759954336e-09, -6.154575867312535e-09]
     
rmse: 0.00020569794472136247


# Next 4: State Bottle Cost, State Bottle Retail, Income Per Capita, Population equal weight 

In [23]:
# 3 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, "State Bottle Cost"
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","State Bottle Cost",
              "Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
three_pr1 = out[1]

test errors: [-8.139459233228457e-09, -1.8759422625687222e-07, -3.427992003765268e-09, -6.2823441687316515e-09, -6.26399320809865e-09]
     
rmse: 0.00020577075344698344


In [24]:
# 3 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, "State Bottle Retail"
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","State Bottle Retail",
              "Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test errors: [-8.139606303967927e-09, -1.8759190517437608e-07, -3.428337557647427e-09, -6.281544906643993e-09, -6.264626113555112e-09]
     
rmse: 0.00020576978400931004


In [25]:
# 3 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Income Per Capita
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Income Per Capita",
              "Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test errors: [-8.12856907892253e-09, -1.8943616840685216e-07, -3.4609673677985386e-09, -6.122286619897543e-09, -6.170370867085358e-09]
     
rmse: 0.00020655186386985528


In [26]:
# 3 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Population
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Population",
              "Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
three_pr2 = out[1]

test errors: [-8.07990185882108e-09, -1.8018997851500133e-07, -3.12421086333651e-09, -8.416478068618714e-09, -5.939691691962513e-09]
     
rmse: 0.00020285475641342016


# Best 3 Predictor Model:  Bottles Sold Per Capita * Alcohol Expense Per Capita * Population

In [27]:
# 4 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Population, State Bottle Cost
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Population", "State Bottle Cost",
              "Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
four_pr1 = out[1]

test errors: [-8.25190431010812e-09, -1.787546881294056e-07, -3.088337426698321e-09, -8.03959311739174e-09, -5.772891657561333e-09]
     
rmse: 0.0002019442569825471


In [28]:
# 4 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Population, State Bottle Retail
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Population", "State Bottle Retail",
              "Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
four_pr2 = out[1]

test errors: [-8.251176288550117e-09, -1.7875351260142967e-07, -3.0878612387894417e-09, -8.03622357897488e-09, -5.773326592553913e-09]
     
rmse: 0.00020194162537738377


In [29]:
# 4 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Population, State Bottle Cost
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Population", "Income Per Capita",
              "Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])

test errors: [-8.066975134424308e-09, -1.826467044023561e-07, -3.1418009543159608e-09, -8.401079068482912e-09, -5.9242421612341765e-09]
     
rmse: 0.0002040494066253629


# Best 4 Predictor Model:  Bottles Sold Per Capita * Alcohol Expense Per Capita * Population * State Bottle Retail

In [30]:
# 5 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Population, State Bottle Retail, State Bottle Cost
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Population",
              "State Bottle Retail","State Bottle Cost",
              "Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
five_pr1 = out[1]

test errors: [-8.254530681907356e-09, -1.7879349447574e-07, -3.109229579794035e-09, -8.595027686370445e-09, -5.771162436124802e-09]
     
rmse: 0.00020224907656646378


In [33]:
# 5 predictors: Bottles Sold Per Capita, Alcohol Expense Per Capita, Population, State Bottle Retail, Income Per capita
df_test = df[["Bottles Sold Per Capita","Alcohol Expense Per Capita","Population", 
              "State Bottle Retail","Income Per Capita",
              "Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
five_pr2 = out[1]

test errors: [-8.230429012695296e-09, -1.811998655723021e-07, -3.100039045728608e-09, -8.023162474022284e-09, -5.763369734330036e-09]
     
rmse: 0.00020313387991129314


In [32]:
# 6 predictors: State Bottle Cost, State Bottle Retail, Pack,
#             Bottles Sold Per Capita, Alcohol Expense Per Capita, Population

df_test = df[["State Bottle Cost","State Bottle Retail","Pack","Bottles Sold Per Capita",
              "Alcohol Expense Per Capita","Population","Volume Sold (Gallons) Per Capita"]]
response = "Volume Sold (Gallons) Per Capita"
folds = k_fold(5,df_test)
out = cv(folds, response)
print("test errors:", out[0])
print("     ")
print("rmse:", out[1])
six_pr = out[1]

test errors: [-7.685231134185963e-09, -1.7233908988289865e-07, -3.1475118357743565e-09, -8.566154340777614e-09, -5.344184986210776e-09]
     
rmse: 0.00019853572584290584


# Best Models 

In [None]:
data = {"Predictors": ["State Bottle Cost", "State Bottle Retail","Bottles Sold Per Capita",
                       "Pack", "Alcohol Expense Per Capita","Population","Income Per Capita"],
       "RMSE": [SBC, SBR, BSPC,PCK,AEPC,PPLN,IPC],}
one_predictor = pd.DataFrame(data, columns = ["Predictors", "RMSE"])
one_predictor

In [38]:
data = {"Test":["Test 1", "Test 2", "Test 3", "Test 4", "Test 5", "Test 6", "Test 7", "Test 8"],
       "RMSE": [two_pr, three_pr1, three_pr2, four_pr1, four_pr2, five_pr1, five_pr2, six_pr],
       "Predictors": ["Bottles Sold Per Capita, Alcohol Expense Per Capita",
                      "Bottles Sold Per Capita, Alcohol Expense Per Capita,State Bottle Cost",
                      "Bottles Sold Per Capita, Alcohol Expense Per Capita, Population",
                      "Bottles Sold Per Capita, Alcohol Expense Per Capita, Population, State Bottle Cost",
                      "Bottles Sold Per Capita, Alcohol Expense Per Capita, Population, State Bottle Retail",
                      "Bottles Sold Per Capita, Alcohol Expense Per Capita, Population, State Bottle Retail, State Bottle Cost", 
                      "Bottles Sold Per Capita, Alcohol Expense Per Capita, Population, State Bottle Retail, Income Per Capita", 
                      "Bottles Sold Per Capita, Alcohol Expense Per Capita, Population, State Bottle Retail, State Bottle Cost, Income Per Capita"],}
models = pd.DataFrame(data, columns = ["Test", "RMSE", "Predictors"])
models

Unnamed: 0,Test,RMSE,Predictors
0,Test 1,0.000206,"Bottles Sold Per Capita, Alcohol Expense Per C..."
1,Test 2,0.000206,"Bottles Sold Per Capita, Alcohol Expense Per C..."
2,Test 3,0.000203,"Bottles Sold Per Capita, Alcohol Expense Per C..."
3,Test 4,0.000202,"Bottles Sold Per Capita, Alcohol Expense Per C..."
4,Test 5,0.000202,"Bottles Sold Per Capita, Alcohol Expense Per C..."
5,Test 6,0.000202,"Bottles Sold Per Capita, Alcohol Expense Per C..."
6,Test 7,0.000203,"Bottles Sold Per Capita, Alcohol Expense Per C..."
7,Test 8,0.000199,"Bottles Sold Per Capita, Alcohol Expense Per C..."


In [40]:
data = {"Test":["Test 1", "Test 2", "Test 3", "Test 4", "Test 5", "Test 6", "Test 7", "Test 8"],
       "RMSE": [two_pr, three_pr1, three_pr2, four_pr1, four_pr2, five_pr1, five_pr2, six_pr],
       "Predictors": ["BSPC*AEPC",
                      "BSPC*AEPC*SBC",
                      "BSPC*AEPC*PPLN",
                      "BSPC*AEPC*PPLN*SBC",
                      "BSPC*AEPC*PPLN*SBR",
                      "BSPC*AEPC*PPLN*SBR*SBC", 
                      "BSPC*AEPC*PPLN*SBR*IPC", 
                      "BSPC*AEPC*PPLN*SBR*SBC*IPC"],}
models = pd.DataFrame(data, columns = ["Test", "RMSE", "Predictors"])
models

Unnamed: 0,Test,RMSE,Predictors
0,Test 1,0.000206,BSPC*AEPC
1,Test 2,0.000206,BSPC*AEPC*SBC
2,Test 3,0.000203,BSPC*AEPC*PPLN
3,Test 4,0.000202,BSPC*AEPC*PPLN*SBC
4,Test 5,0.000202,BSPC*AEPC*PPLN*SBR
5,Test 6,0.000202,BSPC*AEPC*PPLN*SBR*SBC
6,Test 7,0.000203,BSPC*AEPC*PPLN*SBR*IPC
7,Test 8,0.000199,BSPC*AEPC*PPLN*SBR*SBC*IPC


In [41]:
data = {"Predictor": ["BSPC","AEPC","SBC","PPLN","SBR","IPC"],
       "Mapping": ["Bottles Sold Per Capita", "Alcohol Expense Per Capita", "State Bottle Cost",
                  "Population", "State Bottle Retail", "Income Per Capita"],}
mapping = pd.DataFrame(data, columns = ["Predictor", "Mapping"])
mapping

Unnamed: 0,Predictor,Mapping
0,BSPC,Bottles Sold Per Capita
1,AEPC,Alcohol Expense Per Capita
2,SBC,State Bottle Cost
3,PPLN,Population
4,SBR,State Bottle Retail
5,IPC,Income Per Capita
