In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import inv

In [2]:
df = pd.read_csv("iowa-sample.csv")
df.head()

Unnamed: 0,Invoice/Item Number,Date,Store Number,Store Name,Address,City,Zip Code,Store Location,County Number,County,...,Item Number,Item Description,Pack,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons)
0,S08096000008,10/04/2012,4641,Kum & Go #573 / SE 14th DM,5830 SE 14th ST,DES MOINES,50315,,77,Polk,...,89191,Jose Cuervo Especial Reposado Tequila Mini,12,500,11.5,17.25,1,17.25,0.5,0.13
1,S23102300041,12/20/2014,4346,Roy's Foodland,105 PEARL ST,SHELLSBURG,52332,POINT (-91.869285 42.094155),6,Benton,...,11774,Black Velvet,24,375,3.07,4.61,6,27.66,2.25,0.59
2,S14410500035,09/09/2013,3628,Wal-Mart 1528 / Cedar Rapids,2645 BLAIRS FERRY RD NE,CEDAR RAPIDS,52402,POINT (-91.680734 42.034748),57,Linn,...,43410,Captain Morgan Parrot Bay Coconut,12,750,7.49,11.23,12,134.76,9.0,2.38
3,S09427600021,12/12/2012,4708,No Frills Supermarkets #803 / Glenwo,423 SHARP ST,GLENWOOD,51534,POINT (-95.742987 41.04635),65,Mills,...,24458,Kessler Blend Whiskey,6,1750,11.01,16.52,30,495.6,52.5,13.87
4,S28446900114,10/14/2015,2594,Hy-Vee Food Store / Sioux City,4500 SERGEANT ROAD,SIOUX CITY,51106,POINT (-96.346969 42.447396),97,Woodbury,...,10550,Black Velvet Toasted Caramel,12,750,6.75,10.13,12,121.56,9.0,2.38


In [3]:
# Initial Feature Extration and Type Casting
df = pd.read_csv("iowa-sample.csv")

# DROP FEATURES THAT REPLICATE DATA
cols_to_drop = ["Invoice/Item Number", "Store Number", "Address", 
                "County Number", "Category", "Vendor Number", 
                "Volume Sold (Liters)", "Bottle Volume (ml)"]

df.drop(cols_to_drop, axis=1, inplace=True)


# TYPE MUNGING
categorical_cols = ["Store Name", "City", "Zip Code", "Store Location",
                   "County", "Category Name", "Vendor Name"]
for col in categorical_cols:
    df[col] = df[col].astype('category')

# TIME MUNGING
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df.insert(1, 'Year', df['Date'].dt.year)

df.head()

Unnamed: 0,Date,Year,Store Name,City,Zip Code,Store Location,County,Category Name,Vendor Name,Item Number,Item Description,Pack,State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Gallons)
0,2012-10-04,2012,Kum & Go #573 / SE 14th DM,DES MOINES,50315,,Polk,TEQUILA,Proximo,89191,Jose Cuervo Especial Reposado Tequila Mini,12,11.5,17.25,1,17.25,0.13
1,2014-12-20,2014,Roy's Foodland,SHELLSBURG,52332,POINT (-91.869285 42.094155),Benton,CANADIAN WHISKIES,"Constellation Wine Company, Inc.",11774,Black Velvet,24,3.07,4.61,6,27.66,0.59
2,2013-09-09,2013,Wal-Mart 1528 / Cedar Rapids,CEDAR RAPIDS,52402,POINT (-91.680734 42.034748),Linn,FLAVORED RUM,Diageo Americas,43410,Captain Morgan Parrot Bay Coconut,12,7.49,11.23,12,134.76,2.38
3,2012-12-12,2012,No Frills Supermarkets #803 / Glenwo,GLENWOOD,51534,POINT (-95.742987 41.04635),Mills,BLENDED WHISKIES,Jim Beam Brands,24458,Kessler Blend Whiskey,6,11.01,16.52,30,495.6,13.87
4,2015-10-14,2015,Hy-Vee Food Store / Sioux City,SIOUX CITY,51106,POINT (-96.346969 42.447396),Woodbury,CANADIAN WHISKIES,"Constellation Wine Company, Inc.",10550,Black Velvet Toasted Caramel,12,6.75,10.13,12,121.56,2.38


In [None]:
# Standardizing the Dataframe

def standardize(df): 
  #standardize only quantitative variables
  df_st = ((df.select_dtypes(float) - df.select_dtypes(float).mean()) / df.select_dtypes(float).std()) 

  #join the standardized quantites back with original df 
  df_st = df.select_dtypes(exclude=float).join(df_st)
  return df_st 

In [4]:
class LR:
    
    def fit(self, X_train, y_train):
        # create vector of ones...
        ones = np.ones(shape=len(X_train))[..., None]
        #...and add to feature matrix
        X = np.concatenate((ones, X_train), 1)
        #calculate coefficients using closed-form solution
        self.coeffs = inv(X.transpose().dot(X)).dot(X.transpose()).dot(y_train)
        
    def predict(self, X_test):
        ones = np.ones(shape=len(X_test))[..., None]
        X_test = np.concatenate((ones, X_test), 1)
        y_hat = X_test.dot(self.coeffs)
        return y_hat


In [5]:
from sklearn.linear_model import LinearRegression

data_dir = "https://dlsun.github.io/pods/data/"
bordeaux_df = pd.read_csv(data_dir + "bordeaux.csv",
                          index_col="year")

bordeaux_train = bordeaux_df.loc[:1980].copy()
bordeaux_test = bordeaux_df.loc[1980:].copy()

X_train = bordeaux_train[["age"]]
X_test = bordeaux_test[["age"]]
y_train = bordeaux_train["price"]

model = LinearRegression()
model.fit(X=X_train, y=y_train)
model.predict(X=X_test)

array([13.5724999 , 12.41648163, 11.26046336, 10.1044451 ,  8.94842683,
        7.79240856,  6.6363903 ,  5.48037203,  4.32435376,  3.1683355 ,
        2.01231723,  0.85629897])

In [6]:
model = LR()
model.fit(X_train, y_train)
model.predict(X_test)

array([13.5724999 , 12.41648163, 11.26046336, 10.1044451 ,  8.94842683,
        7.79240856,  6.6363903 ,  5.48037203,  4.32435376,  3.1683355 ,
        2.01231723,  0.85629897])

In [7]:
# EXTERNAL DATASET = IOWA POPULATION PER (COUNTY, YEAR)
df2 = pd.read_excel("iowa_county.xlsx")

# DATA CLEANING
df2 = df2.iloc[4:103]
df2.drop(["Unnamed: 1", "Unnamed: 2"], axis=1, inplace=True)
df2.rename(columns=
{"table with row headers in column A " +
 "and column headers in rows 3 through " +
 "4 (leading dots indicate sub-parts)": "County",
                    "Unnamed: 3": "2010", 
                    "Unnamed: 4": "2011", 
                    "Unnamed: 5": "2012", 
                    "Unnamed: 6": "2013", 
                    "Unnamed: 7": "2014",
                    "Unnamed: 8": "2015",
                    "Unnamed: 9": "2016",
                    "Unnamed: 10": "2017",
                    "Unnamed: 11": "2018",
                    "Unnamed: 12": "2019"}, inplace=True)
df2.reset_index(drop=True, inplace=True)
     
df2["County"] = df2["County"].str.replace(".", "")
df2["County"] = df2["County"].str.replace(" County, Iowa", "")

# CONVERT DF TO STANDARD FORM
df2 = df2.melt(var_name="Year", value_name="Population", id_vars=['County'])

# TYPE CASTING
df2["County"] = df2["County"].astype('category')
df2["Year"] = df2["Year"].astype('int')
df2["Population"] = df2["Population"].astype('int')
df2.head()

Unnamed: 0,County,Year,Population
0,Adair,2010,7679
1,Adams,2010,4023
2,Allamakee,2010,14378
3,Appanoose,2010,12856
4,Audubon,2010,6098


In [8]:
# COMBINE IOWA LIQUOR WITH IOWA COUNTY TO GET POPULATION VALUES
df3 = pd.merge(df, df2, on=["County", "Year"])

# PER CAPITA NORMALIZATION
df3["Bottles Sold Per Capita"] = df3["Bottles Sold"]/df3["Population"]
df3["Alcohol Expense Per Capita"] = df3["Sale (Dollars)"]/df3["Population"]
df3["Volume Sold (Gallons) Per Capita"] = df3["Volume Sold (Gallons)"]/df3["Population"]

cols_to_drop = ["Bottles Sold", "Sale (Dollars)", "Volume Sold (Gallons)"]
df3.drop(cols_to_drop, axis=1, inplace=True)

df3.head()

Unnamed: 0,Date,Year,Store Name,City,Zip Code,Store Location,County,Category Name,Vendor Name,Item Number,Item Description,Pack,State Bottle Cost,State Bottle Retail,Population,Bottles Sold Per Capita,Alcohol Expense Per Capita,Volume Sold (Gallons) Per Capita
0,2012-10-04,2012,Kum & Go #573 / SE 14th DM,DES MOINES,50315,,Polk,TEQUILA,Proximo,89191,Jose Cuervo Especial Reposado Tequila Mini,12,11.5,17.25,444671,2e-06,3.9e-05,2.92351e-07
1,2012-11-15,2012,"Fast Mart 2, LLC",WEST DES MOINES,50265,POINT (-93.741511 41.580206),Polk,VODKA 80 PROOF,E AND J GALLO WINERY,36969,New Amsterdam Vodka,12,6.9,10.35,444671,7e-06,7e-05,1.326824e-06
2,2012-05-03,2012,Hy-Vee Food Store #5 / Des Moines,DES MOINES,50315,POINT (-93.602372 41.526289),Polk,SPICED RUM,Heaven Hill Brands,43026,Admiral Nelson Spiced Rum,12,5.77,8.66,444671,5.4e-05,0.000467,1.070454e-05
3,2012-08-09,2012,Fareway Stores #909 / Ankeny,ANKENY,50021,POINT (-93.59949600000002 41.702811),Polk,TENNESSEE WHISKIES,Brown-Forman Corporation,86670,Jack Daniel's Tennessee Honey,12,13.54,20.31,444671,2e-06,4.6e-05,4.497707e-07
4,2012-08-02,2012,Hy-Vee #3 / BDI / Des Moines,DES MOINES,50320,POINT (-93.596754 41.554101),Polk,DARK CREME DE CACAO,Luxco-St Louis,78456,Paramount Creme De Cacao/dark,12,4.65,6.97,444671,2.7e-05,0.000188,5.352272e-06


In [9]:
def k_fold(k):
    n = len(df)
    cut = int(n/k)
    folds = []
    start = 0
    end = cut
    for i in range(0, k):
        fold = df[start: end]
        folds.append(fold)
        start += int(n/k)
        end += int(n/k)
    return folds

folds = k_fold(5)

In [10]:
def mse(actual, predicted):
    return -(((actual - predicted)**2).mean())

In [19]:
def cv(folds, predictors, response):
    test_errors = []
    for i in range(0, len(folds)):
        X_train = pd.DataFrame()
        Y_train = pd.DataFrame()
        for j in range(0, len(folds)):
            if i == j:
                X_test = folds[j].drop([reponse], axis=1)
                Y_test = folds[j][response]
            if i != j:
                X_train = X_train.append(folds[j].drop([response], axis=1))
                Y_train = Y_train.append(folds[j][response])
    
        model = LR()
        model.fit(X_train, Y_train)
        Y_pred = pd.DataFrame()
        Y_pred = model.predict(X_test)
        error = mse(Y_test, Y_pred) 
        test_errors.append(error)
        errors = sum(test_errors)/ len(folds)
    return errors

# testing with bordeaux data set
# predictors = ["age","win"]
# response = ["summer"]
# cv(folds, predictors, response)


5
1
        Date  Year                  Store Name        City Zip Code  \
0 2012-10-04  2012  Kum & Go #573 / SE 14th DM  DES MOINES    50315   

  Store Location County Category Name Vendor Name  Item Number  \
0            NaN   Polk       TEQUILA     Proximo        89191   

                             Item Description  Pack  State Bottle Cost  \
0  Jose Cuervo Especial Reposado Tequila Mini    12               11.5   

   State Bottle Retail  Population  Bottles Sold Per Capita  \
0                17.25      444671                 0.000002   

   Alcohol Expense Per Capita  
0                    0.000039  
        Date  Year                  Store Name        City Zip Code  \
0 2012-10-04  2012  Kum & Go #573 / SE 14th DM  DES MOINES    50315   

  Store Location County Category Name Vendor Name  Item Number  \
0            NaN   Polk       TEQUILA     Proximo        89191   

                             Item Description  Pack  State Bottle Cost  \
0  Jose Cuervo Especial Reposa

In [12]:
# data_dir = "https://dlsun.github.io/pods/data/"
# bordeaux_df = pd.read_csv(data_dir + "bordeaux.csv",
#                           index_col="year")
# bordeaux_train = bordeaux_df.loc[:1980].copy()
# bordeaux_train["log(price)"] = np.log(bordeaux_train["price"])