In [2]:
import pandas as pd
import numpy as np
from numpy.linalg import inv

In [3]:
# linear regression methodology
class LR:
    
    def fit(self, X_train, y_train):
        # create vector of ones...
        ones = np.ones(shape=len(X_train))[..., None]
        #...and add to feature matrix
        X = np.concatenate((ones, X_train), 1)
        #calculate coefficients using closed-form solution
        self.coeffs = inv(X.transpose().dot(X)).dot(X.transpose()).dot(y_train)
        
    def predict(self, X_test):
        ones = np.ones(shape=len(X_test))[..., None]
        X_test = np.concatenate((ones, X_test), 1)
        y_hat = X_test.dot(self.coeffs)
        return y_hat


In [4]:
# sklear linear regression example
from sklearn.linear_model import LinearRegression

data_dir = "https://dlsun.github.io/pods/data/"
bordeaux_df = pd.read_csv(data_dir + "bordeaux.csv",
                          index_col="year")

bordeaux_train = bordeaux_df.loc[:1980].copy()
bordeaux_test = bordeaux_df.loc[1980:].copy()

X_train = bordeaux_train[["age"]]
X_test = bordeaux_test[["age"]]
y_train = bordeaux_train["price"]

model = LinearRegression()
model.fit(X=X_train, y=y_train)
model.predict(X=X_test)

array([13.5724999 , 12.41648163, 11.26046336, 10.1044451 ,  8.94842683,
        7.79240856,  6.6363903 ,  5.48037203,  4.32435376,  3.1683355 ,
        2.01231723,  0.85629897])

In [5]:
# our linear regression example on same data
model = LR()
model.fit(X_train, y_train)
model.predict(X_test)

array([13.5724999 , 12.41648163, 11.26046336, 10.1044451 ,  8.94842683,
        7.79240856,  6.6363903 ,  5.48037203,  4.32435376,  3.1683355 ,
        2.01231723,  0.85629897])

In [6]:
# cross validation methodology
def k_fold(k, df):
    n = len(df)
    cut = int(n/k)
    folds = []
    start = 0
    end = cut
    for i in range(0, k):
        fold = df[start: end]
        folds.append(fold)
        start += int(n/k)
        end += int(n/k)
    return folds

def mse(actual, predicted):
    return -(((actual - predicted)**2).mean())

def cv(folds, response):
    test_errors = []
    for i in range(0, len(folds)):
        X_train = pd.DataFrame()
        Y_train = pd.Series(dtype=float)
        for j in range(0, len(folds)):
            if i == j:
                X_test = folds[j].drop([response], axis=1)
                Y_test = folds[j][response]
            if i != j:
                X_train = X_train.append(folds[j].drop([response], axis=1))
                Y_train = Y_train.append(folds[j][response])
    
        model = LR()
        model.fit(X_train, Y_train.transpose())
        Y_pred = pd.DataFrame()
        Y_pred = model.predict(X_test)
        error = mse(Y_test, Y_pred) 
        test_errors.append(error)
        rmse = np.sqrt(-sum(test_errors)/len(folds))
    return test_errors, rmse

In [7]:
# sklearn cross validation example
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

data_dir = "https://dlsun.github.io/pods/data/"
bordeaux_df = pd.read_csv(data_dir + "bordeaux.csv",
                          index_col="year")

bordeaux_train = bordeaux_df.loc[:1980].copy()
bordeaux_train["log(price)"] = np.log(bordeaux_train["price"])
bordeaux_train.drop(["price"], axis=1, inplace=True)

model = LinearRegression()
scores = cross_val_score(model, 
                         X=bordeaux_train[["win", "summer"]],
                         y=bordeaux_train["log(price)"],
                         scoring="neg_mean_squared_error",
                         cv=2)

print(scores)
print(np.sqrt(-scores).mean())

[-0.29898833 -0.27833086]
0.5371844859149083


In [8]:
# our cross validation on same data
bordeaux_train2 = bordeaux_train[["win", "summer", "log(price)"]]
folds = k_fold(2, bordeaux_train2)
errs, rmse = cv(folds, "log(price)")
print(errs)
print(rmse)


[-0.3261536583404905, -0.28766838705119696]
0.5539955078300217


In [9]:
# Standardizing the Dataframe

def standardize(df): 
    #standardize only quantitative variables
    df_st = ((df.select_dtypes(float) - df.select_dtypes(float).mean()) / df.select_dtypes(float).std()) 

    #join the standardized quantites back with original df 
    df_st = df.select_dtypes(exclude=float).join(df_st)
    return df_st 

In [43]:
df = pd.read_csv("iowa_month_county.csv") 
df.set_index(["County", "Month-Year"], inplace=True)

halfway = len(df)//2

df_train = df.iloc[:halfway].copy()
df_test = df.iloc[halfway:].copy()

In [44]:
y_train = df_train["Volume Sold (Gallons) Per Capita"]
X_train = df_train.drop(["Volume Sold (Gallons) Per Capita"], axis=1)

X_test = df_test.drop(["Volume Sold (Gallons) Per Capita"], axis=1)

X_train.corr()
# model = LR()
# model.fit(X_train, y_train)
# model.predict(X_test)

Unnamed: 0,Pack,State Bottle Cost,State Bottle Retail,Bottles Sold Per Capita,Alcohol Expense Per Capita,Category Name_100 PROOF VODKA,Category Name_AMERICAN ALCOHOL,Category Name_AMERICAN AMARETTO,Category Name_AMERICAN COCKTAILS,Category Name_AMERICAN DRY GINS,...,Vendor Name_Sazerac North America,Vendor Name_Shaw Ross International Importers LL,Vendor Name_Sidney Frank Importing Co.,Vendor Name_Stoli Group,Vendor Name_The Patron Spirits Company,"Vendor Name_WILLIAM GRANT AND SONS, INC.",Vendor Name_Western Spirits Beverage Co. LLC,Vendor Name_Wilson Daniels Ltd.,Population,Income Per Capita
Pack,1.000000,0.191290,0.191851,-0.071601,-0.099710,-0.033499,-0.013576,0.051362,0.031913,0.078510,...,0.023508,0.018893,-0.013576,0.018893,-0.004782,0.054258,,-0.079985,0.466042,-0.067161
State Bottle Cost,0.191290,1.000000,0.999986,-0.045797,0.011475,0.023648,-0.018042,-0.009389,-0.001689,-0.008295,...,-0.028500,0.081515,0.010211,0.060184,0.133083,0.084937,,0.036953,0.299281,-0.018499
State Bottle Retail,0.191851,0.999986,1.000000,-0.044854,0.012331,0.023489,-0.018088,-0.009440,-0.001526,-0.008488,...,-0.028673,0.081424,0.010163,0.060092,0.132914,0.084766,,0.036867,0.299179,-0.018518
Bottles Sold Per Capita,-0.071601,-0.045797,-0.044854,1.000000,0.982708,-0.000654,0.007626,-0.010214,-0.029335,-0.034124,...,0.006139,-0.009904,-0.005306,-0.011354,-0.011840,-0.021013,,-0.001178,-0.119990,0.123610
Alcohol Expense Per Capita,-0.099710,0.011475,0.012331,0.982708,1.000000,-0.011272,0.005924,-0.010317,-0.032605,-0.033786,...,-0.007918,-0.006748,-0.000383,-0.009583,0.003671,-0.016737,,0.016866,-0.107370,0.112195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Vendor Name_WILLIAM GRANT AND SONS, INC.",0.054258,0.084937,0.084766,-0.021013,-0.016737,-0.014451,-0.005857,-0.005857,-0.023160,-0.021496,...,-0.021496,-0.005857,-0.005857,-0.005857,-0.010174,1.000000,,-0.010174,0.025828,0.097945
Vendor Name_Western Spirits Beverage Co. LLC,,,,,,,,,,,...,,,,,,,,,,
Vendor Name_Wilson Daniels Ltd.,-0.079985,0.036953,0.036867,-0.001178,0.016866,-0.012497,-0.005064,-0.005064,-0.020028,-0.018588,...,-0.018588,-0.005064,-0.005064,-0.005064,-0.008798,-0.010174,,1.000000,-0.069322,-0.111053
Population,0.466042,0.299281,0.299179,-0.119990,-0.107370,0.012721,-0.038267,0.077335,0.054265,0.173325,...,0.042466,0.088638,-0.013809,0.158634,-0.024200,0.025828,,-0.069322,1.000000,-0.093313


In [45]:
model_sk = LinearRegression()
model.fit(X_train, y_train)


LinAlgError: Singular matrix