In [14]:
# a regularização é uma técnica para melhorar o desempenho em dados de teste
# especificamente, ela busca evitar o overfitting

import numpy as np
import pandas as pd
from patsy import dmatrices
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

In [2]:
acs = pd.read_csv("acs_ny.csv")
acs.columns

Index(['Acres', 'FamilyIncome', 'FamilyType', 'NumBedrooms', 'NumChildren',
       'NumPeople', 'NumRooms', 'NumUnits', 'NumVehicles', 'NumWorkers',
       'OwnRent', 'YearBuilt', 'HouseCosts', 'ElectricBill', 'FoodStamp',
       'HeatingFuel', 'Insurance', 'Language'],
      dtype='object')

In [4]:
response, predictors = dmatrices("""FamilyIncome ~ NumBedrooms + NumChildren + NumPeople + NumRooms +
    NumUnits + NumVehicles + NumWorkers + OwnRent + YearBuilt + ElectricBill + FoodStamp + 
    HeatingFuel + Insurance + Language""", data = acs)

X_train, X_test, y_train, y_test = train_test_split(predictors, response, random_state = 0)

lr = LinearRegression(normalize = True).fit(X_train, y_train)

model_coefs = pd.DataFrame(list(zip(predictors.design_info.column_names, 
                                   lr.coef_[0])),
                          columns = ["variable", "coef_lr"])

model_coefs

Unnamed: 0,variable,coef_lr
0,Intercept,3.52266e-11
1,NumUnits[T.Single attached],31356.46
2,NumUnits[T.Single detached],24183.68
3,OwnRent[T.Outright],28391.86
4,OwnRent[T.Rented],7229.586
5,YearBuilt[T.1940-1949],12921.69
6,YearBuilt[T.1950-1959],20577.93
7,YearBuilt[T.1960-1969],17648.35
8,YearBuilt[T.1970-1979],17568.81
9,YearBuilt[T.1980-1989],25525.66


In [5]:
lr.score(X_train, y_train)

0.2726140465638568

In [6]:
lr.score(X_test, y_test)

0.26976979568488124

In [8]:
# regressão LASSO (Least Absolute Shrinkage and Selection Operator; regularização L1)
# alguns coeficientes se tornarão 0 e serão descartados
lasso = Lasso(normalize = True, random_state = 0).fit(X_test, y_test)

coefs_lasso = pd.DataFrame(list(zip(predictors.design_info.column_names, lasso.coef_)),
                          columns = ["variable", "coef_lasso"])

model_coefs = pd.merge(model_coefs, coefs_lasso, on = "variable")
model_coefs

Unnamed: 0,variable,coef_lr,coef_lasso
0,Intercept,3.52266e-11,0.0
1,NumUnits[T.Single attached],31356.46,23847.097905
2,NumUnits[T.Single detached],24183.68,20278.620009
3,OwnRent[T.Outright],28391.86,30153.611697
4,OwnRent[T.Rented],7229.586,1440.140884
5,YearBuilt[T.1940-1949],12921.69,-6382.312453
6,YearBuilt[T.1950-1959],20577.93,-905.14203
7,YearBuilt[T.1960-1969],17648.35,-0.0
8,YearBuilt[T.1970-1979],17568.81,-1579.827129
9,YearBuilt[T.1980-1989],25525.66,7854.066748


In [9]:
lasso.score(X_train, y_train)

0.26670104659430227

In [11]:
lasso.score(X_test, y_test)

0.275062046386053

In [13]:
# regressão de ridge (regularização L2)
ridge = Ridge(normalize = True, random_state = 0).fit(X_train, y_train)

coefs_ridge = pd.DataFrame(list(zip(predictors.design_info.column_names, ridge.coef_[0])),
                          columns = ["variable", "coef_ridge"])

model_coefs = pd.merge(model_coefs, coefs_ridge, on = "variable")
model_coefs

Unnamed: 0,variable,coef_lr,coef_lasso,coef_ridge
0,Intercept,3.52266e-11,0.0,0.0
1,NumUnits[T.Single attached],31356.46,23847.097905,4571.129321
2,NumUnits[T.Single detached],24183.68,20278.620009,4514.956813
3,OwnRent[T.Outright],28391.86,30153.611697,10674.890982
4,OwnRent[T.Rented],7229.586,1440.140884,-10180.631863
5,YearBuilt[T.1940-1949],12921.69,-6382.312453,-3672.096659
6,YearBuilt[T.1950-1959],20577.93,-905.14203,1221.61602
7,YearBuilt[T.1960-1969],17648.35,-0.0,-15.801437
8,YearBuilt[T.1970-1979],17568.81,-1579.827129,-1868.746915
9,YearBuilt[T.1980-1989],25525.66,7854.066748,2664.343363


In [15]:
ridge.score(X_train, y_train)

0.22808926982778677

In [16]:
ridge.score(X_test, y_test)

0.2325127754670412

In [17]:
# rede elástica (elastic net), combina as técnicas de ridge e LASSO
en = ElasticNet(random_state = 42).fit(X_train, y_train)

coefs_en = pd.DataFrame(list(zip(predictors.design_info.column_names, en.coef_)),
                       columns = ["variable", "coef_en"])

model_coefs = pd.merge(model_coefs, coefs_en, on = "variable")
model_coefs

Unnamed: 0,variable,coef_lr,coef_lasso,coef_ridge,coef_en
0,Intercept,3.52266e-11,0.0,0.0,0.0
1,NumUnits[T.Single attached],31356.46,23847.097905,4571.129321,1342.291706
2,NumUnits[T.Single detached],24183.68,20278.620009,4514.956813,168.728479
3,OwnRent[T.Outright],28391.86,30153.611697,10674.890982,445.533238
4,OwnRent[T.Rented],7229.586,1440.140884,-10180.631863,-600.673747
5,YearBuilt[T.1940-1949],12921.69,-6382.312453,-3672.096659,-794.239494
6,YearBuilt[T.1950-1959],20577.93,-905.14203,1221.61602,513.289101
7,YearBuilt[T.1960-1969],17648.35,-0.0,-15.801437,-275.5762
8,YearBuilt[T.1970-1979],17568.81,-1579.827129,-1868.746915,-574.365605
9,YearBuilt[T.1980-1989],25525.66,7854.066748,2664.343363,708.813588
