In [1]:
from importlib import reload

import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, SGDRegressor, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from implementacao import regressao_linear_gd
from implementacao import metrics

In [2]:
# Re-execute the project-local modules so that edits made to them on disk
# are picked up without restarting the kernel. Nothing is displayed.
for _local_module in (regressao_linear_gd, metrics):
    reload(_local_module)

In [3]:
# Column labels supplied explicitly because the file is read without a header
# row (the first parsed row is already data). MEDV is the column used as the
# regression target further down.
# NOTE(review): file name and columns look like the Boston housing dataset — confirm provenance.
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

# Fields are separated by runs of whitespace. `sep=r'\s+'` is the documented
# equivalent of `delim_whitespace=True`, which is deprecated since pandas 2.2
# and emits a FutureWarning.
data = pd.read_csv('housing.data', sep=r'\s+', names=names)

In [4]:
# Preview the first rows to check that the columns were parsed and labelled as expected.
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [5]:
# Count missing values per column; all zeros means no imputation is required.
data.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [6]:
# Summarise row count, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null int64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null int64
TAX        506 non-null float64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
MEDV       506 non-null float64
dtypes: float64(12), int64(2)
memory usage: 55.4 KB


In [7]:
# CHAS and RAD are the only integer-typed columns; cast them to float64 so
# every column in the frame shares the same dtype.
for _int_column in ("CHAS", "RAD"):
    data[_int_column] = data[_int_column].astype('float64')

In [8]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# identical on every run.
train, test = train_test_split(data, test_size=0.3, random_state=1)
# Work on explicit copies of both partitions.
# NOTE(review): presumably to decouple them from `data` and avoid
# SettingWithCopyWarning on later writes — confirm.
train = train.copy()
test = test.copy()

# (label, estimator) pairs compared in the evaluation loop.
models = [
    ('LR', LinearRegression()),
    ('LR_GD', regressao_linear_gd.LinearRegressionGD()),
    # SGDRegressor shuffles the training data between epochs; without a seed
    # its scores change from run to run. Seed it like the split above so the
    # reported numbers are reproducible.
    ('SGD', SGDRegressor(max_iter=1000, tol=1e-3, random_state=1)),
]

In [9]:
# Separate the predictors from the MEDV target, identically for both partitions.
x_train, y_train = train.drop(columns='MEDV'), train['MEDV']
x_test, y_test = test.drop(columns='MEDV'), test['MEDV']

In [10]:
# Sanity check: shapes of (train features, train target, test features, test target).
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

((354, 13), (354,), (152, 13), (152,))

In [11]:
# Two rescaled variants of the features are built for comparison:
# one with StandardScaler, one with MinMaxScaler.
# (Both classes are imported in the notebook's import cell.)
standard = StandardScaler()
normalizer = MinMaxScaler()

# Each scaler is fitted on the training features only and then applied
# unchanged to the test features, so no test-set statistics leak into the
# transformation.
x_train_standard = standard.fit_transform(x_train)
x_test_standard = standard.transform(x_test)

x_train_normalizer = normalizer.fit_transform(x_train)
x_test_normalizer = normalizer.transform(x_test)

In [12]:
# Feature-set variants to evaluate: label -> (training features, test features).
sets = {
    "Padrão": (x_train, x_test),
    "Standardizado": (x_train_standard, x_test_standard),
    "Normalizado": (x_train_normalizer, x_test_normalizer),
}

# Parallel lists: position i in all three describes one (model, feature set) run.
results, model_names, set_names = [], [], []

# Fit every model on every feature-set variant and score it on the matching
# test features with RMSE.
# NOTE(review): the same estimator instance is re-fitted for each variant;
# this assumes fit() starts from scratch every time — confirm for
# LinearRegressionGD.
for name, model in models:
    for set_name, (x_fit, x_eval) in sets.items():
        model.fit(x_fit, y_train)
        y_pred = model.predict(x_eval)
        rmse_score = metrics.rmse(y_test, y_pred)

        model_names.append(name)
        set_names.append(set_name)
        results.append(rmse_score)

In [13]:
# Assemble one row per (model, feature set) run and order from lowest to
# highest RMSE, so the best configurations appear first.
result_df = (
    pd.DataFrame({'Modelo': model_names, 'Conjunto': set_names, 'RMSE': results})
    .sort_values(by='RMSE', ascending=True)
)
result_df.head(10)

Unnamed: 0,Modelo,Conjunto,RMSE
1,LR,Standardizado,4.453237
2,LR,Normalizado,4.453237
0,LR,Padrão,4.453237
7,SGD,Standardizado,4.466548
8,SGD,Normalizado,4.466857
5,LR_GD,Normalizado,7.084474
4,LR_GD,Standardizado,222.3417
6,SGD,Padrão,110881600000000.0
3,LR_GD,Padrão,1.7913040000000002e+102
