# Regressão

Técnica estatistica de modelagem que consiste no relacionamento entre variaveis dependentes e independentes. <br>
Muito ulitilazada para análises preditivas e de series temporais e para amostras ruidosas de dados.

É a utilização de uma reta que estima o melhor comportamento dos dados e minimiza o somatorio total dos erros de cada elemento sobre a reta.


In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [115]:
df = pd.read_csv('https://s3.amazonaws.com/pycourse/50_Startups.csv')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [0]:
#Extrair as matrizes X e y para regrassão do dataset

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0) #random embaralha #stratify mantem proporcional a divisão das labels

In [0]:
#Normalizando os dados
from sklearn.preprocessing import StandardScaler

In [119]:
sc = StandardScaler()
X_train.shape

(40, 4)

In [120]:
X_train[:, :-1] = sc.fit_transform(X_train[:, :-1])  #Fit é somente para treino na base de dados de treino, aplica em todas as colunas exeto a ultima coluna que é uma str
X_test[:, :-1] = sc.transform(X_test[:, :-1]) # Aplica a formula nos dados
print(X_train[:5, :])

[[-0.3500645436227844 -0.7854710924793271 0.1011968019362538 'Florida']
 [-0.555303187426314 -1.481174262628151 0.02734979174277092 'New York']
 [0.07935762307586282 0.8013338146656704 -0.551521323997471 'Florida']
 [-0.5463823849331263 1.3250581707161837 0.07011683779235604 'California']
 [0.4348537132854595 -0.3559866348200946 0.7514851578736048 'Florida']]


In [0]:
#Tratamento de variaveis categoricas -one-hot-encoding

from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [122]:
le_X = LabelEncoder() #converte as coluna str dos paiseis em int
X_train[:, -1] = le_X.fit_transform(X_train[: ,-1])
X_test[:, -1] = le_X.fit_transform(X_test[: ,-1])
print(X_train[:5, :])

[[-0.3500645436227844 -0.7854710924793271 0.1011968019362538 1]
 [-0.555303187426314 -1.481174262628151 0.02734979174277092 2]
 [0.07935762307586282 0.8013338146656704 -0.551521323997471 1]
 [-0.5463823849331263 1.3250581707161837 0.07011683779235604 0]
 [0.4348537132854595 -0.3559866348200946 0.7514851578736048 1]]


In [123]:
#Aplicando o one-hote encoding

ohe = OneHotEncoder(categorical_features=[3])
X_train = ohe.fit_transform(X_train).toarray()
X_test = ohe.transform(X_test).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [124]:
print(X_train[:5,:])

[[ 0.          1.          0.         -0.35006454 -0.78547109  0.1011968 ]
 [ 0.          0.          1.         -0.55530319 -1.48117426  0.02734979]
 [ 0.          1.          0.          0.07935762  0.80133381 -0.55152132]
 [ 1.          0.          0.         -0.54638238  1.32505817  0.07011684]
 [ 0.          1.          0.          0.43485371 -0.35598663  0.75148516]]


In [0]:
#evitando Dummy varible trap
X_train = X_train[:,1:]
X_test = X_test[:,1:]

In [126]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [0]:
y_pred = regressor.predict(X_test)

In [128]:
#calculo rmse

rmse = np.sqrt(np.mean((y_test - y_pred)**2))
print(rmse)

9137.990152794953


In [129]:
rmse_baseline = np.sqrt(np.mean((y_test - y_test.mean())**2))
print(rmse_baseline)

35761.59382638142


In [130]:
rmse_baseline / rmse

3.9135075906646

In [131]:
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)
print('R2 = ', r2)

R2 =  0.9347068473282424


In [132]:
#adicionando termo constante

from statsmodels.tools import add_constant
X_train = add_constant(X_train)
print(X_train[:5,:])

[[ 1.          1.          0.         -0.35006454 -0.78547109  0.1011968 ]
 [ 1.          0.          1.         -0.55530319 -1.48117426  0.02734979]
 [ 1.          1.          0.          0.07935762  0.80133381 -0.55152132]
 [ 1.          0.          0.         -0.54638238  1.32505817  0.07011684]
 [ 1.          1.          0.          0.43485371 -0.35598663  0.75148516]]


In [0]:
#teste de hipotese
import statsmodels.regression.linear_model as sm

In [134]:
#primeira interacao do nbackward elimination

X_opt = X_train[:, [0,1,2,3,4,5]]
regressor_OLS = sm.OLS(endog=y_train, exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,129.7
Date:,"Thu, 18 Jul 2019",Prob (F-statistic):,3.91e-21
Time:,15:35:44,Log-Likelihood:,-421.1
No. Observations:,40,AIC:,854.2
Df Residuals:,34,BIC:,864.3
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.094e+05,2471.900,44.274,0.000,1.04e+05,1.14e+05
x1,-959.2842,4038.108,-0.238,0.814,-9165.706,7247.138
x2,699.3691,3661.563,0.191,0.850,-6741.822,8140.560
x3,3.573e+04,2547.324,14.025,0.000,3.05e+04,4.09e+04
x4,851.3016,1720.699,0.495,0.624,-2645.580,4348.183
x5,4519.8828,2398.520,1.884,0.068,-354.495,9394.261

0,1,2,3
Omnibus:,15.823,Durbin-Watson:,2.468
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23.231
Skew:,-1.094,Prob(JB):,9.03e-06
Kurtosis:,6.025,Cond. No.,4.39


In [135]:
X_opt = X_train[:, [0,1,3,4,5]]
regressor_OLS = sm.OLS(endog=y_train, exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.944
Method:,Least Squares,F-statistic:,166.7
Date:,"Thu, 18 Jul 2019",Prob (F-statistic):,2.87e-22
Time:,15:35:44,Log-Likelihood:,-421.12
No. Observations:,40,AIC:,852.2
Df Residuals:,35,BIC:,860.7
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.098e+05,1777.949,61.737,0.000,1.06e+05,1.13e+05
x1,-1272.1608,3639.780,-0.350,0.729,-8661.308,6116.986
x2,3.581e+04,2470.255,14.498,0.000,3.08e+04,4.08e+04
x3,825.0471,1691.426,0.488,0.629,-2608.731,4258.825
x4,4484.9477,2358.387,1.902,0.065,-302.833,9272.729

0,1,2,3
Omnibus:,16.074,Durbin-Watson:,2.467
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24.553
Skew:,-1.086,Prob(JB):,4.66e-06
Kurtosis:,6.164,Cond. No.,3.53


In [136]:
X_opt = X_train[:, [0,3,4,5]]
regressor_OLS = sm.OLS(endog=y_train, exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,227.8
Date:,"Thu, 18 Jul 2019",Prob (F-statistic):,1.8499999999999998e-23
Time:,15:35:44,Log-Likelihood:,-421.19
No. Observations:,40,AIC:,850.4
Df Residuals:,36,BIC:,857.1
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.094e+05,1508.711,72.543,0.000,1.06e+05,1.13e+05
x1,3.597e+04,2397.760,15.003,0.000,3.11e+04,4.08e+04
x2,760.8842,1660.808,0.458,0.650,-2607.390,4129.159
x3,4285.3366,2260.123,1.896,0.066,-298.405,8869.078

0,1,2,3
Omnibus:,15.557,Durbin-Watson:,2.481
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22.539
Skew:,-1.081,Prob(JB):,1.28e-05
Kurtosis:,5.974,Cond. No.,2.83


In [137]:
X_opt = X_train[:, [0,1,3,5]]
regressor_OLS = sm.OLS(endog=y_train, exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,227.0
Date:,"Thu, 18 Jul 2019",Prob (F-statistic):,1.9599999999999997e-23
Time:,15:35:44,Log-Likelihood:,-421.25
No. Observations:,40,AIC:,850.5
Df Residuals:,36,BIC:,857.3
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.097e+05,1756.314,62.470,0.000,1.06e+05,1.13e+05
x1,-1079.4682,3579.778,-0.302,0.765,-8339.594,6180.658
x2,3.632e+04,2216.021,16.391,0.000,3.18e+04,4.08e+04
x3,4172.4461,2245.551,1.858,0.071,-381.742,8726.634

0,1,2,3
Omnibus:,15.078,Durbin-Watson:,2.51
Prob(Omnibus):,0.001,Jarque-Bera (JB):,22.142
Skew:,-1.032,Prob(JB):,1.56e-05
Kurtosis:,6.004,Cond. No.,3.32


In [138]:
X_opt = X_train[:, [0,3]]
regressor_OLS = sm.OLS(endog=y_train, exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.945
Model:,OLS,Adj. R-squared:,0.944
Method:,Least Squares,F-statistic:,652.4
Date:,"Thu, 18 Jul 2019",Prob (F-statistic):,1.56e-25
Time:,15:35:45,Log-Likelihood:,-423.09
No. Observations:,40,AIC:,850.2
Df Residuals:,38,BIC:,853.6
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.094e+05,1540.062,71.066,0.000,1.06e+05,1.13e+05
x1,3.934e+04,1540.062,25.542,0.000,3.62e+04,4.25e+04

0,1,2,3
Omnibus:,13.132,Durbin-Watson:,2.325
Prob(Omnibus):,0.001,Jarque-Bera (JB):,16.254
Skew:,-0.991,Prob(JB):,0.000295
Kurtosis:,5.413,Cond. No.,1.0


In [142]:
X_test = X_test[:, [2,4]]
X_train = X_train [:, [3, 5]]

IndexError: ignored

In [143]:
X_train.shape[1] == X_test.shape[1]

True

In [144]:
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)


ValueError: ignored