# Ejemplo de Regresión Lineal
Vamos a realizar la regresión lineal del conjunto de datos visto en la sesión anterior.

In [16]:
#Bibliotecas de análisis
import pandas as pd
#Bibliotecas para regresión
from sklearn.linear_model  import LinearRegression
from sklearn.model_selection import train_test_split
#Funciones estadísticas propias
from Funciones.Funciones import estadisticas
import statsmodels.api as sm

In [2]:
# Load the training and test CSV files into DataFrames.
train, test_data = (
    pd.read_csv('../data/train.csv'),
    pd.read_csv('../data/test.csv'),
)

In [3]:
# Inspect the data: display the first 10 rows of the training frame.
train.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [4]:
# Stack the training and test frames into one frame; the outer index
# level ("train" / "test") records which file each row came from.
data = pd.concat(
    objs=[train, test_data],
    keys=["train", "test"],
)

In [5]:
# Display the (rows, columns) shape of the combined frame.
data.shape

(2919, 81)

In [6]:
# Choose the subset of variables to use as model features.
# sample_submission.csv - a benchmark submission from a linear regression on year and month of sale, lot square footage, and number of bedrooms
final_features = [
    "BedroomAbvGr",
    "YrSold",
    "MoSold",
    "LotArea",
]
# Summarise dtypes and non-null counts for the selected columns.
data[final_features].info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2919 entries, ('train', 0) to ('test', 1458)
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   BedroomAbvGr  2919 non-null   int64
 1   YrSold        2919 non-null   int64
 2   MoSold        2919 non-null   int64
 3   LotArea       2919 non-null   int64
dtypes: int64(4)
memory usage: 143.5+ KB


In [7]:
# Build the feature matrix and the target from the rows keyed "train".
X_train = data.loc["train"][final_features]
target = data.loc["train"]["SalePrice"]

In [8]:
# Hold out 20% of the training rows for evaluation (fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    X_train,
    target,
    test_size=0.2,
    random_state=5,
)

In [9]:
# Create the linear model and fit it on the training split.
reg = LinearRegression()
reg.fit(X=x_train, y=y_train)

LinearRegression()

In [10]:
# Display the fitted coefficients (one per column of x_train).
reg.coef_

array([ 1.30604130e+04, -9.99887645e+02,  1.55351210e+03,  1.75346489e+00])

In [11]:
# Predict on the hold-out split produced by train_test_split (lowercase x_test).
y_predicted = reg.predict(x_test)
# Feature columns of the rows keyed "test" in the combined frame
# (uppercase X_test); not the input of the prediction above.
X_test = data.loc["test"][final_features]

In [24]:
# Display the fitted intercept term.
reg.intercept_

2121866.203882918

In [13]:
# Model score.
# NOTE(review): the argument order here (predictions, then observed values)
# is the reverse of the following call, estadisticas(y_test.tolist(), y_predicted).
# The recorded outputs show the same error metrics for both calls but a
# different R-Square (about -23.07 here vs about 0.093 there), so the helper
# presumably expects (observed, predicted) - confirm against
# Funciones.Funciones and do not rely on this cell's R-Square.
estadisticas(y_predicted, y_test)

R-Square Value -23.065056246110565


mean_absolute_error : 53329.0350028078


mean_squared_error :  5886102184.571207


root_mean_squared_error :  76720.93706786438


In [14]:
# Score the model passing the observed hold-out targets first and the
# predictions second (presumably the order the helper expects - confirm
# against Funciones.Funciones).
estadisticas(y_test.tolist(),y_predicted)

R-Square Value 0.09305143296865448


mean_absolute_error : 53329.0350028078


mean_squared_error :  5886102184.571207


root_mean_squared_error :  76720.93706786438


In [20]:
# Linear regression, statistical version (statsmodels OLS).
# add_constant appends the intercept column that OLS does not add by itself.
X2 = sm.add_constant(x_train)
est = sm.OLS(endog=y_train, exog=X2)
est2 = est.fit()

In [22]:
# Print the text summary table of the fitted OLS results.
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.088
Model:                            OLS   Adj. R-squared:                  0.085
Method:                 Least Squares   F-statistic:                     28.09
Date:                Wed, 02 Mar 2022   Prob (F-statistic):           2.68e-22
Time:                        18:21:16   Log-Likelihood:                -14777.
No. Observations:                1168   AIC:                         2.956e+04
Df Residuals:                    1163   BIC:                         2.959e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         2.122e+06   3.41e+06      0.622   