In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from scipy.stats import linregress
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_poisson_deviance
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


random_state = 100
warnings.filterwarnings('ignore')

In [80]:
# Leemos los datos
data = pd.read_csv('BDpracticafinal.csv')
data.columns = data.columns.str.replace(" ","_")
# Escojemos solo las zona residenciales bajas, medias y altas
data = data[data.MS_Zoning.isin(['RH','RL','RM'])]

# Rellenamos valores missing
data['Lot_Frontage'] = data['Lot_Frontage'].fillna(0)
data['Mas_Vnr_Type']= data['Mas_Vnr_Type'].fillna('None')
data['Garage_Cars']= data['Garage_Cars'].fillna(0)
data['Garage_Area']= data['Garage_Area'].fillna(0)
data['Garage_Cond']= data['Garage_Cond'].fillna('Sg')
data = data.drop(columns='Order')

In [81]:
pendiente = {'Gtl':0, 'Mod':1, 'Sev':2}
calidad_sotano = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'Ss':0}
calidad_garage = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'Sg':0}
calidad_piscina = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'Sp':0}
calidad_general = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1}
quality_columns = ['Exter_Qual', 'Heating_QC', 'Kitchen_Qual']

data[quality_columns] = data[quality_columns].replace(calidad_general)
data['Land_Slope'] = data['Land_Slope'].replace(pendiente)
data['Bsmt_Cond'] = data['Bsmt_Cond'].replace(calidad_sotano)
data['Garage_Cond'] = data['Garage_Cond'].replace(calidad_garage)
data['Pool_QC'] = data['Pool_QC'].replace(calidad_piscina)
data = pd.get_dummies(data)

In [82]:
var_to_pred = 'Bedroom_AbvGr'

In [83]:
data_train, data_test = train_test_split(data, test_size = 0.3, random_state = random_state)

In [95]:
#2. mediante sklearn:
from sklearn.preprocessing import StandardScaler

# Esta segunda es más recomendable debido a que :
# - Lo hacemos en dos pasos. Con el primer paso calculamos los parámetros mean and std que quedarán embbebidos en el objeto scaler:
scaler = StandardScaler()
train_scaler = scaler.fit(data_train)

# - después normalizamos el conjunto de datos que queramos con dichos parámetros. Es necesario crear un dataframe nuevo ya que estos métodos
# devuelven un array numpy
data_train_norm = pd.DataFrame(index=data_train.index, columns=data_train.columns, data=train_scaler.transform(data_train))

In [96]:
X = data_train_norm.drop(columns=var_to_pred)
y = data_train_norm[var_to_pred]

In [97]:
lr = LinearRegression(normalize=False)

In [98]:
lr.fit(X,y)

LinearRegression()

In [99]:
lr.score(X,y)

0.5554851263906644

In [100]:
data_test_norm = pd.DataFrame(index=data_test.index,  columns=data_test.columns, data=train_scaler.transform(data_test))
Xtest = data_test_norm.drop(columns=var_to_pred)
ytest = data_test_norm[var_to_pred]

In [101]:
predictions = lr.predict(Xtest)

In [102]:
lr.score(Xtest,ytest)

0.5489930211019133

In [103]:
predicted_data_norm = data_test_norm.copy()
predicted_data_norm[var_to_pred] = predictions

In [104]:
predicted_data_unorm = pd.DataFrame(index=predicted_data_norm.index, columns=predicted_data_norm.columns, data=train_scaler.inverse_transform(predicted_data_norm))

In [105]:
predicted_data_unorm[var_to_pred].head()

1490    3.100668
2156    1.189870
2530    2.905254
1599    3.052293
1508    2.798935
Name: Bedroom_AbvGr, dtype: float64

In [106]:
data_test[var_to_pred].head()

1490    3
2156    1
2530    3
1599    3
1508    3
Name: Bedroom_AbvGr, dtype: int64

In [107]:
predicted_data_unorm['predicted'] = predicted_data_unorm[var_to_pred]
predicted_data_unorm[var_to_pred] = data_test[var_to_pred]

In [108]:
predicted_data_unorm[[var_to_pred,'predicted']].head(14)

Unnamed: 0,Bedroom_AbvGr,predicted
1490,3,3.100668
2156,1,1.18987
2530,3,2.905254
1599,3,3.052293
1508,3,2.798935
802,3,2.801101
2816,3,2.980373
1713,3,2.766752
334,3,2.836557
1538,3,3.095329


In [109]:
mean_squared_error(predicted_data_unorm[var_to_pred], predicted_data_unorm['predicted'], squared=False)

0.5679560969957662

In [110]:
mean_poisson_deviance(predicted_data_unorm[var_to_pred], predicted_data_unorm['predicted'])

0.13601836799057423

# Poisson Regression #

In [111]:
data_test_poisson = data_test.copy()
data_training_poisson = data_train.copy()

In [112]:
X_poisson = data_training_poisson.drop(columns=var_to_pred)
y_poisson = data_training_poisson[var_to_pred]

In [113]:

poisson_scaler = StandardScaler()
train_poisson_scaler = poisson_scaler.fit(X_poisson)

X_poisson_norm = pd.DataFrame(index=X_poisson.index, columns=X_poisson.columns, data=train_poisson_scaler.transform(X_poisson))

In [114]:
from sklearn.linear_model import PoissonRegressor

In [115]:
pr = PoissonRegressor()
pr.fit(X_poisson_norm, y_poisson)

PoissonRegressor()

In [116]:
accuracy = pr.score(X_poisson_norm, y_poisson)
accuracy

0.48434420320271965

In [117]:
Xtest_poisson = data_test_poisson.drop(columns=var_to_pred)
ytest_poisson = data_test_poisson[var_to_pred]
Xtest_poisson_norm = pd.DataFrame(index=Xtest_poisson.index,  columns=Xtest_poisson.columns, data=train_poisson_scaler.transform(Xtest_poisson))



In [118]:
pr.score(Xtest_poisson_norm, ytest_poisson)

0.43379147324448797

In [119]:
predictions = pr.predict(Xtest_poisson_norm)

In [122]:
predictions

array([3.01735067, 1.86900968, 2.94421885, 2.93138319, 2.74847584,
       2.85637334, 2.98004025, 2.75870311, 2.80204574, 3.09210824,
       2.4975372 , 2.99234773, 2.73889538, 2.74959646, 2.76906666,
       3.22221086, 3.942886  , 3.54865843, 2.52469845, 2.58810734,
       2.86525508, 2.91376545, 4.8965373 , 1.98931489, 2.7169204 ,
       2.77469242, 2.73083286, 3.34027224, 3.678363  , 3.34285739,
       2.57889337, 3.12165486, 3.44788109, 2.74685416, 2.9776927 ,
       2.78088665, 3.26465036, 2.82477071, 2.47528418, 2.53896755,
       2.49874981, 2.67704957, 3.16674045, 2.83114594, 2.9770135 ,
       2.58909507, 3.31676482, 2.65987135, 2.94982821, 2.8228127 ,
       2.66618818, 2.61535609, 2.60587788, 3.02768306, 2.36609909,
       3.09517209, 2.80458856, 2.57325514, 3.05140941, 2.52498442,
       2.41327857, 2.82905461, 2.07120743, 2.80778863, 3.06991581,
       3.04055754, 2.80334471, 2.00367671, 2.60072281, 2.33549798,
       2.8183122 , 2.61331605, 3.58677754, 3.022859  , 4.07771

In [120]:
mean_squared_error(ytest_poisson, predictions, squared=False)

0.6060025468350156

In [121]:
mean_poisson_deviance(ytest_poisson, predictions)

0.1567373868651299