In [56]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from datetime import datetime

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib as mpl
import seaborn as sns

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, accuracy_score, f1_score

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [57]:
# Paths of the two CSV splits, then one DataFrame per split.
train, test = 'train.csv', 'test.csv'

train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train, test))

In [58]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [59]:
# Column-by-column summary (non-null counts and dtypes) of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [60]:
# Predictor columns kept for modelling. Declared once so the train and test
# selections cannot drift apart.
FEATURE_COLUMNS = [
    'Id', 'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape',
    'Utilities', 'Condition1', 'Condition2', 'BldgType', 'OverallQual',
    'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofMatl', 'ExterQual',
    'Exterior1st', 'MasVnrType', 'ExterCond', 'BsmtCond', 'TotalBsmtSF',
    'HeatingQC', 'CentralAir', 'Electrical', 'GrLivArea', 'FullBath',
    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
    'TotRmsAbvGrd', 'Functional', 'GarageArea', 'GarageQual', 'PavedDrive',
    'PoolArea', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
]
# Target column; it is only selected from the training frame.
TARGET_COLUMN = 'SalePrice'

# .copy() makes both selections independent frames, so the in-place edits
# performed in later cells do not raise SettingWithCopyWarning.
train_df = train_df[FEATURE_COLUMNS + [TARGET_COLUMN]].copy()
test_df = test_df[FEATURE_COLUMNS].copy()

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 41 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             1460 non-null   int64 
 1   MSSubClass     1460 non-null   int64 
 2   MSZoning       1460 non-null   object
 3   LotArea        1460 non-null   int64 
 4   Street         1460 non-null   object
 5   LotShape       1460 non-null   object
 6   Utilities      1460 non-null   object
 7   Condition1     1460 non-null   object
 8   Condition2     1460 non-null   object
 9   BldgType       1460 non-null   object
 10  OverallQual    1460 non-null   int64 
 11  OverallCond    1460 non-null   int64 
 12  YearBuilt      1460 non-null   int64 
 13  YearRemodAdd   1460 non-null   int64 
 14  RoofMatl       1460 non-null   object
 15  ExterQual      1460 non-null   object
 16  Exterior1st    1460 non-null   object
 17  MasVnrType     1452 non-null   object
 18  ExterCond      1460 non-null

In [61]:
# Per-column fill values instead of a blanket 'NA':
#   * non-numeric columns keep the 'NA' placeholder string;
#   * numeric columns get the TRAINING median, so a gap never injects a string
#     into a numeric column (which would turn it into object dtype and move it
#     to the categorical branch of the later dtype split).
# NOTE(review): whether the test frame actually has numeric gaps is not shown
# in this notebook - confirm with test_df.isna().sum().
fill_values = {col: 'NA' for col in train_df.select_dtypes(exclude='number').columns}
fill_values.update(train_df.select_dtypes(include='number').median().to_dict())

# fillna with a dict only touches the listed columns that exist in the frame,
# so the train-only target entry is simply ignored for test_df.
train_df = train_df.fillna(fill_values)
test_df = test_df.fillna(fill_values)

In [62]:
# Total number of missing cells remaining in the training frame.
train_df.isnull().sum().sum()

0

In [63]:
# Split each frame into its numeric and text (object) columns.
# 'number' selects every numeric dtype; the 'int'/'float' aliases map to the
# platform default widths and may skip columns stored with another width.
# The explicit .copy() guarantees that the column assignments done in later
# cells operate on independent frames rather than on views.
num_df = train_df.select_dtypes(include='number').copy()
obj_df = train_df.select_dtypes(include=['object']).copy()

t_num_df = test_df.select_dtypes(include='number').copy()
t_obj_df = test_df.select_dtypes(include=['object']).copy()

In [66]:
# Min-max scale the numeric columns.
# The scaler is fitted on the TRAINING data only and then applied to the test
# data, so both frames share the same scale. Refitting on the test frame
# would map identical raw values to different scaled values in each split.
# MinMaxScaler comes from the notebook's top import cell.
scaler = MinMaxScaler()          # columns present in both train and test
target_scaler = MinMaxScaler()   # train-only columns (e.g. the target)

shared_cols = [col for col in num_df.columns if col in t_num_df.columns]
train_only_cols = [col for col in num_df.columns if col not in t_num_df.columns]

norm_parts = [
    pd.DataFrame(
        scaler.fit_transform(num_df[shared_cols]),
        columns=shared_cols,
        index=num_df.index,
    )
]
if train_only_cols:
    # Kept in its own scaler so predictions can later be mapped back to the
    # original units with target_scaler.inverse_transform.
    norm_parts.append(
        pd.DataFrame(
            target_scaler.fit_transform(num_df[train_only_cols]),
            columns=train_only_cols,
            index=num_df.index,
        )
    )
# Restore the original column order of num_df.
norm_df = pd.concat(norm_parts, axis=1)[list(num_df.columns)]

# transform (not fit_transform): test values outside the training range can
# legitimately fall outside [0, 1].
t_norm_df = pd.DataFrame(
    scaler.transform(t_num_df[shared_cols]),
    columns=shared_cols,
    index=t_num_df.index,
)

print(norm_df)

            Id  MSSubClass   LotArea  OverallQual  OverallCond  YearBuilt  \
0     0.000000    0.235294  0.033420     0.666667        0.500   0.949275   
1     0.000685    0.000000  0.038795     0.555556        0.875   0.753623   
2     0.001371    0.235294  0.046507     0.666667        0.500   0.934783   
3     0.002056    0.294118  0.038561     0.666667        0.500   0.311594   
4     0.002742    0.235294  0.060576     0.777778        0.500   0.927536   
...        ...         ...       ...          ...          ...        ...   
1455  0.997258    0.235294  0.030929     0.555556        0.500   0.920290   
1456  0.997944    0.000000  0.055505     0.555556        0.625   0.768116   
1457  0.998629    0.294118  0.036187     0.666667        1.000   0.500000   
1458  0.999315    0.000000  0.039342     0.444444        0.625   0.565217   
1459  1.000000    0.000000  0.040370     0.444444        0.625   0.673913   

      YearRemodAdd  TotalBsmtSF  GrLivArea  FullBath  HalfBath  BedroomAbvG

In [67]:
# Integer-encode the text columns.
# Each column's encoder is fitted on the TRAINING values and the resulting
# class -> code mapping is reused for the test frame, so a given category has
# the same code in both splits. Refitting on the test column would assign
# codes from the test categories alone.
# LabelEncoder comes from the notebook's top import cell.
encoder = LabelEncoder()

# Independent copies so the column assignments below never write into a view.
obj_df = obj_df.copy()
t_obj_df = t_obj_df.copy()

for columna in obj_df:
    obj_df[columna] = encoder.fit_transform(obj_df[columna])
    codigo_por_clase = {clase: codigo for codigo, clase in enumerate(encoder.classes_)}
    # Categories never seen in training have no code; they are marked with -1.
    t_obj_df[columna] = t_obj_df[columna].map(codigo_por_clase).fillna(-1).astype(int)

# Show the DataFrame with the encoded variables
print(obj_df)

      MSZoning  Street  LotShape  Utilities  Condition1  Condition2  BldgType  \
0            3       1         3          0           2           2         0   
1            3       1         3          0           1           2         0   
2            3       1         0          0           2           2         0   
3            3       1         0          0           2           2         0   
4            3       1         0          0           2           2         0   
...        ...     ...       ...        ...         ...         ...       ...   
1455         3       1         3          0           2           2         0   
1456         3       1         3          0           2           2         0   
1457         3       1         3          0           2           2         0   
1458         3       1         3          0           2           2         0   
1459         3       1         3          0           2           2         0   

      RoofMatl  ExterQual  

In [77]:
# Put the encoded text columns and the scaled numeric columns side by side,
# for the training frame and for the test frame.
train_df = pd.concat((obj_df, norm_df), axis="columns")
test_df = pd.concat((t_obj_df, t_norm_df), axis="columns")

In [85]:
# Features are every column except the target; the target is passed to the
# splitter as a single-column 2-D array.
# NOTE(review): only 'SalePrice' is dropped, so the scaled 'Id' column is
# still part of X - confirm that this is intended.
X = train_df.drop(columns='SalePrice')
y = train_df['SalePrice']

# 70 % of the rows go to training; shuffled with a fixed seed.
split_kwargs = dict(train_size=0.7, random_state=1234, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y.to_numpy().reshape(-1, 1),
    **split_kwargs,
)

In [86]:
# Fit an ordinary least-squares model on the training split (fit returns the
# estimator itself), then predict on both splits.
# LinearRegression comes from the notebook's top import cell.
modeloLR = LinearRegression().fit(X_train, y_train)

y_train_pred, y_test_pred = (modeloLR.predict(split) for split in (X_train, X_test))

print("Predicciones:", y_test_pred)

Predicciones: [0.26410316 0.19975022 0.26986419 ... 0.3194003  0.13538338 0.16111109]


In [90]:
from sklearn.metrics import mean_squared_error, r2_score

# Hold-out metrics, computed on the scaled target.
mse = mean_squared_error(y_test, y_test_pred)
# RMSE is derived from the MSE above rather than via the `squared=False`
# argument, which is deprecated and absent from recent scikit-learn releases.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_test_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("R²:", r2)

MSE: 0.0015256668708445848
RMSE: 0.03905978585251825
R²: 0.8413577854854589


In [None]:
# (rows, columns) of the processed test frame.
test_df.shape

(1459, 40)

In [83]:
# (rows, columns) of the training feature matrix.
X.shape

(1460, 40)