In [1]:
#Importación de bibliotecas
import pandas as pd
import seaborn as sb
import numpy as np
import matplotlib as mpl
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

In [2]:
# Load the raw training and test tables from the data folder.
train, test_data = (
    pd.read_csv('../data/train.csv'),
    pd.read_csv('../data/test.csv'),
)

In [3]:
# Keep the training target, then stack the training and test tables into one
# frame; `keys` labels each source table so it can be selected again later.
y = train["SalePrice"]
data = pd.concat(objs=[train, test_data], keys=["train", "test"])

In [4]:
# Split the variables by measurement type
UMBRAL_CATEGORIAS = 50  # columns with fewer distinct values than this are reported as categorical


def _columnas_por_dtype(df, nombre_dtype):
    """Return the names of the columns of ``df`` whose dtype prints exactly as ``nombre_dtype``."""
    return [col for col, dtype in df.dtypes.items() if str(dtype) == nombre_dtype]


def _reportar_por_cardinalidad(cardinalidad, columnas, alta):
    """Print ``name count`` for the columns on one side of the threshold and return them.

    cardinalidad: mapping from column name to its number of distinct values.
    columnas: column names to inspect, in the order they are printed.
    alta: True selects the columns with at least UMBRAL_CATEGORIAS distinct
        values, False selects the columns below the threshold.
    """
    seleccion = [c for c in columnas if (cardinalidad[c] >= UMBRAL_CATEGORIAS) == alta]
    for c in seleccion:
        print(c, cardinalidad[c])
    return seleccion


continuas = _columnas_por_dtype(data, "float64")
enteras = _columnas_por_dtype(data, "int64")
# Deliberately NOT called `object`: that name would shadow the builtin `object`
# for every cell executed afterwards in the same kernel.
objetos = _columnas_por_dtype(data, "object")

# Integer-typed columns that are nevertheless handled as continuous measurements.
continuas = continuas + [
    "LowQualFinSF",
    "3SsnPorch", "PoolArea", "MiscVal", "YrSold",
    "FullBath", "HalfBath", "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
]

nominales = [
    "MSSubClass", "MoSold",
    "MSZoning", "Street", "Alley", "LotShape", "LandContour",
    "Utilities", "LotConfig", "LandSlope", "Neighborhood",
    "Condition1", "Condition2", "BldgType", "HouseStyle",
    "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
    "MasVnrType", "Foundation", "Heating", "Electrical",
    "Functional", "GarageType", "PavedDrive", "Fence",
    "MiscFeature", "SaleType", "SaleCondition",
]

ordinal = [
    "OverallQual", "OverallCond",
    "Fireplaces",
    "ExterQual",
    "ExterCond",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    # BsmtFinType1 was missing from every group; it is placed next to its
    # sibling BsmtFinType2 so that both basement-finish columns are classified.
    "BsmtFinType1",
    "BsmtFinType2",
    "HeatingQC",
    "KitchenQual",
    "FireplaceQu",
    "GarageFinish",
    "GarageQual",
    "GarageCond",
    "PoolQC",
]

dicotomicas = ["CentralAir",]

# Distinct (non-missing) values per column, computed once for all the reports below.
cardinalidad = data.nunique()

_reportar_por_cardinalidad(cardinalidad, enteras, alta=False)
print("********Object*********")
_reportar_por_cardinalidad(cardinalidad, objetos, alta=False)
print("********Features con 50 categorías o más*****")
# High-cardinality integer columns are treated as continuous.
for x in _reportar_por_cardinalidad(cardinalidad, enteras, alta=True):
    if x not in continuas:
        continuas.append(x)
# "Id" is excluded from the feature groups; the guard keeps the cell from
# raising ValueError when "Id" was never added.
if "Id" in continuas:
    continuas.remove("Id")
print("********Object*********")
# NOTE(review): high-cardinality text columns are also labelled as continuous
# here, which looks questionable for string data — confirm the intent.
for x in _reportar_por_cardinalidad(cardinalidad, objetos, alta=True):
    if x not in continuas:
        continuas.append(x)

# NOTE(review): if the target SalePrice is stored as float64 in `data`, it is
# part of `continuas` and therefore counted below — confirm whether it should be.
print("En total hay", len(continuas + dicotomicas + nominales + ordinal) , "features")

MSSubClass 16
OverallQual 10
OverallCond 9
LowQualFinSF 36
FullBath 5
HalfBath 3
BedroomAbvGr 8
KitchenAbvGr 4
TotRmsAbvGrd 14
Fireplaces 5
3SsnPorch 31
PoolArea 14
MiscVal 38
MoSold 12
YrSold 5
********Object*********
MSZoning 5
Street 2
Alley 2
LotShape 4
LandContour 4
Utilities 2
LotConfig 5
LandSlope 3
Neighborhood 25
Condition1 9
Condition2 8
BldgType 5
HouseStyle 8
RoofStyle 6
RoofMatl 8
Exterior1st 15
Exterior2nd 16
MasVnrType 4
ExterQual 4
ExterCond 5
Foundation 6
BsmtQual 4
BsmtCond 4
BsmtExposure 4
BsmtFinType1 6
BsmtFinType2 6
Heating 6
HeatingQC 5
CentralAir 2
Electrical 5
KitchenQual 4
Functional 7
FireplaceQu 5
GarageType 6
GarageFinish 3
GarageQual 5
GarageCond 5
PavedDrive 3
PoolQC 3
Fence 4
MiscFeature 4
SaleType 9
SaleCondition 6
********Features con 50 categorías o más*****
Id 2919
LotArea 1951
YearBuilt 118
YearRemodAdd 61
1stFlrSF 1083
2ndFlrSF 635
GrLivArea 1292
WoodDeckSF 379
OpenPorchSF 252
EnclosedPorch 183
ScreenPorch 121
********Object*********
En total hay 7

In [5]:
# Recode missing values: each listed column gets an explicit fill value
# (a dedicated label for the categorical columns, 0 for MasVnrArea).
valores_relleno = {
    "FireplaceQu": "NF",
    "Fence": "NFe",
    "Alley": "NoAlley",
    "MiscFeature": "NoMiscFeature",
    "PoolQC": "NoP",
    "MasVnrType": "NoMasonry",
    "MasVnrArea": 0,
    "BsmtQual": "NB",       # TODO: switch to nominal
    "BsmtCond": "NB",       # TODO: switch to nominal
    "BsmtExposure": "NB",   # TODO: switch to nominal
    "BsmtFinType1": "NB",   # TODO: switch to nominal
    "BsmtFinType2": "NB",   # TODO: switch to nominal
    "GarageType": "NG",
}
for columna, valor in valores_relleno.items():
    data[columna] = data[columna].fillna(valor)
# Electrical is not recoded here: its missing value is genuinely missing, and
# the suggestion is to drop those records instead.



In [6]:
# Percentage of missing values per column, displayed in ascending order.
total_filas = len(data)
porcentajes = {
    column: int(data[column].isna().sum()) / total_filas * 100
    for column in data.columns
}
sorted(porcentajes.items(), key=lambda par: par[1])

[('Id', 0.0),
 ('MSSubClass', 0.0),
 ('LotArea', 0.0),
 ('Street', 0.0),
 ('Alley', 0.0),
 ('LotShape', 0.0),
 ('LandContour', 0.0),
 ('LotConfig', 0.0),
 ('LandSlope', 0.0),
 ('Neighborhood', 0.0),
 ('Condition1', 0.0),
 ('Condition2', 0.0),
 ('BldgType', 0.0),
 ('HouseStyle', 0.0),
 ('OverallQual', 0.0),
 ('OverallCond', 0.0),
 ('YearBuilt', 0.0),
 ('YearRemodAdd', 0.0),
 ('RoofStyle', 0.0),
 ('RoofMatl', 0.0),
 ('MasVnrType', 0.0),
 ('MasVnrArea', 0.0),
 ('ExterQual', 0.0),
 ('ExterCond', 0.0),
 ('Foundation', 0.0),
 ('BsmtQual', 0.0),
 ('BsmtCond', 0.0),
 ('BsmtExposure', 0.0),
 ('BsmtFinType1', 0.0),
 ('BsmtFinType2', 0.0),
 ('Heating', 0.0),
 ('HeatingQC', 0.0),
 ('CentralAir', 0.0),
 ('1stFlrSF', 0.0),
 ('2ndFlrSF', 0.0),
 ('LowQualFinSF', 0.0),
 ('GrLivArea', 0.0),
 ('FullBath', 0.0),
 ('HalfBath', 0.0),
 ('BedroomAbvGr', 0.0),
 ('KitchenAbvGr', 0.0),
 ('TotRmsAbvGrd', 0.0),
 ('Fireplaces', 0.0),
 ('FireplaceQu', 0.0),
 ('GarageType', 0.0),
 ('PavedDrive', 0.0),
 ('WoodDeckSF',

In [7]:
# Collect the features that need imputation: LotFrontage is always included,
# plus every column whose missing percentage is strictly between 0 and 10.
features_imputacion = ['LotFrontage']
for nombre, pct in porcentajes.items():
    if 0 < pct < 10:
        features_imputacion.append(nombre)
        print(nombre, pct)

MSZoning 0.1370332305584104
Utilities 0.0685166152792052
Exterior1st 0.0342583076396026
Exterior2nd 0.0342583076396026
BsmtFinSF1 0.0342583076396026
BsmtFinSF2 0.0342583076396026
BsmtUnfSF 0.0342583076396026
TotalBsmtSF 0.0342583076396026
Electrical 0.0342583076396026
BsmtFullBath 0.0685166152792052
BsmtHalfBath 0.0685166152792052
KitchenQual 0.0342583076396026
Functional 0.0685166152792052
GarageYrBlt 5.4470709146968135
GarageFinish 5.4470709146968135
GarageCars 0.0342583076396026
GarageArea 0.0342583076396026
GarageQual 5.4470709146968135
GarageCond 5.4470709146968135
SaleType 0.0342583076396026


In [8]:
# Imputation
# Numeric columns are filled with their mean and every other column with its
# most frequent value. The strategy is chosen from the column dtype instead of
# catching whatever exception the mean imputer raises, so a genuine failure is
# no longer silently turned into a mode imputation.
for x in features_imputacion:
    es_numerica = pd.api.types.is_numeric_dtype(data[x])
    estrategia = "mean" if es_numerica else "most_frequent"
    imputer = SimpleImputer(missing_values=np.nan, strategy=estrategia)
    data[x] = imputer.fit_transform(data[[x]])
    # Only the mean-imputed columns are announced.
    if es_numerica:
        print("Se imputó con éxito la variable", x)

Se imputó con éxito la variable LotFrontage
Se imputó con éxito la variable BsmtFinSF1
Se imputó con éxito la variable BsmtFinSF2
Se imputó con éxito la variable BsmtUnfSF
Se imputó con éxito la variable TotalBsmtSF
Se imputó con éxito la variable BsmtFullBath
Se imputó con éxito la variable BsmtHalfBath
Se imputó con éxito la variable GarageYrBlt
Se imputó con éxito la variable GarageCars
Se imputó con éxito la variable GarageArea


In [9]:
# Re-check the percentage of missing values per column after imputation.
n_filas = len(data)
porcentajes = {}
for nombre, faltantes in data.isna().sum().items():
    porcentajes[nombre] = int(faltantes) / n_filas * 100
sorted(porcentajes.items(), key=lambda item: item[1])

[('Id', 0.0),
 ('MSSubClass', 0.0),
 ('MSZoning', 0.0),
 ('LotFrontage', 0.0),
 ('LotArea', 0.0),
 ('Street', 0.0),
 ('Alley', 0.0),
 ('LotShape', 0.0),
 ('LandContour', 0.0),
 ('Utilities', 0.0),
 ('LotConfig', 0.0),
 ('LandSlope', 0.0),
 ('Neighborhood', 0.0),
 ('Condition1', 0.0),
 ('Condition2', 0.0),
 ('BldgType', 0.0),
 ('HouseStyle', 0.0),
 ('OverallQual', 0.0),
 ('OverallCond', 0.0),
 ('YearBuilt', 0.0),
 ('YearRemodAdd', 0.0),
 ('RoofStyle', 0.0),
 ('RoofMatl', 0.0),
 ('Exterior1st', 0.0),
 ('Exterior2nd', 0.0),
 ('MasVnrType', 0.0),
 ('MasVnrArea', 0.0),
 ('ExterQual', 0.0),
 ('ExterCond', 0.0),
 ('Foundation', 0.0),
 ('BsmtQual', 0.0),
 ('BsmtCond', 0.0),
 ('BsmtExposure', 0.0),
 ('BsmtFinType1', 0.0),
 ('BsmtFinSF1', 0.0),
 ('BsmtFinType2', 0.0),
 ('BsmtFinSF2', 0.0),
 ('BsmtUnfSF', 0.0),
 ('TotalBsmtSF', 0.0),
 ('Heating', 0.0),
 ('HeatingQC', 0.0),
 ('CentralAir', 0.0),
 ('Electrical', 0.0),
 ('1stFlrSF', 0.0),
 ('2ndFlrSF', 0.0),
 ('LowQualFinSF', 0.0),
 ('GrLivArea', 0.

In [10]:
# Final feature selection used to build the model matrix.
final_features = [
    "BedroomAbvGr",
    "YrSold",
    "MoSold",
    "LotArea",
]

In [11]:
# Training data: select the rows stored under the "train" key of `data`,
# take the target from them, then keep only the selected features.
filas_train = data.loc["train"]
y = filas_train["SalePrice"]
# .copy() makes X_train an independent frame, so later column assignments on it
# do not raise SettingWithCopyWarning or depend on view-vs-copy behaviour.
X_train = filas_train[final_features].copy()

In [12]:
# Downcast LotArea to the smallest float dtype that holds its values.
# `assign` returns a new frame, so nothing is written into a possible slice of
# `data` (no SettingWithCopyWarning).
X_train = X_train.assign(LotArea=pd.to_numeric(X_train["LotArea"], downcast='float'))


In [15]:
# Fit a linear regression of the target on the selected training features.
# The fit call stays as the last expression so the cell displays the estimator.
reg = LinearRegression()
reg.fit(X=X_train, y=y)



LinearRegression()