In [401]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing datasets

In [402]:
dataset_train = pd.read_csv('train.csv')
dataset_test = pd.read_csv('test.csv')

In [403]:
# Retrieving basic information about the training set
dataset_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [404]:
dataset_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

# Data wrangling

In [405]:
# First, I select all the numeric columns and remove Id and SalePrice to keep them separated.
ds_train = dataset_train.select_dtypes(include=np.number)
ds_train = ds_train.drop('Id', axis=1)
ds_train = ds_train.drop('SalePrice', axis=1)
# For test set as well:
ds_test = dataset_test.select_dtypes(include=np.number)
ds_test = ds_test.drop('Id', axis=1)

# Separating the Id column as it will be needed as index for predictions.
# For consistency, we keep it as first column both in the training test
# and the test set even though the latter only will be needed.
id_train = dataset_train['Id']
id_test = dataset_test['Id']

# Separating the SalePrice column for order consistency when concatenating later.
sp_train = dataset_train['SalePrice']

# Identifying other columns that I will need for my model. I already identified
# what type of preprocessing they will need to go through.
train_columns_ordinal = dataset_train[['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual',\
                                      'GarageQual', 'GarageCond']]
train_columns_onehot = dataset_train[['Condition1']]
# For the test set as well:
test_columns_ordinal = dataset_test[['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual',\
                                      'GarageQual', 'GarageCond']]
test_columns_onehot = dataset_test[['Condition1']]

In [406]:
print(ds_train.head(2))

   MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0          60         65.0     8450            7            5       2003   
1          20         80.0     9600            6            8       1976   

   YearRemodAdd  MasVnrArea  BsmtFinSF1  BsmtFinSF2  ...  GarageArea  \
0          2003       196.0         706           0  ...         548   
1          1976         0.0         978           0  ...         460   

   WoodDeckSF  OpenPorchSF  EnclosedPorch  3SsnPorch  ScreenPorch  PoolArea  \
0           0           61              0          0            0         0   
1         298            0              0          0            0         0   

   MiscVal  MoSold  YrSold  
0        0       2    2008  
1        0       5    2007  

[2 rows x 36 columns]


## Preprocessing

Some columns have a quality scale, such as:

* Ex	Excellent
* Gd	Good
* TA	Typical/Average
* Fa	Fair
* Po	Poor
* NA	Not present

So first I convert this scale to numerical values and apply them accordingly.

In [407]:
# Creating a "scale conversion" dictionary
scale_conv = {"NA":0, "Po":1, "Fa":2, "TA":3, "Gd":4, "Ex":5}
# Applying the conversion to the ordinal columns
train_columns_ordinal = train_columns_ordinal.replace(scale_conv)
test_columns_ordinal = test_columns_ordinal.replace(scale_conv)

In [408]:
# Then, I apply the One Hot Encoding conversion to the columns 'Utilities', 'Condition1'and 'Condition2'
train_columns_onehot = pd.get_dummies(train_columns_onehot)
test_columns_onehot = pd.get_dummies(test_columns_onehot)

In [409]:
# Finally, I concatenate all the preprocessed dataframes into a single set
ds_train = pd.concat([id_train, train_columns_ordinal, train_columns_onehot, sp_train], axis=1)
ds_test = pd.concat([id_test, test_columns_ordinal, test_columns_onehot], axis=1)

## Handling missing data

In [410]:
ds_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Id                 1460 non-null   int64  
 1   ExterQual          1460 non-null   int64  
 2   ExterCond          1460 non-null   int64  
 3   BsmtQual           1423 non-null   float64
 4   BsmtCond           1423 non-null   float64
 5   HeatingQC          1460 non-null   int64  
 6   KitchenQual        1460 non-null   int64  
 7   GarageQual         1379 non-null   float64
 8   GarageCond         1379 non-null   float64
 9   Condition1_Artery  1460 non-null   uint8  
 10  Condition1_Feedr   1460 non-null   uint8  
 11  Condition1_Norm    1460 non-null   uint8  
 12  Condition1_PosA    1460 non-null   uint8  
 13  Condition1_PosN    1460 non-null   uint8  
 14  Condition1_RRAe    1460 non-null   uint8  
 15  Condition1_RRAn    1460 non-null   uint8  
 16  Condition1_RRNe    1460 

In [411]:
ds_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Id                 1459 non-null   int64  
 1   ExterQual          1459 non-null   int64  
 2   ExterCond          1459 non-null   int64  
 3   BsmtQual           1415 non-null   float64
 4   BsmtCond           1414 non-null   float64
 5   HeatingQC          1459 non-null   int64  
 6   KitchenQual        1458 non-null   float64
 7   GarageQual         1381 non-null   float64
 8   GarageCond         1381 non-null   float64
 9   Condition1_Artery  1459 non-null   uint8  
 10  Condition1_Feedr   1459 non-null   uint8  
 11  Condition1_Norm    1459 non-null   uint8  
 12  Condition1_PosA    1459 non-null   uint8  
 13  Condition1_PosN    1459 non-null   uint8  
 14  Condition1_RRAe    1459 non-null   uint8  
 15  Condition1_RRAn    1459 non-null   uint8  
 16  Condition1_RRNe    1459 

In [412]:
# Filling missing values in columns with their respective mode (most frequent value)
cols = ['BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond']
ds_train[cols] = ds_train[cols].fillna(ds_train.mode().iloc[0])
ds_test[cols] = ds_test[cols].fillna(ds_test.mode().iloc[0])

In [413]:
ds_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Id                 1460 non-null   int64  
 1   ExterQual          1460 non-null   int64  
 2   ExterCond          1460 non-null   int64  
 3   BsmtQual           1460 non-null   float64
 4   BsmtCond           1460 non-null   float64
 5   HeatingQC          1460 non-null   int64  
 6   KitchenQual        1460 non-null   int64  
 7   GarageQual         1460 non-null   float64
 8   GarageCond         1460 non-null   float64
 9   Condition1_Artery  1460 non-null   uint8  
 10  Condition1_Feedr   1460 non-null   uint8  
 11  Condition1_Norm    1460 non-null   uint8  
 12  Condition1_PosA    1460 non-null   uint8  
 13  Condition1_PosN    1460 non-null   uint8  
 14  Condition1_RRAe    1460 non-null   uint8  
 15  Condition1_RRAn    1460 non-null   uint8  
 16  Condition1_RRNe    1460 

In [414]:
ds_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Id                 1459 non-null   int64  
 1   ExterQual          1459 non-null   int64  
 2   ExterCond          1459 non-null   int64  
 3   BsmtQual           1459 non-null   float64
 4   BsmtCond           1459 non-null   float64
 5   HeatingQC          1459 non-null   int64  
 6   KitchenQual        1459 non-null   float64
 7   GarageQual         1459 non-null   float64
 8   GarageCond         1459 non-null   float64
 9   Condition1_Artery  1459 non-null   uint8  
 10  Condition1_Feedr   1459 non-null   uint8  
 11  Condition1_Norm    1459 non-null   uint8  
 12  Condition1_PosA    1459 non-null   uint8  
 13  Condition1_PosN    1459 non-null   uint8  
 14  Condition1_RRAe    1459 non-null   uint8  
 15  Condition1_RRAn    1459 non-null   uint8  
 16  Condition1_RRNe    1459 

# Training the machine models

In [415]:
# Splitting between independent variables (X) and dependent variables (y)
X_train = ds_train.iloc[:, 1:-1].values
X_test = ds_test.iloc[:, 1:].values
y_train = ds_train.iloc[:, -1].values

In [416]:
print(y_train)

[208500 181500 223500 ... 266500 142125 147500]


## Decision tree

In [417]:
# Training the model, predicting the results
# This model has the highest score, 70% is still pretty low,
# I aim to improve it in further attempts
from sklearn.tree import DecisionTreeRegressor
tree_regressor = DecisionTreeRegressor(random_state=0)
tree_regressor.fit(X_train, y_train)
y_pred = tree_regressor.predict(X_test)

In [418]:
# Evaluating the model
print(tree_regressor.score(X_train, y_train))

0.7040935769880001


## Multiple linear regression

In [419]:
# Training the model
from sklearn.linear_model import LinearRegression
multi_regressor = LinearRegression()
multi_regressor.fit(X_train, y_train)

LinearRegression()

In [420]:
# Evaluating the model
print(multi_regressor.score(X_train, y_train))

0.5844671951061962


## Random forest

In [421]:
# Training the model
from sklearn.ensemble import RandomForestRegressor
forest_regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
forest_regressor.fit(X_train, y_train)

RandomForestRegressor(random_state=0)

In [422]:
# Evaluating the model
print(forest_regressor.score(X_train, y_train))

0.6921723296816802


# Generating the output

In [423]:
submission = pd.DataFrame({
    "Id": id_test,
    "SalePrice": y_pred
    })
# submission.to_csv('../output/submission.csv', index=False)