## Import packages

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
# Kaggle input files
TRAIN_CSV_PATH = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
TEST_CSV_PATH = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"

# Seed for the randomised steps (used by the train/validation split)
RANDOM_STATE = 12

## Data preprocessing
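In this section, we apply the following preprocessing steps to both the training and the test datasets:
1. Removing irrelevant columns such as mostly-null columns, IDs, etc.
2. Dealing with categorical data: for now, non-numeric columns are simply dropped and only the numeric (`int64`/`float64`) features are kept; one-hot encoding is not implemented yet.
3. Filling the remaining missing numeric values with 0.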

In [3]:
# Load the raw training data and list its columns, non-null counts and dtypes
df = pd.read_csv(TRAIN_CSV_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [5]:
def preprocess_dataset(dataset):
    """Return a numeric-only copy of ``dataset`` with missing values set to 0.

    Steps:
      1. Drop the columns the notebook treats as irrelevant (``Id`` and the
         columns listed in ``drop_columns``).
      2. Keep only ``int64`` and ``float64`` columns; non-numeric columns are
         discarded, not encoded.
      3. Replace every remaining NaN with 0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame.
    """
    drop_columns = ['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature']
    # errors='ignore' lets the function run on frames that lack some of these
    # columns, which also makes it safe to re-apply to its own output.
    dataset = dataset.drop(columns=drop_columns, errors='ignore')

    # Keep the numeric (int64 / float64) columns only
    dataset = dataset.select_dtypes(include=['int64', 'float64'])
    return dataset.fillna(0)

In [6]:
# Training split: preprocess and report the resulting shape
train_df = preprocess_dataset(dataset=df)
print(train_df.shape)

# Test split: read from disk, run through the same preprocessing, report shape
raw_test_df = pd.read_csv(TEST_CSV_PATH)
test_df = preprocess_dataset(dataset=raw_test_df)
print(test_df.shape)

(1460, 37)
(1459, 36)


In [7]:
# Count the missing values left in each column after preprocessing
train_df.isna().sum()

MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
SalePrice        0
dtype: int64

## Partitioning the dataset

In [8]:
# Target vector first, then the feature matrix (every column except the target)
Y = train_df['SalePrice']
X = train_df.drop(columns=['SalePrice'])

In [9]:
# 80/20 hold-out split, seeded for reproducibility
split_parts = train_test_split(X, Y, test_size=0.2, random_state=RANDOM_STATE)
X_train, X_test, Y_train, Y_test = split_parts
tuple(part.shape for part in split_parts)

((1168, 36), (292, 36), (1168,), (292,))

## Model training

In [10]:
# Fit an ordinary least-squares model on the training split
lr_model = LinearRegression()
lr_model.fit(X_train, Y_train)

# Predict on the held-out split
predictions = lr_model.predict(X_test)

In [11]:
# Coefficient of determination on the held-out split
regression_r2 = r2_score(y_true=Y_test, y_pred=predictions)
print(f"Regression model R-squared: {regression_r2}")

Regression model R-squared: 0.7859425482749017


## Predicting the test set

In [12]:
# Reload the raw test set so that the 'Id' column is available for the submission
test_df = pd.read_csv(TEST_CSV_PATH)

Create a dataframe for the submission

In [13]:
# Take an explicit copy: this frame gets a new column later, and writing to a
# slice of test_df would raise SettingWithCopyWarning.
submission_df = test_df[['Id']].copy()

Preprocess and get model outputs on test_df

In [14]:
# Keep the preprocessed features under their own name so the raw test_df
# (including 'Id') stays intact and this cell can be re-run safely.
test_features = preprocess_dataset(test_df)
test_predictions = lr_model.predict(test_features)

## Create submission

In [15]:
# assign() returns a new frame instead of writing into what may be a slice of
# test_df, which avoids SettingWithCopyWarning; 'Id' then becomes the index.
submission_df = (
    submission_df
    .assign(SalePrice=test_predictions)
    .set_index('Id')
)

In [16]:
# DataFrame.to_csv returns None when given a path, so the result is not bound to a name.
submission_df.to_csv("house_prices_regression_predictions_lr_model.csv", encoding='utf-8')