In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch as torch


In [2]:
# Import data

# Read the housing dataset; the path is resolved relative to the working directory.
df = pd.read_csv('Housing.csv')

# Show the first five rows, then the frame's dimensions on the following line.
print(
    df.head(5),
    f'Shape of the data (rows, columns): {df.shape}',
    sep='\n',
)

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  
Shape of the data (rows, columns): (545, 13)


In [4]:
# Data preprocessing

# Report the number of null entries found in each column
print("Missing values in the data: ", df.isnull().sum(), sep="\n")

# Lookup table used to turn the text categories into integer codes
mapping = {
    # yes/no flag columns
    'yes': 1,
    'no': 0,
    # furnishingstatus labels
    'furnished': 2,
    'semi-furnished': 1,
    'unfurnished': 0,
}

def map_value(value, mapping):
    """Translate ``value`` through ``mapping``, passing unknown values through.

    Args:
        value: The item to look up.
        mapping: Container supporting ``in`` membership tests and ``[]``
            lookups (e.g. a dict).

    Returns:
        ``mapping[value]`` when ``value`` is found in ``mapping``; otherwise
        ``value`` itself, unchanged.
    """
    return mapping[value] if value in mapping else value
    
# Pass every column through map_value element by element: entries that appear
# in `mapping` are replaced by their code, everything else is kept as-is.
for col in df.columns:
    df[col] = df[col].apply(map_value, args=(mapping,))

print(df.head())


Missing values in the data: 
price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64
      price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0       

In [21]:
# Split data into train and test sets
from sklearn.model_selection import train_test_split

# Separate the target column (price) from the remaining feature columns
y = df['price'].values
X = df.drop(columns='price').values

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# X_train = torch.FloatTensor(X_train)
# X_test = torch.FloatTensor(X_test)
# y_train = torch.FloatTensor(y_train)
# y_test = torch.FloatTensor(y_test)

# Normalize the inputs
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits (the test rows never influence the fit).
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Example of normalized data, followed by the shape of each split
print(
    X_train[:2],
    f'Shape of training set: {X_train.shape}',
    f'Shape of test set: {X_test.shape}',
    sep='\n',
)


[[-0.51918156 -1.28450955 -0.56091916 -0.93244464  0.39946773 -0.45946829
  -0.72420682 -0.20751434 -0.68050514 -0.80209018 -0.54908336 -1.21435189]
 [-0.72191355 -1.28450955  1.47708713  0.21599     0.39946773 -0.45946829
   1.38082101 -0.20751434 -0.68050514  0.34450431 -0.54908336  1.41976465]]
Shape of training set: (436, 12)
Shape of test set: (109, 12)


In [25]:
from sklearn.linear_model import LinearRegression
import random

# Fit a linear regression on the scaled training features
model = LinearRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

# Show a few examples of predictions vs actual values
for i in range(10):
    # randrange excludes its upper bound, so the index is always valid.
    # random.randint(0, n) can return n itself, which would raise IndexError.
    index = random.randrange(len(predictions))
    predicted, target = predictions[index], y_test[index]
    difference = predicted - target
    print(f"""Predicted: {predicted:.0f}, Target: {target:.0f}, Difference: {difference:.0f} Percent Diff: {difference / target * 100:.0f}%""")

# Root-mean-squared error over the whole test set
rmse = np.sqrt(np.mean((predictions - y_test) ** 2))
print(f'\nRMSE: {rmse:.0f}')

Predicted: 5330341, Target: 5040000, Difference: 290341 Percent Diff: 6%
Predicted: 7031588, Target: 5390000, Difference: 1641588 Percent Diff: 30%
Predicted: 2975532, Target: 3290000, Difference: -314468 Percent Diff: -10%
Predicted: 3098153, Target: 4410000, Difference: -1311847 Percent Diff: -30%
Predicted: 6753658, Target: 6930000, Difference: -176342 Percent Diff: -3%
Predicted: 4065437, Target: 3500000, Difference: 565437 Percent Diff: 16%
Predicted: 2740867, Target: 2450000, Difference: 290867 Percent Diff: 12%
Predicted: 3537955, Target: 3234000, Difference: 303955 Percent Diff: 9%
Predicted: 2373088, Target: 3255000, Difference: -881912 Percent Diff: -27%
Predicted: 5618183, Target: 5460000, Difference: 158183 Percent Diff: 3%

RMSE: 1105985
