In [4]:
# Importing necessary libraries
import numpy as np
import pandas as pd

# Load the housing dataset directly from its GitHub source and preview the first rows
housing = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/Kaushik-Varma/mlr/main/Housing.csv"
)
housing.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [5]:
# Checking for null values and column dtypes.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line
# after the report.
housing.info()

# Summary statistics, used as a first look for outliers
print(housing.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB
None
              price          area    bedrooms   bathrooms     stories  \
count  5.450000e+02    545.000000  545.000000  545.000000  545.000000   
mea

If we look at the dataset, some columns are numeric while others hold the values ‘yes’ or ‘no’. To fit a regression line every feature must be numeric, so we’ll convert ‘yes’ and ‘no’ to 1 and 0.

In [6]:
# Yes/no columns that have to be turned into numeric 1/0 indicators
varlist = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
]

# Defining the map function
def binary_map(x):
    """Encode a yes/no Series as 1/0.

    Parameters
    ----------
    x : pandas.Series
        Column holding the strings 'yes' / 'no', or values that are already
        encoded as 1 / 0.

    Returns
    -------
    pandas.Series
        Series with 'yes' -> 1 and 'no' -> 0. Any value that is not one of
        'yes', 'no', 1 or 0 becomes NaN.
    """
    # 1 and 0 map to themselves so that applying the function to a column that
    # was already encoded (e.g. when the cell is re-run) is a no-op instead of
    # turning every value into NaN.
    return x.map({'yes': 1, 'no': 0, 1: 1, 0: 0})

# Encode every yes/no column of the housing DataFrame, one column at a time
housing[varlist] = housing[varlist].apply(binary_map, axis=0)

# Display the housing DataFrame to confirm the encoding
housing

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,1,0,1,0,0,2,0,unfurnished
541,1767150,2400,3,1,1,0,0,0,0,0,0,0,semi-furnished
542,1750000,3620,2,1,1,1,0,0,0,0,0,0,unfurnished
543,1750000,2910,3,1,1,0,0,0,0,0,0,0,furnished


The furnishingstatus column has three levels: furnished, semi-furnished, and unfurnished.

We need to convert this column into numerical as well. To do that, we’ll use dummy variables.

When a categorical variable has n levels, we only need to build ‘n-1’ dummy variables: the omitted level acts as the baseline and is identified by all of the dummies being 0, so an n-th column would carry no additional information.

We can create a dummy variable using the get_dummies method in pandas.

In [7]:
# Creating dummy variables: one indicator column per furnishing level.
# dtype=int is passed explicitly because pandas >= 2.0 returns bool
# (True/False) indicators by default; this keeps the columns as 0/1.
status = pd.get_dummies(housing['furnishingstatus'], dtype=int)

# Check what the dataset 'status' looks like
status

Unnamed: 0,furnished,semi-furnished,unfurnished
0,1,0,0
1,1,0,0
2,0,1,0
3,1,0,0
4,1,0,0
...,...,...,...
540,0,0,1
541,0,1,0
542,0,0,1
543,1,0,0


In [8]:
# Replace 'furnishingstatus' with n - 1 = 2 indicator columns; drop_first
# removes the first level ('furnished'), which becomes the baseline.
# The guard makes the cell safe to re-run: once the text column has been
# replaced, executing the cell again is a no-op instead of raising KeyError.
if 'furnishingstatus' in housing.columns:
    # dtype=int keeps 0/1 indicators on pandas >= 2.0, where the default is bool
    status = pd.get_dummies(housing['furnishingstatus'], drop_first=True, dtype=int)

    # Swap the original text column for its indicator columns (no in-place mutation)
    housing = pd.concat([housing.drop(columns=['furnishingstatus']), status], axis=1)

housing

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,1,0,1,0,0,2,0,0,1
541,1767150,2400,3,1,1,0,0,0,0,0,0,0,1,0
542,1750000,3620,2,1,1,1,0,0,0,0,0,0,0,1
543,1750000,2910,3,1,1,0,0,0,0,0,0,0,0,0


In [19]:
# Preview the first five rows of housing after the categorical columns were encoded
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0


Re-scaling the Features
We can see that most columns hold small integer values, whereas area and price are much larger. So it is important to re-scale the variables so that they all have a comparable scale. If the features are on very different scales, the regression coefficients will differ widely in magnitude, which makes them hard to compare with one another.

To do that, we use the MinMax scaling method.



In [20]:
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale; the yes/no and dummy columns are left out because
# they are already 0/1.
# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/test split performed later in the notebook, so test rows influence
# the scaling. Consider fitting on the training rows only.
num_vars = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price']

# Learn each column's min/max, then overwrite the columns with the scaled values
scaler = MinMaxScaler()
scaler.fit(housing[num_vars])
housing[num_vars] = scaler.transform(housing[num_vars])

housing

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,1.000000,0.396564,0.6,0.333333,0.666667,1,0,0,0,1,0.666667,1,0,0
1,0.909091,0.502405,0.6,1.000000,1.000000,1,0,0,0,1,1.000000,0,0,0
2,0.909091,0.571134,0.4,0.333333,0.333333,1,0,1,0,0,0.666667,1,1,0
3,0.906061,0.402062,0.6,0.333333,0.333333,1,0,1,0,1,1.000000,1,0,0
4,0.836364,0.396564,0.6,0.000000,0.333333,1,1,1,0,1,0.666667,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,0.006061,0.092784,0.2,0.000000,0.000000,1,0,1,0,0,0.666667,0,0,1
541,0.001485,0.051546,0.4,0.000000,0.000000,0,0,0,0,0,0.000000,0,1,0
542,0.000000,0.135395,0.2,0.000000,0.000000,1,0,0,0,0,0.000000,0,0,1
543,0.000000,0.086598,0.4,0.000000,0.000000,0,0,0,0,0,0.000000,0,0,0


In [26]:
# Separate the target ('price') from the predictor columns
y = housing['price']
X = housing.drop(columns=['price'])

In [None]:
# Show the feature matrix (every column of housing except 'price')
X

In [28]:
from sklearn.model_selection import train_test_split

# NumPy's global generator is seeded here as well; the split itself is made
# repeatable by random_state below, so the train and test sets always contain
# the same rows.
np.random.seed(0)

# 70/30 train/test partition of the features and the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

In [29]:
# Inspect the training features produced by the split
X_train

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
359,0.134021,0.4,0.000000,0.000000,1,0,0,0,0,0.333333,0,0,1
19,0.327835,0.4,0.333333,0.333333,1,0,0,0,1,0.333333,1,1,0
159,0.103093,0.4,0.333333,0.000000,1,1,1,0,1,0.000000,0,0,0
35,0.367698,0.4,0.333333,1.000000,1,0,0,0,1,0.666667,0,0,0
28,0.432990,0.8,0.333333,0.333333,1,0,1,1,0,0.666667,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
526,0.105155,0.2,0.000000,0.000000,1,0,0,0,0,0.000000,0,0,1
53,0.240550,0.4,0.333333,1.000000,1,0,0,0,1,0.666667,0,1,0
350,0.121649,0.2,0.000000,0.333333,1,0,0,1,0,0.333333,0,1,0
79,0.298969,0.4,0.333333,0.666667,1,1,0,0,1,0.000000,0,0,0


In [30]:
# Inspect the training target (scaled price) produced by the split
y_train

359    0.169697
19     0.615152
159    0.321212
35     0.548133
28     0.575758
         ...   
526    0.048485
53     0.484848
350    0.175758
79     0.424242
520    0.060606
Name: price, Length: 381, dtype: float64

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [32]:
# Linear regression estimator from scikit-learn, left at its default settings
model = LinearRegression()

In [33]:
# Fit the regression on the training rows only
model.fit(X=X_train, y=y_train)

In [34]:
# Predict the (scaled) price for the held-out test rows
y_pred = model.predict(X=X_test)

In [35]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

Mean Squared Error (MSE): The MSE measures the average squared difference between the predicted values and the actual values. It gives an indication of how close the predictions are to the true values. A lower MSE value indicates better model performance, with a value of 0 indicating a perfect fit. Note that price was min-max scaled above, so the MSE reported here is in scaled units rather than in the original price units.

R-squared (R2): The R-squared value measures the proportion of the variance in the dependent variable that is explained by the independent variables. It is at most 1, where 1 indicates a perfect fit in which all the variance is explained, and 0 indicates that the model explains none of the variance (it does no better than always predicting the mean). On held-out test data it can even be negative, which means the model performs worse than predicting the mean. A higher R-squared value indicates better model performance.

In [38]:
# Report the evaluation metrics first, then the fitted coefficients and intercept
report = (
    ("Mean Squared Error:", mse),
    ("R-squared:", r2),
    ("Coefficients:", model.coef_),
    ("Intercept:", model.intercept_),
)
for label, value in report:
    print(label, value)

Mean Squared Error: 0.009624780513466769
R-squared: 0.6729582743459918
Coefficients: [ 0.30045358  0.04673453  0.28623478  0.10851563  0.05044144  0.03042826
  0.02159488  0.08486327  0.06688093  0.06073533  0.05942788  0.00092052
 -0.03100561]
Intercept: 0.016191992375918035
