# Predictive Modelling using MLR

BACKGROUND: 

The modelling dataset records the selling price of each house (in million Rs.), together with its carpet area (in square feet), its distance from the nearest metro station, and the number of schools within 2 km.

Step 1: Import House Price Data. Check the structure of the data

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy as sp
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import statsmodels.stats.proportion as ssp
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from statsmodels.stats.outliers_influence import variance_inflation_factor
import patsy
from math import sqrt
import os
# Work from the raw-data folder so the source CSV can be read by bare filename.
# The folder can be overridden with the FPM_RAW_DATA_DIR environment variable;
# when it is unset, the original hard-coded location is used.
os.chdir(os.environ.get("FPM_RAW_DATA_DIR", r"C:\Users\willi\GitHub\FPM_Assignment_PY\data\raw"))

In [2]:
# Display configuration for pandas output:
#   - max_columns = None -> never truncate the column list
#   - precision = 2      -> show floats with two decimal places
pd.options.display.max_columns = None
pd.options.display.precision = 2

Load Dataset

In [3]:
# Read the raw house-price CSV from the current working directory,
# report its dimensions, and preview the first five rows.
hse_price = pd.read_csv(filepath_or_buffer="House Price Data.csv")
print("✅ Dataset Loaded Successfully")
n_rows_cols = hse_price.shape
print("Data Shape:", n_rows_cols)
hse_price.head(n=5)

✅ Dataset Loaded Successfully
Data Shape: (198, 5)


Unnamed: 0,Houseid,Price,Area,Distance,Schools
0,1,24.74,1036,3.22,2
1,2,20.15,1030,4.33,3
2,3,25.98,1046,1.94,3
3,4,20.1,950,2.45,2
4,5,23.03,952,2.47,2


Clean the Data

In [4]:
# Drop unnecessary columns: Houseid is only a row identifier and is not
# required for analysis.
# errors='ignore' makes this cell safe to re-run: once the column is gone a
# second execution is a no-op instead of raising KeyError. Rebinding the name
# (rather than inplace=True) avoids mutating the frame loaded earlier.
hse_price = hse_price.drop(columns=['Houseid'], errors='ignore')
print("✅ Unnecessary Columns Dropped")

✅ Unnecessary Columns Dropped


In [8]:
# Count the missing entries in each column and print the per-column totals.
print(hse_price.isna().sum())
print("✅ Missing Values Checked")

price       0
area        0
distance    0
schools     0
dtype: int64
✅ Missing Values Checked


In [5]:
# Normalise the column labels to lower case.
lowercased_labels = hse_price.columns.str.lower()
hse_price.columns = lowercased_labels
print("✅ Column Names Lowercased")

✅ Column Names Lowercased


In [6]:
# Preview the first ten rows of the frame.
hse_price.head(n=10)

Unnamed: 0,price,area,distance,schools
0,24.74,1036,3.22,2
1,20.15,1030,4.33,3
2,25.98,1046,1.94,3
3,20.1,950,2.45,2
4,23.03,952,2.47,2
5,21.02,967,3.64,2
6,17.44,825,1.49,2
7,31.77,1162,2.26,3
8,27.6,1066,1.93,3
9,27.16,1084,1.47,2


In [9]:
# Save cleaned data to the processed folder (index column omitted).
# The folder can be overridden with the FPM_PROCESSED_DATA_DIR environment
# variable; when it is unset, the original hard-coded location is used.
processed_dir = os.environ.get(
    "FPM_PROCESSED_DATA_DIR",
    r"C:\Users\willi\GitHub\FPM_Assignment_PY\data\processed",
)
hse_price.to_csv(os.path.join(processed_dir, "hse_price_cleaned.csv"), index=False)
print("✅ Cleaned Data Saved to Processed Folder")

✅ Cleaned Data Saved to Processed Folder


Step 2: Split the data into Training (80%) and Testing (20%) data sets


In [10]:
# Import Libraries
import pandas as pd
import numpy as np  
from sklearn.model_selection import train_test_split
import os
# Work from the processed-data folder so files can be read and written by
# bare filename. The folder can be overridden with the FPM_PROCESSED_DATA_DIR
# environment variable; when it is unset, the original hard-coded location is used.
os.chdir(os.environ.get("FPM_PROCESSED_DATA_DIR", r"C:\Users\willi\GitHub\FPM_Assignment_PY\data\processed"))


In [11]:
# Read the cleaned CSV from the current working directory and preview
# the first ten rows.
hse_prices_cleaned = pd.read_csv(filepath_or_buffer="hse_price_cleaned.csv")
print("✅ Dataset Loaded Successfully")
hse_prices_cleaned.head(n=10)

✅ Dataset Loaded Successfully


Unnamed: 0,price,area,distance,schools
0,24.74,1036,3.22,2
1,20.15,1030,4.33,3
2,25.98,1046,1.94,3
3,20.1,950,2.45,2
4,23.03,952,2.47,2
5,21.02,967,3.64,2
6,17.44,825,1.49,2
7,31.77,1162,2.26,3
8,27.6,1066,1.93,3
9,27.16,1084,1.47,2


In [12]:
# Separate the target from the predictors before splitting:
#   y -> the 'price' column
#   X -> every remaining column
y = hse_prices_cleaned['price']
X = hse_prices_cleaned.drop('price', axis=1)

In [13]:
# Hold out 20% of the rows for testing and keep 80% for training.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [14]:
# Report the (rows, columns) of the training and testing predictor frames.
for label, frame in (("X_train shape:", X_train), ("X_test shape:", X_test)):
    print(label, frame.shape)

X_train shape: (158, 3)
X_test shape: (40, 3)


In [15]:
# Write each split to its own CSV in the current working directory,
# omitting the index column.
split_files = {
    "X_train.csv": X_train,
    "X_test.csv": X_test,
    "y_train.csv": y_train,
    "y_test.csv": y_test,
}
for filename, split in split_files.items():
    split.to_csv(filename, index=False)
print("✅ Data Splits Saved to: data/processed")



✅ Data Splits Saved to: data/processed
