In [114]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings ('ignore')

In [115]:
train_file ='D:\jupyter notebook\housing_train.csv'
test_file ='D:\jupyter notebook\housing_test.csv'
ld_train=pd.read_csv(train_file) #historical data
ld_pridict=pd.read_csv(test_file) # pridiction data

In [116]:
ld_train.shape

(7536, 16)

In [117]:
ld_train.head(10)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
0,Brunswick,52 Evans St,3,h,1650000,S,Nelson,5.2,3056,3.0,1.0,2.0,495.0,141.0,1920.0,Moreland
1,Reservoir,85 Radford Rd,5,h,791000,S,Ray,11.2,3073,4.0,3.0,1.0,961.0,,,Darebin
2,Newport,99 Anderson St,3,h,785000,S,RT,8.4,3015,3.0,1.0,1.0,185.0,,,Hobsons Bay
3,Brighton East,4/377 South Rd,2,u,755000,SP,Buxton,10.7,3187,,,,,,,
4,Hawthorn East,3 Jaques St,5,h,2500000,VB,RT,7.5,3123,5.0,3.0,3.0,757.0,240.0,1925.0,Boroondara
5,Hawthorn East,75 Leura Gr,3,h,3020000,S,Hooper,7.5,3123,3.0,2.0,2.0,832.0,,,Boroondara
6,Bentleigh East,4 Adrian St,3,h,780000,VB,hockingstuart,13.9,3165,3.0,1.0,1.0,710.0,,1966.0,Glen Eira
7,Surrey Hills,47 Suffolk Rd,3,h,2200000,VB,Fletchers,11.2,3127,3.0,2.0,4.0,816.0,,,Boroondara
8,Keilor East,37a Heather Av,4,h,760000,PI,Nelson,12.8,3033,,,,,,,
9,Mont Albert,50/781 Whitehorse Rd,2,u,750000,S,RW,11.8,3127,2.0,2.0,2.0,0.0,80.0,2003.0,Whitehorse


In [118]:
ld_train.describe

<bound method NDFrame.describe of               Suburb            Address  Rooms Type    Price Method  \
0          Brunswick        52 Evans St      3    h  1650000      S   
1          Reservoir      85 Radford Rd      5    h   791000      S   
2            Newport     99 Anderson St      3    h   785000      S   
3      Brighton East     4/377 South Rd      2    u   755000     SP   
4      Hawthorn East        3 Jaques St      5    h  2500000     VB   
...              ...                ...    ...  ...      ...    ...   
7531       Footscray   202/51 Gordon St      1    u    85000     PI   
7532        Oak Park      63 Vincent St      3    h   900000     SP   
7533      Camberwell         5 Gowar Av      4    h  2285000     PI   
7534        Richmond     29a Abinger St      3    h  1185000      S   
7535  Brunswick West  12/82 Hopetoun Av      2    u   400000     VB   

            SellerG  Distance  Postcode  Bedroom2  Bathroom  Car  Landsize  \
0            Nelson       5.2      

In [119]:
ld_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7536 entries, 0 to 7535
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Suburb        7536 non-null   object 
 1   Address       7536 non-null   object 
 2   Rooms         7536 non-null   int64  
 3   Type          7536 non-null   object 
 4   Price         7536 non-null   int64  
 5   Method        7536 non-null   object 
 6   SellerG       7536 non-null   object 
 7   Distance      7536 non-null   float64
 8   Postcode      7536 non-null   int64  
 9   Bedroom2      5977 non-null   float64
 10  Bathroom      5977 non-null   float64
 11  Car           5977 non-null   float64
 12  Landsize      5972 non-null   float64
 13  BuildingArea  3327 non-null   float64
 14  YearBuilt     3819 non-null   float64
 15  CouncilArea   5972 non-null   object 
dtypes: float64(7), int64(3), object(6)
memory usage: 942.1+ KB


In [120]:
# Get all numeric columns
num_vars = list(ld_train.select_dtypes(exclude=['object']).columns)
num_vars

['Rooms',
 'Price',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt']

In [121]:
# Get all categorcial columns
cat_vars = list(ld_train.select_dtypes(include=['object']).columns)
cat_vars

['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'CouncilArea']

# Feature engineering

In [122]:
#  Property Age
ld_train['Property_Age'] = 2025 - ld_train['YearBuilt']


In [123]:
# Total Rooms (Alternative to Bedroom2)
ld_train['Total_Rooms'] = ld_train['Rooms'] + ld_train['Bedroom2'] + ld_train['Bathroom']


In [124]:
# Rooms per Square Meter
ld_train['Rooms_per_sqm'] = ld_train['Total_Rooms'] / (ld_train['BuildingArea'] + 1)  # Avoid division by zero


In [125]:
#  Land Efficiency Ratio
ld_train['Land_Usage'] = ld_train['BuildingArea'] / (ld_train['Landsize'] + 1)  # Avoid division by zero


In [126]:
# Get all numeric columns
num_vars = list(ld_train.select_dtypes(exclude=['object']).columns)
num_vars

['Rooms',
 'Price',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'Property_Age',
 'Total_Rooms',
 'Rooms_per_sqm',
 'Land_Usage']

In [127]:
# Get all categorcial columns
cat_vars = list(ld_train.select_dtypes(include=['object']).columns)
cat_vars

['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'CouncilArea']

# Train-Test Split

In [128]:
# First get the dependent/ target variable
target = ld_train['Price']

In [129]:
#Perform an 80-20 split
ld_train, ld_test, y_train, y_test = train_test_split(ld_train, target, test_size=0.2, random_state=42)

In [130]:
print(ld_train.shape)
print(y_train.shape)
print(ld_test.shape)
print(y_test.shape)

(6028, 20)
(6028,)
(1508, 20)
(1508,)


In [131]:
# Find the category count for suburb & SellerG & Method &Type & CouncilArea column
k = ld_train["Address"].value_counts()
k #
ld_train["Address"].nunique()

5981

In [132]:
# Check how many categories are less than some predetermined frequency check (lets say 20)
freq_cutoff = 20
print(k<=freq_cutoff)
print("==========================")
print("Categories with low frequency: ", (k<=freq_cutoff).sum()) # categories less than 20

Address
5 Margaret St     True
70 Park St        True
14 Latrobe St     True
9/99 Barton St    True
443 Punt Rd       True
                  ... 
3 Monomeath Pl    True
53 Ryan St        True
8 Bowmore St      True
56 Rose St        True
9 Ernst St        True
Name: count, Length: 5981, dtype: bool
Categories with low frequency:  5981


In [133]:
#from the aadress i am just extracting street name
# Extract only the street name (everything between number and street type)
ld_train['StreetName'] = ld_train['Address'].str.extract(r'\d+(?:/\d+)?\s+([\w\s]+)')
print(ld_train.head(10)
)

            Suburb               Address  Rooms Type    Price Method  \
380   Balwyn North           9 Gildan St      3    h  2000000     VB   
5653  Altona North        10 Freemans St      3    h   750000     VB   
2273   South Yarra      22/382 Toorak Rd      2    u   601000      S   
2098     Melbourne   702/598 St Kilda Rd      3    u  1525000      S   
599         Altona         219 Civic Pde      3    h   740000      S   
45        St Kilda          9 Octavia St      2    h  1100000     VB   
4823     Fairfield  20/262 Heidelberg Rd      2    u   602000      S   
4801       Ivanhoe           32 Ailsa Gr      3    h  1850000      S   
5700       Glenroy        2/67 Morell St      3    t   550000     SP   
4336     Braybrook        20 Balmoral St      3    h   730000      S   

       SellerG  Distance  Postcode  Bedroom2  ...  Car  Landsize  \
380        Kay       9.2      3104       3.0  ...  2.0   75100.0   
5653      Greg      11.1      3025       NaN  ...  NaN       NaN   
227

In [134]:
ld_train.drop(columns=['Address'], inplace=True) #inplace true means drop it in main DF or false means it gives

In [135]:
pd.to_numeric(ld_train["Suburb"], errors = "coerce").isnull().sum()

6028

# Applying pipelines

In [136]:
for col in ld_train.columns:
    print(f"Column: {col}")
    print(ld_train[col].apply(type).value_counts())
    print("-" * 40)
#CouncilArea and StreetName dontains both str and float so
#in missing value imputer codes we have to modify it acording to use it for both

Column: Suburb
Suburb
<class 'str'>    6028
Name: count, dtype: int64
----------------------------------------
Column: Rooms
Rooms
<class 'int'>    6028
Name: count, dtype: int64
----------------------------------------
Column: Type
Type
<class 'str'>    6028
Name: count, dtype: int64
----------------------------------------
Column: Price
Price
<class 'int'>    6028
Name: count, dtype: int64
----------------------------------------
Column: Method
Method
<class 'str'>    6028
Name: count, dtype: int64
----------------------------------------
Column: SellerG
SellerG
<class 'str'>    6028
Name: count, dtype: int64
----------------------------------------
Column: Distance
Distance
<class 'float'>    6028
Name: count, dtype: int64
----------------------------------------
Column: Postcode
Postcode
<class 'int'>    6028
Name: count, dtype: int64
----------------------------------------
Column: Bedroom2
Bedroom2
<class 'float'>    6028
Name: count, dtype: int64
--------------------------------

In [137]:
class missing_value_imputer1(BaseEstimator, TransformerMixin):

    def __init__(self):
        self.impute_dict = {}
        self.feature_names = []

    def fit(self, X, y=None):
        self.feature_names = X.columns
        
        for col in X.columns:
            # Convert object columns to strings before assigning 'missing'
            if X[col].dtype == 'O':
                X[col] = X[col].astype(str).fillna("missing")  # Ensure all values are strings
                self.impute_dict[col] = "missing"
            else:
                # Ensure only numeric data is used for median calculation
                self.impute_dict[col] = pd.to_numeric(X[col], errors="coerce").median()
                
        return self

    def transform(self, X):
        X = X.copy()
        for col in X.columns:
            if col in self.impute_dict:
                X[col] = X[col].fillna(self.impute_dict[col])
        return X

    def get_feature_names_out(self, input_features=None):
        return self.feature_names


In [138]:
class variable_selector(BaseEstimator, TransformerMixin):

    def __init__(self,feature_names):
        self.feature_names=feature_names # This will track new feature names AFTER dummy variable creation
  
    def fit(self, x, y = None):
        return self # When there is NOTHING to LEARN or NO OWN LOGIC to be implemented, return self is a standard practice
    
    def transform(self, X):
        """
        This will just "subset" all the variables in the df. So, no explicit learning applied here
        """
        X = X[self.feature_names]  
        return X
    
    def get_feature_names_out(self, input_features=None):
        return self.feature_names # This will just "return" all the variable names usee in this particular pipeline


In [139]:
class create_dummies(BaseEstimator, TransformerMixin):
    
    def __init__(self,freq_cutoff=0):
        self.freq_cutoff=freq_cutoff
        self.var_cat_dict={}
        self.feature_names=[] 
        
    def fit(self, x, y = None):
        """
        Create dummy variables for categories that are above a certain a freq_cutoff. 
        For all other categories less than freq_cutoff, NO dummy variables will be created
        
        We need to learn from the data which those "categories" are which will be dummy created, so that 
        in future data, we can dummy create those categories
        """
        data_cols=x.columns
        
        for col in data_cols:
            
            # Store the category counts in k
            k=x[col].value_counts()
            
            # If a categorical variable has no categories less than freq_cutoff (20)
            if (k<=self.freq_cutoff).sum()==0:
                cats=k.index[:-1] # Take every category but the last one to maintain n-1 dummies 
            else:
                # Select all the categories that are greater than freq_cutoff
                cats=k.index[k>self.freq_cutoff]
            
            # Store those selected categories (> freq_cutoff) in "var_cat_dict". This will "IGNORE" all categories less than freq_cutoff
            self.var_cat_dict[col]=cats
        
        # Create the new dummy variable's names by appending column name and the category name with an underscore in between
        for col in self.var_cat_dict.keys():
            for cat in self.var_cat_dict[col]:
                self.feature_names.append(col + '_' + cat)
        return self
    
    def transform(self, X):
        """
        Apply/ Create the SAME dummy variables ("LEARNT" from the past dataset) for NEW/ FUTURE dataset 
        """        
        dummy_data=X.copy() # .copy() method will create a copy of "X", but in a completely new memory location in the RAM.

        for col in self.var_cat_dict.keys():
            
            for cat in self.var_cat_dict[col]:
                name = col + '_' + cat
                dummy_data[name]=(dummy_data[col]==cat).astype(int) # This line of code creates the "dummy variables"
            del dummy_data[col]
        
        return dummy_data
    
    def get_feature_names_out(self, input_features=None):
        return self.feature_names


In [159]:
# Action needed is Impute with median (null value) for - Bedroom2 ,Bathroom ,Car ,Landsize ,BuildingArea,
# YearBuilt,Distance,Rooms,Postcode
# Pipeline 1: impute missing values with median
p3 = Pipeline([
    ('var_select', variable_selector(['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'Property_Age',
 'Total_Rooms',
 'Rooms_per_sqm',
 'Land_Usage'])),
    ('missing_trt',missing_value_imputer1())
])

In [160]:
### suburb & SellerG & Method &Type & CouncilArea,StreetName - ACTION NEEDED: create dummies, considering frequency cutoff # Pipeline 2: create dummies for these, considering frequency cutoff as 20 for numeric nd catagorical data
p2 = Pipeline([
('var_select',variable_selector(['Suburb', 'SellerG','Method','Type','StreetName','CouncilArea'])),
('missing_trt',missing_value_imputer1()),
('create_dummies',create_dummies (20))
])

In [161]:
# FeatureUnion concatenates results of multiple transformer objects
#all pipelines stiched together
data_pipe = FeatureUnion([
('obj_to_dummmy',p2),
('fill_missing',p3),
])

# Fit pipeline (Training or Learning)

In [162]:
# Learn from training data
# fit() method will invoke fit() from each task in each of the pipeline to "LEARN" from historical data
data_pipe.fit(ld_train)

In [163]:
len(data_pipe.get_feature_names_out())

177

In [164]:
data_pipe.get_feature_names_out()

array(['obj_to_dummmy__Suburb_Reservoir',
       'obj_to_dummmy__Suburb_Bentleigh East',
       'obj_to_dummmy__Suburb_Richmond', 'obj_to_dummmy__Suburb_St Kilda',
       'obj_to_dummmy__Suburb_Preston',
       'obj_to_dummmy__Suburb_South Yarra',
       'obj_to_dummmy__Suburb_Brunswick',
       'obj_to_dummmy__Suburb_Essendon', 'obj_to_dummmy__Suburb_Glenroy',
       'obj_to_dummmy__Suburb_Brighton', 'obj_to_dummmy__Suburb_Coburg',
       'obj_to_dummmy__Suburb_Glen Iris',
       'obj_to_dummmy__Suburb_Brighton East',
       'obj_to_dummmy__Suburb_Hawthorn',
       'obj_to_dummmy__Suburb_Port Melbourne',
       'obj_to_dummmy__Suburb_Pascoe Vale', 'obj_to_dummmy__Suburb_Kew',
       'obj_to_dummmy__Suburb_Northcote',
       'obj_to_dummmy__Suburb_Balwyn North',
       'obj_to_dummmy__Suburb_Bentleigh',
       'obj_to_dummmy__Suburb_Footscray',
       'obj_to_dummmy__Suburb_Moonee Ponds',
       'obj_to_dummmy__Suburb_Carnegie',
       'obj_to_dummmy__Suburb_Malvern East',
       'obj_

# Apply or Transform using built Pipeline

In [165]:
# Apply pipeline on training data

# transform() method will invoke transform() from each task in each of the pipeline to "APPLY" the learnings
x_train=pd.DataFrame(data=data_pipe.transform(ld_train),
                    columns=data_pipe.get_feature_names_out())

In [166]:
x_train

Unnamed: 0,obj_to_dummmy__Suburb_Reservoir,obj_to_dummmy__Suburb_Bentleigh East,obj_to_dummmy__Suburb_Richmond,obj_to_dummmy__Suburb_St Kilda,obj_to_dummmy__Suburb_Preston,obj_to_dummmy__Suburb_South Yarra,obj_to_dummmy__Suburb_Brunswick,obj_to_dummmy__Suburb_Essendon,obj_to_dummmy__Suburb_Glenroy,obj_to_dummmy__Suburb_Brighton,...,fill_missing__Bedroom2,fill_missing__Bathroom,fill_missing__Car,fill_missing__Landsize,fill_missing__BuildingArea,fill_missing__YearBuilt,fill_missing__Property_Age,fill_missing__Total_Rooms,fill_missing__Rooms_per_sqm,fill_missing__Land_Usage
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,2.0,75100.0,120.0,1965.0,60.0,7.0,0.056604,0.45977
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,1.0,326.0,120.0,1965.0,60.0,7.0,0.056604,0.45977
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,2.0,1.0,1.0,0.0,120.0,1970.0,55.0,5.0,0.056604,0.45977
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,1.0,326.0,120.0,1965.0,60.0,7.0,0.056604,0.45977
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,2.0,604.0,120.0,1965.0,60.0,7.0,0.056604,0.45977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,1.0,201.0,120.0,1965.0,60.0,5.0,0.056604,0.45977
6024,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,1.0,0.0,73.0,2000.0,25.0,5.0,0.067568,73.00000
6025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,1.0,326.0,120.0,1965.0,60.0,7.0,0.056604,0.45977
6026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,1.0,326.0,120.0,1965.0,60.0,7.0,0.056604,0.45977


In [167]:
print(ld_test.columns)  # Check all columns


Index(['Suburb', 'Rooms', 'Type', 'Price', 'Method', 'SellerG', 'Distance',
       'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea',
       'YearBuilt', 'CouncilArea', 'Property_Age', 'Total_Rooms',
       'Rooms_per_sqm', 'Land_Usage', 'StreetName'],
      dtype='object')


In [168]:
#from the aadress i am just extracting street name from test data

# Extract only the street name (everything between number and street type)
ld_test['StreetName'] = ld_test['Address'].str.extract(r'\d+(?:/\d+)?\s+([\w\s]+)')

print(ld_test.head(10)

)

KeyError: 'Address'

In [169]:
ld_test.drop(columns=['Address'], inplace=True) #inplace true means drop it in main DF or false means it gives new df

KeyError: "['Address'] not found in axis"

In [170]:
# Apply pipeline on testing data
x_test=pd.DataFrame(data=data_pipe.transform(ld_test),
                    columns=data_pipe.get_feature_names_out())

In [171]:
x_test

Unnamed: 0,obj_to_dummmy__Suburb_Reservoir,obj_to_dummmy__Suburb_Bentleigh East,obj_to_dummmy__Suburb_Richmond,obj_to_dummmy__Suburb_St Kilda,obj_to_dummmy__Suburb_Preston,obj_to_dummmy__Suburb_South Yarra,obj_to_dummmy__Suburb_Brunswick,obj_to_dummmy__Suburb_Essendon,obj_to_dummmy__Suburb_Glenroy,obj_to_dummmy__Suburb_Brighton,...,fill_missing__Bedroom2,fill_missing__Bathroom,fill_missing__Car,fill_missing__Landsize,fill_missing__BuildingArea,fill_missing__YearBuilt,fill_missing__Property_Age,fill_missing__Total_Rooms,fill_missing__Rooms_per_sqm,fill_missing__Land_Usage
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,1.0,898.0,120.0,1970.0,55.0,5.0,0.056604,0.459770
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,2.0,1.0,777.0,154.0,1950.0,75.0,8.0,0.051613,0.197943
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,1.0,0.0,120.0,1995.0,30.0,5.0,0.056604,0.459770
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,1.0,554.0,120.0,1965.0,60.0,7.0,0.056604,0.459770
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,1.0,213.0,92.0,1980.0,45.0,5.0,0.053763,0.429907
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,1.0,1.0,672.0,151.0,1970.0,55.0,9.0,0.059211,0.224368
1504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,2.0,2.0,260.0,120.0,1965.0,60.0,8.0,0.056604,0.459770
1505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,2.0,646.0,120.0,1965.0,60.0,7.0,0.056604,0.459770
1506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,1.0,178.0,102.0,2009.0,16.0,7.0,0.067961,0.569832


In [172]:
ld_pridict.shape

(1885, 19)

In [173]:
print(ld_pridict.columns)

Index(['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Distance', 'Postcode',
       'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt',
       'CouncilArea', 'StreetName', 'Property_Age', 'Total_Rooms',
       'Rooms_per_sqm', 'Land_Usage'],
      dtype='object')


In [174]:
#from the aadress i am just extracting street name from test data

# Extract only the street name (everything between number and street type)
ld_pridict['StreetName'] = ld_pridict['Address'].str.extract(r'\d+(?:/\d+)?\s+([\w\s]+)')

print(ld_pridict.head(10)

)

KeyError: 'Address'

In [156]:
ld_pridict.drop(columns=['Address'], inplace=True)

# feature engineering for predict data 

In [175]:
# Property Age
ld_pridict['Property_Age'] = 2025 - ld_pridict['YearBuilt']

# Total Rooms (Alternative to Bedroom2)
ld_pridict['Total_Rooms'] = ld_pridict['Rooms'] + ld_pridict['Bedroom2'] + ld_pridict['Bathroom']

# Rooms per Square Meter
ld_pridict['Rooms_per_sqm'] = ld_pridict['Total_Rooms'] / (ld_pridict['BuildingArea'] + 1)  # Avoid division by zero

# Land Efficiency Ratio
ld_pridict['Land_Usage'] = ld_pridict['BuildingArea'] / (ld_pridict['Landsize'] + 1)  # Avoid division by zero


In [176]:
# Apply pipeline on prediction data
x_predict=pd.DataFrame(data=data_pipe.transform(ld_pridict),
                    columns=data_pipe.get_feature_names_out())

In [177]:
x_predict

Unnamed: 0,obj_to_dummmy__Suburb_Reservoir,obj_to_dummmy__Suburb_Bentleigh East,obj_to_dummmy__Suburb_Richmond,obj_to_dummmy__Suburb_St Kilda,obj_to_dummmy__Suburb_Preston,obj_to_dummmy__Suburb_South Yarra,obj_to_dummmy__Suburb_Brunswick,obj_to_dummmy__Suburb_Essendon,obj_to_dummmy__Suburb_Glenroy,obj_to_dummmy__Suburb_Brighton,...,fill_missing__Bedroom2,fill_missing__Bathroom,fill_missing__Car,fill_missing__Landsize,fill_missing__BuildingArea,fill_missing__YearBuilt,fill_missing__Property_Age,fill_missing__Total_Rooms,fill_missing__Rooms_per_sqm,fill_missing__Land_Usage
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,120.0,1965.0,60.0,3.0,0.056604,0.459770
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,1.0,326.0,120.0,1965.0,60.0,7.0,0.056604,0.459770
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,1.0,326.0,120.0,1965.0,60.0,7.0,0.056604,0.459770
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,1.0,326.0,120.0,1965.0,60.0,7.0,0.056604,0.459770
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,1.0,138.0,105.0,1890.0,135.0,6.0,0.056604,0.755396
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,1.0,1.0,326.0,120.0,1965.0,60.0,7.0,0.056604,0.459770
1881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,2.0,2.0,554.0,129.0,1980.0,45.0,8.0,0.061538,0.232432
1882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,2.0,2.0,236.0,193.0,2004.0,21.0,8.0,0.041237,0.814346
1883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,1.0,0.0,65.0,1970.0,55.0,5.0,0.075758,65.000000


# Add Intercept/ Constant Column in Dataset

In [None]:
x_train = sm.add_constant(x_train)

In [None]:
x_train
# Should be able to see a "const" named column having all 1s in it to see the results is everything is zero

# Multicollinearity Check

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_data = pd.DataFrame()
vif_data["VIF"] = [variance_inflation_factor(x_train.values, i) for i in range(x_train.shape[1])]

In [None]:
vif_data["Feature"] = x_train.columns

In [None]:
vif_sorted = vif_data.sort_values("VIF", ascending = False)
vif_sorted

# Write a loop to automatically remove highly multicollinear features from data in an iterative manner

In [None]:
# Make a cop to remove the high VIF features
x_train_copy = x_train.copy()

In [None]:
# We can write a for loop to iteratively remove highly multicollinear features which are beyond a threshold of 5

threshold = 5
max_vif = vif_sorted.iloc[0,0]


while max_vif > threshold:
    vif_data = pd.DataFrame()
    vif_data["VIF"] = [variance_inflation_factor(x_train_copy.values, i) for i in range(x_train_copy.shape[1])]
    vif_data["Feature"] = x_train_copy.columns
    vif_sorted = vif_data.sort_values("VIF", ascending = False)
    
    max_vif = vif_sorted.iloc[0,0]
    feature_to_drop = vif_sorted.iloc[0,1]
    
    if max_vif > threshold:
        x_train_copy.drop(columns=[feature_to_drop], axis = 1, inplace = True)
        print(f"Removed {feature_to_drop} having VIF {max_vif}")

In [None]:
# Add the intercept back to the data as we should always include a intercept in the model means always have a constant in the data
X_with_intercept = sm.add_constant(x_train[x_train_copy.columns])

# Exploratory Data Analysis (EDA)

In [None]:
#EDA helps understand which features influence price

#Visualize Correlations

# Select only numerical columns
numerical_cols = ld_train.select_dtypes(include=["number"])

# Compute correlation
plt.figure(figsize=(10,6))
sns.heatmap(numerical_cols.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Feature Correlation Heatmap")
plt.show()

# Values range from -1 to 1:
# +1 → Perfect positive correlation (when one variable increases, the other does too).
# -1 → Perfect negative correlation (one increases, the other decreases).
# 0 → No correlation.

In [None]:
#  Analyze Categorical Features
#  Which suburbs are the most expensive?

top_suburbs = ld_train.groupby("Suburb")["Price"].median().sort_values(ascending=False).head(10)
print(top_suburbs)


In [None]:
#  Detect Outliers
# Extreme values in Price can distort predictions.

plt.figure(figsize=(10,5))
sns.boxplot(y=ld_train["Price"])  # Use `y` instead of passing the column directly
plt.show()


In [None]:
#  Remove extreme outliers using IQR method:

Q1 = ld_train["Price"].quantile(0.25)
Q3 = ld_train["Price"].quantile(0.75)
IQR = Q3 - Q1

ld_train = ld_train[(ld_train["Price"] > (Q1 - 1.5 * IQR)) & (ld_train["Price"] < (Q3 + 1.5 * IQR))]


In [None]:
plt.figure(figsize=(10,5))
sns.boxplot(y=ld_train["Price"])  # Use `y` instead of passing the column directly
plt.show()

# Hypothesis Testing

In [None]:
# Does House Type Affect Price?
# Using ANOVA to check if Type impacts price.

from scipy.stats import f_oneway

groups = [ld_train[ld_train["Type"] == t]["Price"] for t in ld_train["Type"].unique()]
anova_result = f_oneway(*groups)
print("P-value:", anova_result.pvalue)

#  If p-value < 0.05, Type significantly affects price.


In [None]:
from scipy.stats import ttest_ind

high_price = ld_train[ld_train["Price"] > ld_train["Price"].median()]
low_price = ld_train[ld_train["Price"] <= ld_train["Price"].median()]

t_stat, p_value = ttest_ind(high_price["Rooms"], low_price["Rooms"])
print(f"T-Statistic: {t_stat}, P-Value: {p_value}")


# Interpretation:

# If p-value < 0.05, Rooms significantly affect Price.
# If p-value > 0.05, Rooms do not strongly impact Price.

# Build Model 1 linear regression 

In [None]:
# Reset the index from 0 to n. Else this will throw an error in modeling step below
y_train.reset_index(drop = True, inplace = True)
y_train.index

# Index should match the below X_with_intercept index

In [None]:
# Index should match the above y_train index
X_with_intercept.index

In [None]:
# Fit the linear regression model
model = sm.OLS(y_train, X_with_intercept).fit()

# Print the regression summary
print(model.summary())

In [None]:
# Access all pvalues
model.pvalues

In [None]:
X_with_intercept_copy = X_with_intercept.copy()

In [None]:
# Similar to VIF, we can write a for loop to iteratively remove high p-value features beyond a threshold of 0.1

threshold = 0.1
max_pvalue = model.pvalues.iloc[1:].max()

while max_pvalue > threshold:
    
    temp_model = sm.OLS(y_train, X_with_intercept_copy).fit()
    max_pvalue = temp_model.pvalues.iloc[1:].max()
    feature_to_drop = temp_model.pvalues.idxmax()
    
    if max_pvalue > threshold:
        X_with_intercept_copy.drop(columns=[feature_to_drop], axis = 1, inplace = True)
        print(f"Removed {feature_to_drop} having pvalue {max_pvalue}")

In [None]:
X_with_intercept_copy.columns

In [None]:
# Fit the linear regression model
final_model = sm.OLS(y_train, X_with_intercept[X_with_intercept_copy.columns]).fit()

# Print the regression summary
print(final_model.summary())

In [None]:
# Check normality of errors
import seaborn as sns

sns.histplot(final_model.resid, kde = True);

# Some violations may exist as its not completely normally distributed based on visual check

In [None]:
# Check for heterskadasticity

sns.scatterplot(x = final_model.fittedvalues, y = final_model.resid);

# Does seem to show heteroskadstic behaviour - As he predicted values in increasing on x-axis, the residual range on y-axis
# seem to increase too

# Test Data Prediction

In [None]:
final_model.params
# Pick only those features that exist in the model (using final_model.params.index). In addition, add "const" to x_test
x_test_intercept = sm.add_constant(x_test[final_model.params.index[1:]])

In [None]:
predict_test = final_model.predict(x_test_intercept)

In [None]:
predict_test

# Model Validation

In [None]:
# RMSE (root mean square error)   is high in this model

# Error = Actual - Prediction
error = y_test - predict_test

# Calculate RMSE
np.sqrt((error ** 2).mean())



In [None]:
y_test.mean()

# Building 2nd model

In [None]:
ld_pridict["Postcode"].fillna(ld_train["Postcode"].mode()[0], inplace=True)
ld_pridict["Rooms"].fillna(ld_train["Rooms"].median(), inplace=True)


In [None]:
# Predict Price Using Simple Data Analytics
# Using Grouping (Median-Based Prediction)

price_map = ld_train.groupby(["Suburb", "Rooms"])["Price"].median()

def predict_price(row):
    return price_map.get((row["Suburb"], row["Rooms"]), ld_train["Price"].median())

ld_pridict["Predicted_Price"] = ld_pridict.apply(predict_price, axis=1)




In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

y_actual = ld_train["Price"]
y_pred = ld_train.apply(predict_price, axis=1)

rmse = np.sqrt(mean_squared_error(y_actual, y_pred))
print("RMSE:", rmse)
# Calculate Score
score = 212467 / rmse
print(f"Score: {score}")


In [None]:
#  Apply the Model on Test Data

In [None]:
# Apply prediction function to test data
ld_pridict["Predicted_Price"] = ld_pridict.apply(predict_price, axis=1)


In [None]:
# Ensure the DataFrame contains only the required columns
submission = pd.DataFrame({
    "Price": ld_pridict["Predicted_Price"]
})

# Save as CSV in the required format
submission.to_csv("Ankit_Raj_P1_part2.csv", index=False)


In [None]:
ld_pridict.head(10)

# Building 3rd model

In [None]:
# Alternative: Price per Square Meter
# Another approach is to calculate Price per Square Meter and apply it to test data.

ld_train["Price_per_sqm"] = ld_train["Price"] / ld_train["BuildingArea"]
median_price_per_sqm = ld_train["Price_per_sqm"].median()

ld_test["Predicted_Price1"] = ld_test["BuildingArea"] * median_price_per_sqm

In [None]:
print(ld_train[["Price", "BuildingArea"]].isnull().sum())  # Check missing values


In [None]:
# Fill missing BuildingArea with median value
ld_train["BuildingArea"].fillna(ld_train["BuildingArea"].median(), inplace=True)


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Ensure no NaNs before RMSE calculation
ld_train = ld_train.dropna(subset=["Price"])  # Ensure target variable has no NaNs

# Compute RMSE
rmse = np.sqrt(mean_squared_error(ld_train["Price"], ld_train["BuildingArea"] * median_price_per_sqm))
print(f"RMSE: {rmse}")

# Calculate Score
score = 212467 / rmse
print(f"Score: {score}")



In [None]:
# Conclusion out of all 3 model 2nd is the best model which got mode than 0.51 score

In [None]:
#2nd model is best of applied it on test dta nd pridicted the price