In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

In [2]:
data = pd.read_csv('Melbourne_housing_FULL.csv')

In [3]:
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [4]:
newdata = data.drop(columns=['Address','Date', 'Postcode', 'YearBuilt', 'Lattitude', 'Longtitude'])

In [5]:
newdata.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Regionname,Propertycount
0,Abbotsford,2,h,,SS,Jellis,2.5,2.0,1.0,1.0,126.0,,Yarra City Council,Northern Metropolitan,4019.0
1,Abbotsford,2,h,1480000.0,S,Biggin,2.5,2.0,1.0,1.0,202.0,,Yarra City Council,Northern Metropolitan,4019.0
2,Abbotsford,2,h,1035000.0,S,Biggin,2.5,2.0,1.0,0.0,156.0,79.0,Yarra City Council,Northern Metropolitan,4019.0
3,Abbotsford,3,u,,VB,Rounds,2.5,3.0,2.0,1.0,0.0,,Yarra City Council,Northern Metropolitan,4019.0
4,Abbotsford,3,h,1465000.0,SP,Biggin,2.5,3.0,2.0,0.0,134.0,150.0,Yarra City Council,Northern Metropolitan,4019.0


In [6]:
newdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         34857 non-null  object 
 1   Rooms          34857 non-null  int64  
 2   Type           34857 non-null  object 
 3   Price          27247 non-null  float64
 4   Method         34857 non-null  object 
 5   SellerG        34857 non-null  object 
 6   Distance       34856 non-null  float64
 7   Bedroom2       26640 non-null  float64
 8   Bathroom       26631 non-null  float64
 9   Car            26129 non-null  float64
 10  Landsize       23047 non-null  float64
 11  BuildingArea   13742 non-null  float64
 12  CouncilArea    34854 non-null  object 
 13  Regionname     34854 non-null  object 
 14  Propertycount  34854 non-null  float64
dtypes: float64(8), int64(1), object(6)
memory usage: 4.0+ MB


In [7]:
print(newdata.isnull().sum())

Suburb               0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Distance             1
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
CouncilArea          3
Regionname           3
Propertycount        3
dtype: int64


In [8]:
newdata.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Regionname,Propertycount
0,Abbotsford,2,h,,SS,Jellis,2.5,2.0,1.0,1.0,126.0,,Yarra City Council,Northern Metropolitan,4019.0
1,Abbotsford,2,h,1480000.0,S,Biggin,2.5,2.0,1.0,1.0,202.0,,Yarra City Council,Northern Metropolitan,4019.0
2,Abbotsford,2,h,1035000.0,S,Biggin,2.5,2.0,1.0,0.0,156.0,79.0,Yarra City Council,Northern Metropolitan,4019.0
3,Abbotsford,3,u,,VB,Rounds,2.5,3.0,2.0,1.0,0.0,,Yarra City Council,Northern Metropolitan,4019.0
4,Abbotsford,3,h,1465000.0,SP,Biggin,2.5,3.0,2.0,0.0,134.0,150.0,Yarra City Council,Northern Metropolitan,4019.0


In [9]:
class OrdinalEncoderAndStandardScalerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, mean=None, var=None, encoding_dict=None):
        self.mean = mean
        self.var = var
        self.encoding_dict = encoding_dict

    def fit(self, x, y=None):
        self.ordinal_encoder = OrdinalEncoder()
        self.scaler = StandardScaler()
        return self

    def transform(self, x, y=None):
        series_name = x.name
        _x = x.to_numpy().reshape(-1, 1)
        _x = self.ordinal_encoder.fit_transform(_x)
        categories = self.ordinal_encoder.categories_
        self.encoding_dict = dict(zip((categories[0]), range(len(categories[0]))))
        _x = np.squeeze(self.scaler.fit_transform(_x))
        self.mean = self.scaler.mean_[0]
        self.var = self.scaler.var_[0]
        return pd.Series(_x, name=series_name)

In [10]:
def convert_to_numerical(column):
    column_name = column
    transformer = OrdinalEncoderAndStandardScalerTransformer()
    pipeline = Pipeline([('transform', transformer)])
    transformed_column = pipeline.fit_transform(data[column_name])
    newdata[column_name] = transformed_column

In [11]:
categorical_list = ['Suburb', 'Type', 'Method', 'CouncilArea', 'Regionname','SellerG']
for i in categorical_list:
    convert_to_numerical(i)

In [12]:
newdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         34857 non-null  float64
 1   Rooms          34857 non-null  int64  
 2   Type           34857 non-null  float64
 3   Price          27247 non-null  float64
 4   Method         34857 non-null  float64
 5   SellerG        34857 non-null  float64
 6   Distance       34856 non-null  float64
 7   Bedroom2       26640 non-null  float64
 8   Bathroom       26631 non-null  float64
 9   Car            26129 non-null  float64
 10  Landsize       23047 non-null  float64
 11  BuildingArea   13742 non-null  float64
 12  CouncilArea    34854 non-null  float64
 13  Regionname     34854 non-null  float64
 14  Propertycount  34854 non-null  float64
dtypes: float64(14), int64(1)
memory usage: 4.0 MB


In [13]:
newdata.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Regionname,Propertycount
0,-1.706538,2,-0.637822,,1.6529,-0.268711,2.5,2.0,1.0,1.0,126.0,,1.744197,-0.810997,4019.0
1,-1.706538,2,-0.637822,1480000.0,-0.358644,-1.374997,2.5,2.0,1.0,1.0,202.0,,1.744197,-0.810997,4019.0
2,-1.706538,2,-0.637822,1035000.0,-0.358644,-1.374997,2.5,2.0,1.0,0.0,156.0,79.0,1.744197,-0.810997,4019.0
3,-1.706538,3,1.808811,,2.155786,1.009865,2.5,3.0,2.0,1.0,0.0,,1.744197,-0.810997,4019.0
4,-1.706538,3,-0.637822,1465000.0,1.150014,-1.374997,2.5,3.0,2.0,0.0,134.0,150.0,1.744197,-0.810997,4019.0


In [14]:
print(newdata.isnull().sum())

Suburb               0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Distance             1
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
CouncilArea          3
Regionname           3
Propertycount        3
dtype: int64


In [15]:
newdata = newdata.drop(columns=['BuildingArea'])

In [16]:
newdata.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,CouncilArea,Regionname,Propertycount
0,-1.706538,2,-0.637822,,1.6529,-0.268711,2.5,2.0,1.0,1.0,126.0,1.744197,-0.810997,4019.0
1,-1.706538,2,-0.637822,1480000.0,-0.358644,-1.374997,2.5,2.0,1.0,1.0,202.0,1.744197,-0.810997,4019.0
2,-1.706538,2,-0.637822,1035000.0,-0.358644,-1.374997,2.5,2.0,1.0,0.0,156.0,1.744197,-0.810997,4019.0
3,-1.706538,3,1.808811,,2.155786,1.009865,2.5,3.0,2.0,1.0,0.0,1.744197,-0.810997,4019.0
4,-1.706538,3,-0.637822,1465000.0,1.150014,-1.374997,2.5,3.0,2.0,0.0,134.0,1.744197,-0.810997,4019.0


In [17]:
def replace_mean(column_name):    
    mean_value = newdata[column_name].mean()
    newdata[column_name].fillna(mean_value, inplace=True)

In [18]:
mean_column = ['Price', 'Distance', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'CouncilArea', 'Regionname' , 'Propertycount']
for i in mean_column:
    replace_mean(i)
    

In [19]:
newdata.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,CouncilArea,Regionname,Propertycount
0,-1.706538,2,-0.637822,1050173.0,1.6529,-0.268711,2.5,2.0,1.0,1.0,126.0,1.744197,-0.810997,4019.0
1,-1.706538,2,-0.637822,1480000.0,-0.358644,-1.374997,2.5,2.0,1.0,1.0,202.0,1.744197,-0.810997,4019.0
2,-1.706538,2,-0.637822,1035000.0,-0.358644,-1.374997,2.5,2.0,1.0,0.0,156.0,1.744197,-0.810997,4019.0
3,-1.706538,3,1.808811,1050173.0,2.155786,1.009865,2.5,3.0,2.0,1.0,0.0,1.744197,-0.810997,4019.0
4,-1.706538,3,-0.637822,1465000.0,1.150014,-1.374997,2.5,3.0,2.0,0.0,134.0,1.744197,-0.810997,4019.0


In [20]:
x = newdata.drop(columns=['Price'])
y = newdata['Price']

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [22]:
model = LinearRegression()

In [23]:
model.fit(x_train, y_train)

In [24]:
pred = model.predict(x_test)

In [25]:
print(pred)

[1073180.37026468 1034204.42645836  718105.22001976 ...  840346.45576525
 1084414.26984346  924625.80643086]


In [26]:
score = r2_score(y_test, pred)

In [27]:
print(score)

0.3215551001729148


In [28]:
df = data.drop(columns=['Address','Date', 'Postcode', 'YearBuilt', 'Lattitude', 'Longtitude'])

In [29]:
df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Regionname,Propertycount
0,Abbotsford,2,h,,SS,Jellis,2.5,2.0,1.0,1.0,126.0,,Yarra City Council,Northern Metropolitan,4019.0
1,Abbotsford,2,h,1480000.0,S,Biggin,2.5,2.0,1.0,1.0,202.0,,Yarra City Council,Northern Metropolitan,4019.0
2,Abbotsford,2,h,1035000.0,S,Biggin,2.5,2.0,1.0,0.0,156.0,79.0,Yarra City Council,Northern Metropolitan,4019.0
3,Abbotsford,3,u,,VB,Rounds,2.5,3.0,2.0,1.0,0.0,,Yarra City Council,Northern Metropolitan,4019.0
4,Abbotsford,3,h,1465000.0,SP,Biggin,2.5,3.0,2.0,0.0,134.0,150.0,Yarra City Council,Northern Metropolitan,4019.0


In [30]:
df = pd.get_dummies(df)

In [31]:
df.head()

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Propertycount,Suburb_Abbotsford,...,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Regionname_Eastern Metropolitan,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
0,2,,2.5,2.0,1.0,1.0,126.0,,4019.0,1,...,1,0,0,0,1,0,0,0,0,0
1,2,1480000.0,2.5,2.0,1.0,1.0,202.0,,4019.0,1,...,1,0,0,0,1,0,0,0,0,0
2,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,4019.0,1,...,1,0,0,0,1,0,0,0,0,0
3,3,,2.5,3.0,2.0,1.0,0.0,,4019.0,1,...,1,0,0,0,1,0,0,0,0,0
4,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,4019.0,1,...,1,0,0,0,1,0,0,0,0,0


In [32]:
df = df.dropna()

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9244 entries, 2 to 34856
Columns: 801 entries, Rooms to Regionname_Western Victoria
dtypes: float64(8), int64(1), uint8(792)
memory usage: 7.7 MB


In [34]:
x = newdata.drop(columns=['Price'])
y = newdata['Price']

In [35]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [36]:
model = LinearRegression()

In [37]:
model.fit(x_train, y_train)

In [38]:
pred = model.predict(x_test)

In [39]:
score = r2_score(y_test, pred)

In [40]:
print(score)

0.31986524454795706
