In [62]:
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from scipy import stats

In [63]:
data = pd.read_csv('melb_data.csv')

In [64]:
data.shape

(13580, 21)

In [65]:
data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [66]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [67]:
data = data.dropna()

In [68]:
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0
6,Abbotsford,124 Yarra St,3,h,1876000.0,S,Nelson,7/05/2016,2.5,3067.0,...,2.0,0.0,245.0,210.0,1910.0,Yarra,-37.8024,144.9993,Northern Metropolitan,4019.0
7,Abbotsford,98 Charles St,2,h,1636000.0,S,Nelson,8/10/2016,2.5,3067.0,...,1.0,2.0,256.0,107.0,1890.0,Yarra,-37.806,144.9954,Northern Metropolitan,4019.0


In [69]:
data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [70]:
df = data[['Rooms', 'Distance', 'Propertycount', 'YearBuilt', 'Landsize', 'BuildingArea', 'Price']]

In [71]:
df.head()

Unnamed: 0,Rooms,Distance,Propertycount,YearBuilt,Landsize,BuildingArea,Price
1,2,2.5,4019.0,1900.0,156.0,79.0,1035000.0
2,3,2.5,4019.0,1900.0,134.0,150.0,1465000.0
4,4,2.5,4019.0,2014.0,120.0,142.0,1600000.0
6,3,2.5,4019.0,1910.0,245.0,210.0,1876000.0
7,2,2.5,4019.0,1890.0,256.0,107.0,1636000.0


In [72]:
z_scores = stats.zscore(df._get_numeric_data())
abs_z_scores = np.abs(z_scores)

filtered_entries = (abs_z_scores < 3).all(axis=1)
df = df[filtered_entries]

In [73]:
df.head()

Unnamed: 0,Rooms,Distance,Propertycount,YearBuilt,Landsize,BuildingArea,Price
1,2,2.5,4019.0,1900.0,156.0,79.0,1035000.0
2,3,2.5,4019.0,1900.0,134.0,150.0,1465000.0
4,4,2.5,4019.0,2014.0,120.0,142.0,1600000.0
6,3,2.5,4019.0,1910.0,245.0,210.0,1876000.0
7,2,2.5,4019.0,1890.0,256.0,107.0,1636000.0


In [74]:
x = df.drop(['Price'], axis=1)
y = df['Price']

In [75]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [76]:
reg = LinearRegression()
reg.fit(x_train, y_train)
reg.score(x_test, y_test)

0.5918322636815869

In [77]:
from sklearn.linear_model import Lasso
las = Lasso()
las.fit(x_train, y_train)
las.score(x_test, y_test)

0.5918320270224514

In [78]:
def find_best_algorithm(x, y):
    algos = {
        'Lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1, 2],
                'selection': ['cyclic', 'random']
            }
        },
        'LinearRegression': {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        }
    }
        
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2)
    
    for algo_name, config in algos.items():
        grid = GridSearchCV(config['model'], config['params'])
        grid.fit(x, y)
        scores.append({
            'Algorithm': algo_name,
            'Best score': grid.best_score_,
            'Best params': grid.best_params_
        })
    return pd.DataFrame(scores)

find_best_algorithm(x, y)

Unnamed: 0,Algorithm,Best score,Best params
0,Lasso,0.587561,"{'alpha': 2, 'selection': 'random'}"
1,LinearRegression,0.587561,{'normalize': True}


In [79]:
predicted = reg.predict(x_test)

In [80]:
result = pd.DataFrame({'Actual': y_test, 'Predicted': predicted})
result

Unnamed: 0,Actual,Predicted
11504,345000.0,4.333172e+05
9318,1180000.0,1.313431e+06
4397,1710000.0,1.599879e+06
7286,816000.0,1.295223e+06
7235,415000.0,5.270877e+05
...,...,...
5473,945000.0,1.016926e+06
6468,1010000.0,1.294465e+06
1329,625000.0,9.910925e+05
8949,565000.0,7.972648e+05
