Authors: 

Jonathan Naumanen

Adam Williams

Group 17

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Read the CSV file using Pandas.
alldata = pd.read_csv("sberbank.csv")

# Convert the timestamp string to an integer representing the year.
def get_year(timestamp):
    return int(timestamp[:4])
alldata['year'] = alldata.timestamp.apply(get_year)

# Select the 9 input columns and the output column.
selected_columns = ['price_doc', 'year', 'full_sq', 'life_sq', 'floor', 'num_room', 'kitch_sq', 'full_all']
alldata = alldata[selected_columns]
alldata = alldata.dropna()

# Shuffle.
alldata_shuffled = alldata.sample(frac=1.0, random_state=0)

# Separate the input and output columns.
X = alldata_shuffled.drop('price_doc', axis=1)
# For the output, we'll use the log of the sales price.
Y = alldata_shuffled['price_doc'].apply(np.log)

# Split into training and test sets.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=0)

In [2]:
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_validate
m1 = DummyRegressor()
cross_validate(m1, Xtrain, Ytrain, scoring='neg_mean_squared_error')

{'fit_time': array([0.00190258, 0.00136113, 0.00130987, 0.01111007, 0.00601125]),
 'score_time': array([0.00033855, 0.00036502, 0.00033617, 0.00032377, 0.00027323]),
 'test_score': array([-0.39897319, -0.37113485, -0.38083108, -0.39057156, -0.40475168])}

In [3]:
from sklearn.linear_model import LinearRegression

LR = LinearRegression()
cvs = cross_validate(LR, Xtrain, Ytrain, scoring='neg_mean_squared_error')
sum = np.sum(cvs['test_score'])
print(sum)

-1.506993294383618


In [4]:
from sklearn.linear_model import Ridge

RC = Ridge()
cvs1 = cross_validate(RC, Xtrain, Ytrain, scoring='neg_mean_squared_error')
sum = np.sum(cvs1['test_score'])
print(sum)
print(cvs1)

-1.506989211608986
{'fit_time': array([0.00515771, 0.00546217, 0.00729012, 0.01120687, 0.0084095 ]), 'score_time': array([0.00192213, 0.00340343, 0.04272079, 0.00398326, 0.00463009]), 'test_score': array([-0.30222063, -0.32537046, -0.29377831, -0.29296256, -0.29265724])}


In [5]:
from sklearn.linear_model import Lasso

LA = Lasso()
cvs2 = cross_validate(LA, Xtrain, Ytrain, scoring='neg_mean_squared_error')
sum = np.sum(cvs2['test_score'])
print(sum)
print(cvs2)

-1.5052353358744357
{'fit_time': array([0.14650202, 0.02501798, 0.0177846 , 0.08854651, 0.10489345]), 'score_time': array([0.0163548 , 0.00848842, 0.00573444, 0.00927639, 0.09084105]), 'test_score': array([-0.31042005, -0.29379119, -0.29803599, -0.30061325, -0.30237486])}


In [6]:
from sklearn.tree import DecisionTreeRegressor

DTR = DecisionTreeRegressor()
cvs3 = cross_validate(DTR, Xtrain, Ytrain, scoring='neg_mean_squared_error')
sum = np.sum(cvs3['test_score'])
print(sum)
print(cvs3)

-2.6534581159474895
{'fit_time': array([0.09451199, 0.09364438, 0.10781622, 0.09544516, 0.09977365]), 'score_time': array([0.00331044, 0.00327563, 0.00340319, 0.05399656, 0.00325894]), 'test_score': array([-0.54288598, -0.52986975, -0.51006824, -0.51518494, -0.55544921])}


In [7]:
from sklearn.ensemble import RandomForestRegressor

RFR = RandomForestRegressor()
cvs4 = cross_validate(RFR, Xtrain, Ytrain, scoring='neg_mean_squared_error')
sum = np.sum(cvs4['test_score'])
print(sum)
print(cvs4)

-1.4198714705602367
{'fit_time': array([6.06654477, 6.27522326, 5.57514   , 6.29206491, 6.2064116 ]), 'score_time': array([0.12997293, 0.12888098, 0.18383121, 0.13359714, 0.17825484]), 'test_score': array([-0.29518221, -0.27612773, -0.27747347, -0.28380374, -0.28728432])}


In [10]:
from sklearn.ensemble import GradientBoostingRegressor

GBR = GradientBoostingRegressor()
cvs5 = cross_validate(GBR, Xtrain, Ytrain, scoring='neg_mean_squared_error')
sum = np.sum(cvs5['test_score'])
print(sum)
print(cvs5)

-1.322627481335389
{'fit_time': array([1.47715425, 1.49016285, 1.39351153, 1.39600492, 1.28998637]), 'score_time': array([0.0049696 , 0.00635719, 0.00682545, 0.00777078, 0.00472307]), 'test_score': array([-0.2763972 , -0.24919857, -0.26293749, -0.27079364, -0.26330058])}


In [9]:
from sklearn.neural_network import MLPRegressor

MLP = MLPRegressor()
cvs6 = cross_validate(MLP, Xtrain, Ytrain, scoring='neg_mean_squared_error')
sum = np.sum(cvs6['test_score'])
print(sum)
print(cvs6)

-385960.37667216855
{'fit_time': array([10.38660026, 14.70609689, 13.80525041, 15.49562764,  2.19097424]), 'score_time': array([0.07110858, 0.00722504, 0.00872755, 0.0043931 , 0.00375319]), 'test_score': array([-4.45266981e+00, -4.75163860e+01, -5.14802169e+01, -2.32683381e+01,
       -3.85833659e+05])}


For the report

We tested all of the regression models and decided that GradientBoostingRegressor gave us the best result with the best time. It got the summed neg_mean_squared_error: -1.322627481335389

In [15]:
from sklearn.metrics import mean_squared_error
  
GBR.fit(Xtrain, Ytrain)
meanSerror = mean_squared_error(Ytest, GBR.predict(Xtest))
print(meanSerror)

0.2714269292429018


Evaluation score: 0.2714662926095485

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=190ad4d2-4321-416d-9847-7401b32c451a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>