In [117]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime
import scraping_class
import re
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from Connector import Connector, ratelimit
import time
import os

In [118]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import validation_curve
from sklearn.pipeline import make_pipeline

In [151]:
dir = str(os.getcwd())
datafile = dir + "/boliga/data/analysis_data.csv"
df = pd.read_csv(datafile, index_col=0)
#seperating municipality
address = pd.DataFrame([a[0] + ' ' + a[-1][6:]\
           for a in df['Address'].str.split(',')])
header=['ad']
address.columns=header

municipalities = ['Frederiksberg C', 'Frederiksberg', 'København K', 'København V', 'København S', 'København N', 'København Ø']

address_list = []

for i in address['ad']: 
    for x in municipalities:
        if x in i:
            address_list.append(x)
            break

# Making a list of municipalities            
Municipality = pd.DataFrame(address_list)
Municipality.columns=['Municipality']

# Adding the list to the housing data
df['Municipality'] = Municipality
df
df = df.drop(['Sell_price', 'sqm_price','Date_of_sale','Address', 'location', 'latitude', 'longitude','m_distance_const', 'm_station_const', 's_distance_const',
       's_station_const', 's_station', 'm_station'], axis=1).dropna();
df = pd.get_dummies(df)
df = df.reset_index(drop=True)
y = df.z_sqm_price.to_numpy()
X = df.drop(['z_sqm_price', 'Type_ Villa ', 'Municipality_København K'], axis=1).to_numpy()
X = np.array(X, dtype=np.float64)
X, y;

In [131]:
# splitting into development (2/3) and test data (1/3)
X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=1/3, random_state=34)

# splitting development into train (1/3) and validation (1/3)
X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, test_size=1/2, random_state=52)

lambda_ = np.logspace(-1, 1, 22)
l1_ratio_ = np.logspace(0, 3, 22)
tol = 0.001

## OLS Regression

In [None]:
pipe_OLS = make_pipeline(PolynomialFeatures(degree = 3, include_bias=True), 
                           StandardScaler(),
                           LinearRegression())
                         
pipe_OLS.fit(X_dev, y_dev)
print(mse(pipe_OLS.predict(X_test),y_test)), round(np.sqrt(mse(pipe_OLS.predict(X_test),y_test))))

## Lasso Regression

In [164]:
pipe_lasso = make_pipeline(PolynomialFeatures(degree = 3, include_bias=True), 
                           StandardScaler(),
                           Lasso(max_iter = 10000, alpha=lambda_, tol=tol))

train_scores, test_scores = validation_curve(estimator=pipe_lasso,
                                             X=X_train, y=y_train,
                                             param_name='lasso__alpha',
                                             param_range=lambda_,
                                             scoring='neg_mean_squared_error',
                                             cv=5)

mse_score = pd.DataFrame({'Train':-train_scores.mean(axis=1),
                          'Validation':-test_scores.mean(axis=1),
                          'lambda':lambda_}).set_index('lambda')   

optimal_lambda_lasso = mse_score.Validation.nsmallest(1)
lambda_o_ = optimal_lambda_lasso.index

#testing
pipe_lasso = make_pipeline(PolynomialFeatures(degree = 3, include_bias=True), 
                           StandardScaler(),
                           Lasso(max_iter = 10000, alpha=optimal_lambda_lasso.index, tol=tol))

pipe_lasso.fit(X_dev, y_dev)
print(lambda_o_,' ',round(mse(pipe_lasso.predict(X_test),y_test),3), round(np.sqrt(mse(pipe_lasso.predict(X_test),y_test)),3))

Float64Index([0.1], dtype='float64', name='lambda')   754.863 27.475


## Ridge Regression

In [178]:
pipe_ridge = make_pipeline(PolynomialFeatures(degree = 3, include_bias=True), 
                           StandardScaler(),
                           Ridge(alpha = l1_ratio_, tol=tol))

train_scores, test_scores = validation_curve(estimator=pipe_ridge,
                                             X=X_train, y=y_train,
                                             param_name='ridge__alpha',
                                             param_range=l1_ratio_,
                                             scoring='neg_mean_squared_error',
                                             cv=5)

mse_score = pd.DataFrame({'Train':-train_scores.mean(axis=1),
                          'Validation':-test_scores.mean(axis=1),
                          'lambda':l1_ratio_}).set_index('lambda')   

lambda_o_ = mse_score.Validation.nsmallest(1)
lambda_o_ = lambda_o_.index

In [179]:
#testing
pipe_ridge = make_pipeline(PolynomialFeatures(degree = 3, include_bias=True), 
                           StandardScaler(),
                           Ridge(alpha = lambda_o_, tol=tol))

pipe_ridge.fit(X_dev, y_dev)
print(lambda_o_,' ', round(mse(pipe_ridge.predict(X_test),y_test),3), round(np.sqrt(mse(pipe_ridge.predict(X_test),y_test)),3))

Float64Index([100.0], dtype='float64', name='lambda')   761.577 27.597


## LassoCV

In [170]:
from sklearn.model_selection import KFold
kfolds = KFold(n_splits=10)
folds = list(kfolds.split(X_dev, y_dev))

# outer loop: lambdas
mseCV = []
for lambda__ in lambda_:    
    # inner loop: folds
    mseCV_ = []    
    for train_idx, val_idx in folds :        
        # train model and compute MSE on test fold
        pipe_lassoCV = make_pipeline(PolynomialFeatures(degree=3, include_bias=False),
                                     StandardScaler(),
                                     Lasso(max_iter = 10000, alpha=lambda__, tol=tol))            
        X_train, y_train, = X_dev[train_idx], y_dev[train_idx]
        X_val, y_val = X_dev[val_idx], y_dev[val_idx] 
        pipe_lassoCV.fit(X_train, y_train)        
        mseCV_.append(mse(pipe_lassoCV.predict(X_val), y_val))    
        
    # store result    
    mseCV.append(mseCV_) 
    
# convert to DataFrame
lambdaCV = pd.DataFrame(mseCV, index=lambda_)
lambdaCV['m'] = lambdaCV.mean(axis=1)
lambda_o_ = lambdaCV['m'].nsmallest(1).index

In [172]:
pipe_lassoCV = make_pipeline(PolynomialFeatures(degree=3, include_bias=False),
                                     StandardScaler(),
                                     Lasso(max_iter = 10000, alpha=lambda_o_, tol=tol))

pipe_lassoCV.fit(X_dev, y_dev)
print(lambda_o_,' ',round(mse(pipe_lassoCV.predict(X_test),y_test),3), round(np.sqrt(mse(pipe_lassoCV.predict(X_test),y_test)),3))

Float64Index([0.12451970847350328], dtype='float64')   754.278 27.464


## RidgeCV

In [173]:
# outer loop: lambdas
mseCV = []
for l1_ratio__ in l1_ratio_:    
    # inner loop: folds
    mseCV_ = []    
    for train_idx, val_idx in folds :        
        # train model and compute MSE on test fold
        pipe_ridgeCV = make_pipeline(PolynomialFeatures(degree = 3, include_bias=True), 
                           StandardScaler(),
                           Ridge(alpha = l1_ratio__, tol=tol))          
        X_train, y_train, = X_dev[train_idx], y_dev[train_idx]
        X_val, y_val = X_dev[val_idx], y_dev[val_idx] 
        pipe_ridgeCV.fit(X_train, y_train)        
        mseCV_.append(mse(pipe_ridgeCV.predict(X_val), y_val))    
        
    # store result    
    mseCV.append(mseCV_) 
    
# convert to DataFrame
lambdaCV = pd.DataFrame(mseCV, index=lambda_)
lambdaCV['m'] = lambdaCV.mean(axis=1)
lambda_o_ = lambdaCV['m'].nsmallest(1).index

In [175]:
pipe_ridgeCV = make_pipeline(PolynomialFeatures(degree = 3, include_bias=True), 
                           StandardScaler(),
                           Ridge(alpha = l1_ratio__, tol=tol))

pipe_ridgeCV.fit(X_dev, y_dev)
print(lambda_o_,' ', round(mse(pipe_ridgeCV.predict(X_test),y_test),3), round(np.sqrt(mse(pipe_ridgeCV.predict(X_test),y_test)),3))

Float64Index([1.7301957388458944], dtype='float64')   760.581 27.579


## ElasticNet

In [185]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

pipe_el = make_pipeline(PolynomialFeatures(include_bias=False), 
                        StandardScaler(),
                        ElasticNet(tol=.01))

gs = GridSearchCV(estimator=pipe_el, 
                  param_grid={'elasticnet__alpha':np.logspace(-1,1,22),
                              'elasticnet__l1_ratio':np.linspace(0,3,10)}, 
                  scoring='neg_mean_squared_error', 
                  n_jobs=8,
                  iid=False,
                  cv=10)

gs.fit(X_train, y_train)
gs.predict(X_train)

  array_means[:, np.newaxis]) ** 2,
  array_means[:, np.newaxis]) ** 2,


array([ -9.18896539,  -2.25294009,  -9.81766336, ...,  16.91604949,
        -1.58708503, -14.135312  ])

In [186]:
print(gs.best_params_)
print(lambda_o_,' ', round(mse(gs.predict(X_test),y_test),3), round(np.sqrt(mse(gs.predict(X_test),y_test)),3))

{'elasticnet__alpha': 0.15505157798326247, 'elasticnet__l1_ratio': 1.0}
Float64Index([100.0], dtype='float64', name='lambda')   772.807 27.799


In [196]:
lambda_o_ = gs.best_params_

In [202]:
lambda_o_['elasticnet__alpha']

0.15505157798326247

In [203]:
pipe_el = make_pipeline(PolynomialFeatures(include_bias=False), 
                        StandardScaler(),
                        ElasticNet(alpha = lambda_o_['elasticnet__alpha'],
                                   l1_ratio= lambda_o_['elasticnet__l1_ratio'],
                                   tol=.01))

In [206]:
pipe_el.fit(X_dev, y_dev)
pipe_el.predict(X_test)

array([10.99173854, 19.97830203, -6.56441019, ...,  5.09878742,
       -1.62984356,  4.02023822])

In [209]:
round(mse(pipe_el.predict(X_test),y_test),3), round(np.sqrt(mse(pipe_el.predict(X_test),y_test)),3)

(769.556, 27.741)

### Latex tabel with overview model fit
```
\begin{table}[]
\begin{tabular}{lcccccc}
 & \multicolumn{1}{l}{\textbf{OLS}} & \multicolumn{1}{l}{\textbf{Lasso}} & \multicolumn{1}{l}{\textbf{LassoCV}} & \multicolumn{1}{l}{\textbf{Ridge}} & \multicolumn{1}{l}{\textbf{RidgeCV}} & \multicolumn{1}{l}{\textbf{ElasticNet}} \\
Lambda & - & 0.100 & .125 & 100.000 & 1.730 & (.155, 1.000) \\
MSE & 5.4e25 & 754.863 & 754.278 & 761.543 & 760.581 & 769.556 \\
RMSE & 7.4e12 & 27.475 & 27.464 & 27.596 & 27.579 & 27.741
\end{tabular}
\end{table}
```

# lassoCV preforms best: 