In [37]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score

In [12]:
def load_csv(city):
    """Load the rating and prediction columns of a city's finished listings file.

    Parameters
    ----------
    city : str
        City name as it appears in the file name
        ``listings_<city>_finished.csv``.

    Returns
    -------
    pandas.DataFrame
        Only the columns ``review_scores_rating``, ``xgb_predict_avg`` and
        ``rf_predict_avg``; every other column in the CSV is skipped.
    """
    # The location is relative to the working directory of the notebook.
    csv_path = f'../data/cities/finished/listings_{city}_finished.csv'
    wanted = [
        'review_scores_rating',
        'xgb_predict_avg',
        'rf_predict_avg',
    ]
    listings = pd.read_csv(csv_path, usecols=wanted)
    return listings

In [61]:
def print_ml_statistics(df):
    result = np.sqrt(mean_squared_error(df.review_scores_rating, df.xgb_predict_avg))
    print(f'RMSE XGBoost: {result}')
    
    result = np.sqrt(mean_squared_error(df.review_scores_rating, df.rf_predict_avg))
    print(f'RMSE RandomForest: {result}')

    result = r2_score(df.review_scores_rating, df.xgb_predict_avg)
    print(f'r2_score XGBoost: {result}')
    
    result = r2_score(df.review_scores_rating, df.rf_predict_avg)
    print(f'r2_score RandomForest: {result}')
    
    result = accuracy_score(np.round(df.review_scores_rating).astype('int'), np.round(df.xgb_predict_avg).astype('int'))
    print(f'accuracy XGBoost: {result}')
    
    result = accuracy_score(np.round(df.review_scores_rating).astype('int'), np.round(df.rf_predict_avg).astype('int'))
    print(f'accuracy RandomForest: {result}') 

    print('Mean comparisson')
    
    df['mean'] = df.review_scores_rating.mean()

    result = np.sqrt(mean_squared_error(df.review_scores_rating, df['mean']))
    print(f'RMSE mean: {result}')

    result = np.sqrt(r2_score(df.review_scores_rating, df['mean']))
    print(f'r2_score mean: {result}')

    result = accuracy_score(np.round(df.review_scores_rating).astype('int'), np.round(df['mean']).astype('int'))
    print(f'accuracy mean: {result}')


In [62]:
# Print the model metrics and the mean baseline for the Amsterdam listings.
print_ml_statistics(load_csv('Amsterdam'))

RMSE XGBoost: 0.23104299694600555
RMSE RandomForest: 0.2305485191843467
r2_score XGBoost: -0.0009752725276919172
r2_score RandomForest: 0.00330471312480507
accuracy XGBoost: 0.931950745301361
accuracy RandomForest: 0.9252106286454957
Mean comparisson
RMSE mean: 0.23093041434453632
r2_score mean: 0.0
accuracy mean: 0.9231367465975373


In [63]:
# Print the model metrics and the mean baseline for the Antwerp listings.
print_ml_statistics(load_csv('Antwerp'))

RMSE XGBoost: 0.38176040468489514
RMSE RandomForest: 0.37472557352018376
r2_score XGBoost: 0.08385748737313137
r2_score RandomForest: 0.11731054612963576
accuracy XGBoost: 0.8174196553330229
accuracy RandomForest: 0.8188169538891477
Mean comparisson
RMSE mean: 0.39884978664401527
r2_score mean: 0.0
accuracy mean: 0.775966464834653


In [64]:
# Print the model metrics and the mean baseline for the Rotterdam listings.
print_ml_statistics(load_csv('Rotterdam'))

RMSE XGBoost: 0.31690915886024046
RMSE RandomForest: 0.30572024595014163
r2_score XGBoost: -0.01806114278112858
r2_score RandomForest: 0.05255789582541348
accuracy XGBoost: 0.8686534216335541
accuracy RandomForest: 0.8642384105960265
Mean comparisson
RMSE mean: 0.3140854801702917
r2_score mean: 0.0
accuracy mean: 0.8388520971302428


In [65]:
# Print the model metrics and the mean baseline for the LosAngeles listings.
print_ml_statistics(load_csv('LosAngeles'))

RMSE XGBoost: 0.30668211559073877
RMSE RandomForest: 0.3065237607360384
r2_score XGBoost: 0.5605499671494588
r2_score RandomForest: 0.5610036687281494
accuracy XGBoost: 0.901825221238938
accuracy RandomForest: 0.8945120452310718
Mean comparisson
RMSE mean: 0.46262990197038356
r2_score mean: 0.0
accuracy mean: 0.8673795476892822


In [70]:
# Compare the size of the LosAngeles sample with the number of listings rated below 4.
test = load_csv('LosAngeles')
print(test.shape)  # shape of everything that was loaded
print(test[test.review_scores_rating < 4].shape)  # shape of the rows with review_scores_rating < 4

(32544, 3)
(1052, 3)


In [71]:
# Compare the size of the Antwerp sample with the number of listings rated below 4.
test = load_csv('Antwerp')
print(test.shape)  # shape of everything that was loaded
print(test[test.review_scores_rating < 4].shape)  # shape of the rows with review_scores_rating < 4

(2147, 3)
(103, 3)


(1052, 3)