In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import sys
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold, LeaveOneOut
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score
import xgboost as xgb

from get_data import *

In [2]:
# Load the modelling data. `get_data_model_v2` comes from the star import of
# `get_data`; its two return values are unpacked into the DataFrame `ds` and
# `cols_model`, the column selection used for modelling (it includes the
# target 'precio', which is dropped before fitting).
# NOTE(review): the run log below reports "Best k for clustering zones: 4"
# although k_clusters=6 is passed here — confirm how k_clusters is used.
ds, cols_model = get_data_model_v2(k_clusters=6)
# Schema overview: column names, non-null counts and dtypes.
ds.info()

Database object created


  df = pd.read_sql_query(query, self.connection)


Best k for clustering zones: 4
Column n_banos has more than 20% of missing values.
Column n_plazas_garaje has more than 20% of missing values.
<class 'pandas.core.frame.DataFrame'>
Int64Index: 724 entries, 0 to 723
Data columns (total 99 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   zona_cluster                                724 non-null    float64
 1   id                                          724 non-null    float64
 2   titulo                                      724 non-null    object 
 3   descripcion                                 711 non-null    object 
 4   extra_info                                  720 non-null    object 
 5   n_habitaciones                              724 non-null    float64
 6   tamano                                      724 non-null    float64
 7   precio                                      724 non-null    float64
 8   municipio        

### Data split

In [66]:
# Features: every modelling column except the target. Target: the price.
x = ds[cols_model].drop('precio', axis=1)
y = ds['precio']

# Split data into train and test sets (80% / 20%). The seed is fixed so the
# hold-out set is the same on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [72]:
def train_model(x, y, model, n_splits=5, n_repeats=5, print_results=False, random_state=None):
    """Evaluate a regressor with repeated K-fold cross-validation.

    For every fold the model is refit on the training part and scored on both
    the training and the validation part.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target values, positionally aligned with ``x``.
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``. It is refit in
        place on every fold, so after the call it holds the fit from the
        last fold.
    n_splits : int, default 5
        Number of folds per repetition.
    n_repeats : int, default 5
        Number of times the K-fold split is repeated.
    print_results : bool, default False
        When True, print mean and standard deviation of each metric over all
        folds.
    random_state : int or None, default None
        Seed forwarded to ``RepeatedKFold``. ``None`` keeps the previous
        behaviour (a different shuffle on every call); pass an int to get
        reproducible folds.

    Returns
    -------
    list of dict
        One dict per fold with keys ``'rmse_train'``, ``'rmse_val'``,
        ``'mae_train'`` and ``'mae_val'``.
    """
    rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

    return_val = []
    for train_index, val_index in rkf.split(x):
        x_train, y_train = x.iloc[train_index], y.iloc[train_index]
        x_val, y_val = x.iloc[val_index], y.iloc[val_index]

        model.fit(x_train, y_train)
        y_train_pred = model.predict(x_train)
        y_val_pred = model.predict(x_val)

        return_val.append({
            # RMSE is the square root of the mean squared error.
            'rmse_train': np.sqrt(mean_squared_error(y_train, y_train_pred)),
            'rmse_val': np.sqrt(mean_squared_error(y_val, y_val_pred)),
            'mae_train': mean_absolute_error(y_train, y_train_pred),
            'mae_val': mean_absolute_error(y_val, y_val_pred),
        })

    if print_results:
        # `fold` (not `x`) as the loop name so the feature matrix is not shadowed.
        rmse_train = [fold['rmse_train'] for fold in return_val]
        rmse_val = [fold['rmse_val'] for fold in return_val]
        mae_train = [fold['mae_train'] for fold in return_val]
        mae_val = [fold['mae_val'] for fold in return_val]

        print(f'RMSE train:\t mean {np.mean(rmse_train):.2f}\t std {np.std(rmse_train):.2f}')
        print(f'RMSE val:\t mean {np.mean(rmse_val):.2f}\t std {np.std(rmse_val):.2f}')
        print("")
        print(f'MAE train:\t mean {np.mean(mae_train):.2f}\t std {np.std(mae_train):.2f}')
        print(f'MAE val:\t mean {np.mean(mae_val):.2f}\t std {np.std(mae_val):.2f}')
    return return_val
    
# Baseline: linear regression scored with 5 x 5 repeated K-fold CV on the
# current `x` / `y`.
trainings = train_model(
    x,
    y,
    LinearRegression(),
    n_splits=5,
    n_repeats=5,
    print_results=True,
)

RMSE train:	 mean 258.53	 std 12.13
RMSE val:	 mean 361.61	 std 52.83

MAE train:	 mean 167.69	 std 6.01
MAE val:	 mean 214.92	 std 17.91


In [11]:
# Reduced feature set used for the price-class ("split") model.
cols_split_model = [
    'tamano',
    'n_habitaciones',
    'vacacional',
    'puntuacion',
    'puntuacion_lujo',
    'puntuacion_lujo_2',
    'palabras_bonitas',
    'centro_coruna',
    'maianca',
]
x = ds[cols_split_model]
# Read the target straight from the dataset. The former `y = y` was a no-op
# that only worked if another cell had already defined `y`.
y = ds['precio']

### Binary price classification

In [18]:
def train_model_binary_eval(x, y, model, boundary, n_splits=5, n_repeats=5, print_results=False, random_state=None):
    """Cross-validate a regressor and score it as a binary classifier.

    The model is fit on the continuous target. Targets and predictions are
    then thresholded with ``> boundary`` (the positive class) and compared.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Continuous target, positionally aligned with ``x``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; refit in place on
        every fold.
    boundary : float
        Threshold separating the negative (``<= boundary``) from the positive
        (``> boundary``) class.
    n_splits, n_repeats : int, default 5
        Configuration of the repeated K-fold split.
    print_results : bool, default False
        When True, print mean and standard deviation of precision and F1 over
        all folds.
    random_state : int or None, default None
        Seed forwarded to ``RepeatedKFold``; ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    list of dict
        One ``{'train': {...}, 'val': {...}}`` entry per fold. Each inner
        dict has keys ``'tp'``, ``'fp'``, ``'fn'``, ``'tn'``, ``'precision'``,
        ``'recall'`` and ``'f1'``. A ratio whose denominator is zero is
        reported as ``0.0`` instead of NaN, so one degenerate fold does not
        turn the aggregated means into NaN.
    """
    rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

    return_val = []
    for train_index, val_index in rkf.split(x):
        x_train, y_train = x.iloc[train_index], y.iloc[train_index]
        x_val, y_val = x.iloc[val_index], y.iloc[val_index]

        # Fit on the continuous target; thresholding happens only at scoring time.
        model.fit(x_train, y_train)

        train = _binary_threshold_metrics(y_train, model.predict(x_train), boundary)
        val = _binary_threshold_metrics(y_val, model.predict(x_val), boundary)
        return_val.append({'train': train, 'val': val})

    if print_results:
        precision_train = [fold['train']['precision'] for fold in return_val]
        precision_val = [fold['val']['precision'] for fold in return_val]
        f1_train = [fold['train']['f1'] for fold in return_val]
        f1_val = [fold['val']['f1'] for fold in return_val]

        print(f'Precision train:\t mean {np.mean(precision_train):.2f}\t std {np.std(precision_train):.2f}')
        print(f'Precision val:\t mean {np.mean(precision_val):.2f}\t std {np.std(precision_val):.2f}')
        print("")
        print(f'F1 train:\t mean {np.mean(f1_train):.2f}\t std {np.std(f1_train):.2f}')
        print(f'F1 val:\t mean {np.mean(f1_val):.2f}\t std {np.std(f1_val):.2f}')
    return return_val


def _binary_threshold_metrics(y_true, y_pred, boundary):
    """Return confusion counts and precision/recall/F1 for the ``> boundary`` class."""
    actual = np.asarray(y_true).ravel() > boundary
    predicted = np.asarray(y_pred).ravel() > boundary

    tp = int(np.sum(predicted & actual))
    fp = int(np.sum(predicted & ~actual))
    fn = int(np.sum(~predicted & actual))
    tn = int(np.sum(~predicted & ~actual))

    # Guard every ratio: a fold may contain no predicted or no actual positives.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0

    return {'tp': tp, 'fp': fp, 'fn': fn, 'tn': tn, 'precision': precision, 'recall': recall, 'f1': f1}

In [14]:
# Same linear-regression CV as before, run on whatever `x` / `y` currently
# hold in the kernel.
trainings = train_model(
    x,
    y,
    LinearRegression(),
    n_splits=5,
    n_repeats=5,
    print_results=True,
)

RMSE train:	 mean 334.28	 std 15.12
RMSE val:	 mean 344.06	 std 55.58

MAE train:	 mean 213.44	 std 7.10
MAE val:	 mean 219.38	 std 19.40


In [19]:
# Score the linear regression as an above / below 1500 classifier.
trainings = train_model_binary_eval(
    x,
    y,
    LinearRegression(),
    1500,
    n_splits=5,
    n_repeats=5,
    print_results=True,
)

Precision train:	 mean 0.66	 std 0.03
Precision val:	 mean 0.64	 std 0.15

F1 train:	 mean 0.65	 std 0.03
F1 val:	 mean 0.63	 std 0.12


In [21]:
# One random hold-out split, used to eyeball individual rows that land on the
# wrong side of the 1500 boundary.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
model = LinearRegression().fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# True price above the boundary but predicted below it, and the reverse.
# Values exactly equal to 1500 fall in neither group.
underestimated = (y_test > 1500) & (y_test_pred < 1500)
overestimated = (y_test < 1500) & (y_test_pred > 1500)

print(y_test[underestimated], '---', y_test_pred[underestimated])
print(y_test[overestimated], '---', y_test_pred[overestimated])

472    1800.0
560    1890.0
481    1850.0
471    2400.0
Name: precio, dtype: float64 --- [ 778.31161847 1268.62137509 1490.96773458 1127.22786273]
602    1000.0
452    1100.0
557    1250.0
Name: precio, dtype: float64 --- [1585.04123192 1789.06330059 1618.99175008]


##### XGB

In [32]:
# Repeated random hold-out evaluation (100 splits, 10% test) of an XGBoost
# classifier that predicts whether the price is above 1500.
acc_train = []
acc_val = []
f1_train = []
f1_val = []

# Positive class: price above 1500.
yy = y > 1500

for i in range(100):
    # Seed each split with the iteration number: 100 different splits, but the
    # same 100 on every run (the classifier itself was already seeded).
    X_train, X_test, y_train, y_test = train_test_split(x, yy, test_size=0.1, random_state=i)
    # negatives / positives in the training part, used to reweight the positive class
    class_ratio = np.sum(y_train == 0) / np.sum(y_train == 1)
    clf = xgb.XGBClassifier(
        objective='binary:logistic',  # For binary classification
        max_depth=3,  # Maximum depth of a tree
        learning_rate=0.1,  # Step size shrinkage
        n_estimators=100,  # Number of boosting rounds (trees)
        n_jobs=-1,  # Use all available cores
        random_state=42,  # For reproducibility
        scale_pos_weight=class_ratio
    )

    clf.fit(X_train, y_train)
    # Predict each part once and reuse the result for both metrics.
    y_pred_train = clf.predict(X_train)
    y_pred = clf.predict(X_test)

    acc_train.append(accuracy_score(y_train, y_pred_train))
    acc_val.append(accuracy_score(y_test, y_pred))
    f1_train.append(f1_score(y_train, y_pred_train))
    f1_val.append(f1_score(y_test, y_pred))

print(f'Accuracy train:\t mean {np.mean(acc_train):.2f}\t std {np.std(acc_train):.2f}')
print(f'Accuracy val:\t mean {np.mean(acc_val):.2f}\t std {np.std(acc_val):.2f}')
print("")
print(f'F1 train:\t mean {np.mean(f1_train):.2f}\t std {np.std(f1_train):.2f}')
print(f'F1 val:\t mean {np.mean(f1_val):.2f}\t std {np.std(f1_val):.2f}')

Accuracy train:	 mean 0.97	 std 0.01
Accuracy val:	 mean 0.93	 std 0.03

F1 train:	 mean 0.83	 std 0.02
F1 val:	 mean 0.63	 std 0.13


In [62]:
# Leave-one-out evaluation of the XGBoost price-class classifier: each row is
# predicted by a model trained on all the other rows. `LeaveOneOut` is already
# imported in the notebook's import cell.
y_test_list = []
y_pred_list = []

# Positive class: price above 1500.
yy = y > 1500

loo = LeaveOneOut()
for train_index, test_index in loo.split(x):
    X_train, X_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = yy.iloc[train_index], yy.iloc[test_index]
    # negatives / positives in the training part, used to reweight the positive class
    class_ratio = np.sum(y_train == 0) / np.sum(y_train == 1)
    clf = xgb.XGBClassifier(
        objective='binary:logistic',  # For binary classification
        max_depth=3,  # Maximum depth of a tree
        learning_rate=0.1,  # Step size shrinkage
        n_estimators=100,  # Number of boosting rounds (trees)
        n_jobs=-1,  # Use all available cores
        random_state=42,  # For reproducibility
        scale_pos_weight=class_ratio
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # The test part holds exactly one row; keep its label and its prediction.
    y_test_list.append(y_test.values[0])
    y_pred_list.append(y_pred[0])

# NOTE: after the loop `clf` stays bound to the model from the final fold.

In [63]:
# Pooled leave-one-out scores: each is one score computed over the collected
# held-out predictions, not an average over folds, so the printed labels no
# longer say "mean".
accuracy_test = accuracy_score(y_test_list, y_pred_list)
f1_test = f1_score(y_test_list, y_pred_list)

print(f'Accuracy test:\t {accuracy_test:.2f}')
print(f'F1 test:\t {f1_test:.2f}')

Accuracy test:	 mean 0.92
F1 test:	 mean 0.60


In [74]:
# NOTE(review): `clf` is whatever classifier an earlier cell left bound in the
# kernel. If that is the last leave-one-out fold, almost every row predicted
# below was also in its training data, so the low / high partition is largely
# in-sample. Confirm this is intended.
split_model = clf

x = ds[cols_model].drop('precio', axis=1)

# The classifier only sees the split-model columns. The per-segment
# regressions below also use `xx`, not the full `x`.
# NOTE(review): confirm the regressions are meant to use the reduced feature set.
xx = x[cols_split_model]
predict_classes = split_model.predict(xx)

# Boolean masks for the predicted low-price (0) and high-price (1) segments.
is_low = predict_classes == 0
is_high = predict_classes == 1
x_low, y_low = xx.iloc[is_low], y.iloc[is_low]
x_high, y_high = xx.iloc[is_high], y.iloc[is_high]

trainings = train_model(x_low, y_low, LinearRegression(), n_splits=5, n_repeats=5, print_results=True)
# Three blank lines between the two reports.
for _ in range(3):
    print("")
trainings = train_model(x_high, y_high, LinearRegression(), n_splits=5, n_repeats=5, print_results=True)

RMSE train:	 mean 172.50	 std 3.52
RMSE val:	 mean 176.87	 std 14.13

MAE train:	 mean 129.69	 std 2.68
MAE val:	 mean 132.89	 std 10.14



RMSE train:	 mean 616.73	 std 35.07
RMSE val:	 mean 747.26	 std 140.02

MAE train:	 mean 473.03	 std 22.39
MAE val:	 mean 579.37	 std 85.13


In [76]:
# Size-weighted average of the two per-segment validation errors.
# The two constants are typed in by hand, apparently from the "RMSE val" means
# printed by the previous cell (176.87 and 747.26, truncated). They go stale
# whenever that cell is re-run.
# NOTE(review): a size-weighted mean of RMSEs is not the pooled RMSE; that
# would need the weighted mean of the squared errors, then a square root.
rmse_val_low, rmse_val_high = 176, 747
n_low, n_high = len(x_low), len(x_high)
(n_low * rmse_val_low + n_high * rmse_val_high) / (n_low + n_high)

248.55801104972375