In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn import set_config

In [2]:
# Notebook-wide display settings: show every row and every column of a
# DataFrame instead of truncating the rendered output.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# NOTE(review): recent Matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8'); confirm 'seaborn' is valid for the installed version.
plt.style.use('seaborn')

# Load Data

In [4]:
# Read the sale listings CSV (comma-separated) into a DataFrame.
sale_data = pd.read_csv('data_available/sale_data_post_out.csv', sep=',')
# Last expression of the cell: displays (n_rows, n_columns).
sale_data.shape

(7387, 15)

In [5]:
# Summarise column names, non-null counts and dtypes of the loaded data.
sale_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7387 entries, 0 to 7386
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   daft_id          7387 non-null   int64  
 1   url              7387 non-null   object 
 2   name             7387 non-null   object 
 3   price            7387 non-null   float64
 4   sale_type        7387 non-null   object 
 5   floor_area       7387 non-null   int64  
 6   entered_renewed  7387 non-null   object 
 7   views            7387 non-null   float64
 8   type_house       7387 non-null   object 
 9   type             7387 non-null   object 
 10  scraping_date    7387 non-null   object 
 11  latitude         7387 non-null   float64
 12  longitude        7387 non-null   float64
 13  bedroom          7387 non-null   int64  
 14  bathroom         7387 non-null   int64  
dtypes: float64(4), int64(4), object(7)
memory usage: 865.8+ KB


In [6]:
# Display one randomly chosen row as a quick look at the record layout.
sale_data.sample()

Unnamed: 0,daft_id,url,name,price,sale_type,floor_area,entered_renewed,views,type_house,type,scraping_date,latitude,longitude,bedroom,bathroom
3666,15707860,https://www.daft.ie/for-sale/semi-detached-hou...,"61 Fán Glas, Kilmeadan, Co. Waterford",250000.0,For Sale by Private Treaty,107,2021-09-24,5902.0,house,buy,2021-10-19,52.229988,-7.234722,3,3


-----------

-----

-------

# Split Data

In [9]:
def split_train_test(df, test_ratio=.2, random_state=None):
    """Randomly split a DataFrame into a training and a test subset.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; rows are selected positionally with ``iloc``.
    test_ratio : float, default 0.2
        Fraction of rows assigned to the test set, between 0 and 1. The test
        size is ``int(len(df) * test_ratio)`` (rounded down).
    random_state : int or None, default None
        Seed for the shuffle. With ``None`` the global NumPy random state is
        used (so a prior ``np.random.seed`` call is still honoured); with an
        integer the split is reproducible across runs.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Disjoint row subsets of ``df`` that together contain every row.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside the interval [0, 1].
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f'test_ratio must be between 0 and 1, got {test_ratio!r}')

    n_rows = len(df)
    if random_state is None:
        # Legacy path: keep drawing from the global NumPy RNG.
        shuffled_indices = np.random.permutation(n_rows)
    else:
        # Dedicated generator so the split does not depend on global state.
        shuffled_indices = np.random.default_rng(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return df.iloc[train_indices], df.iloc[test_indices]

In [11]:
# Hold out 20% of the rows as a test set.
# NOTE(review): no seed is passed or set in the visible cells, so the row
# assignment changes on every run even though the shapes stay the same.
train_set, test_set = split_train_test(sale_data, .2)
print(train_set.shape, test_set.shape)

(5910, 15) (1477, 15)


Estos dos conjuntos (train y test) los guardaría en dos archivos separados.

-----------------

In [12]:
# Training target: an independent copy of the 'price' column.
y_train = train_set['price'].copy()
# Display the target's shape; it must be read from `y_train`, the only name
# this cell defines.
y_train.shape

(5910,)

In [14]:
# Compare the full training frame with the three-column feature subset.
print(train_set.shape)
# Keep only the numeric features used by the model.
X_train = train_set[['floor_area', 'bedroom', 'bathroom']]
print(X_train.shape)

(5910, 15)
(5910, 3)


In [26]:
def split_x_y(train_set, test_set, features, target='price'):
    """Separate features and target for both the train and the test split.

    Parameters
    ----------
    train_set, test_set : pandas.DataFrame
        Splits that contain the ``features`` columns and the ``target`` column.
    features : list of str
        Column names used as model inputs.
    target : str, default 'price'
        Column name of the value to predict.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Copies of the selected columns; the four shapes are printed as a
        side effect.
    """
    y_train, X_train = train_set[target].copy(), train_set[features].copy()
    y_test, X_test = test_set[target].copy(), test_set[features].copy()

    # One line per object, each followed by a space, then a trailing blank line.
    report = (f'X_train: {X_train.shape} \n'
              f'X_test: {X_test.shape} \n'
              f'y_train: {y_train.shape} \n'
              f'y_test: {y_test.shape} \n')
    print(report)
    return X_train, X_test, y_train, y_test

In [27]:
# Columns used as model inputs.
features = ['floor_area', 'bedroom', 'bathroom']

# Build the feature matrices and target vectors for both splits.
X_train, X_test, y_train, y_test = split_x_y(train_set, test_set, features, target='price')

X_train: (5910, 3) 
X_test: (1477, 3) 
y_train: (5910,) 
y_test: (1477,) 



In [28]:
# Confirm which columns ended up in the training feature matrix.
X_train.columns

Index(['floor_area', 'bedroom', 'bathroom'], dtype='object')

# Transform Data

## Transformation Pipeline

In [37]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [38]:
# Pipeline applied to numeric columns: a single StandardScaler step.
num_pipe = Pipeline([
    ('std_scaler', StandardScaler()),
    ])

In [39]:
# Pipeline for categorical columns: a single OneHotEncoder step.
# NOTE(review): the preprocessor built in the visible cells only registers
# the numeric pipeline, so this one appears unused here — confirm.
cat_pipe = Pipeline([
    ('one_hot_encoder', OneHotEncoder()),
    ])

In [40]:
# Numeric columns routed through `num_pipe` by the ColumnTransformer.
num_features = ['floor_area', 'bedroom', 'bathroom']

In [41]:
# Column-wise preprocessing: only the numeric pipeline is registered.
# NOTE(review): the trailing ['place'] comment presumably marks a candidate
# categorical column for a future transformer entry — confirm.
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_features),  # ['place']
    ])

## Linear Regression

In [43]:
from sklearn.linear_model import LinearRegression

In [44]:
# Full model: preprocessing followed by an ordinary linear regression.
lr = Pipeline(steps=[('preprocessor', preprocessor),
                     ('regressor', LinearRegression())
                    ])

In [45]:
# Fit the preprocessing steps and the regressor on the training split.
lr.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('std_scaler',
                                                                   StandardScaler())]),
                                                  ['floor_area', 'bedroom',
                                                   'bathroom'])])),
                ('regressor', LinearRegression())])

In [46]:
# Predict prices for the held-out test features.
y_pred = lr.predict(X_test)

In [47]:
def metrics_regression(y_test, y_pred, squared=False):
    """Print R², MAE, MAPE and the (root) mean squared error of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_test``.
    squared : bool, default False
        If False, the last line reports the root mean squared error (RMSE);
        if True, it reports the mean squared error (MSE).

    Returns
    -------
    None
        The metrics are only printed.

    Notes
    -----
    Relies on ``sklearn.metrics`` being available as ``metrics`` in the
    notebook namespace when the function is called.
    """
    r2 = metrics.r2_score(y_test, y_pred)
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mape = metrics.mean_absolute_percentage_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)

    # Take the root here instead of forwarding ``squared=``: that keyword was
    # deprecated and later removed from ``mean_squared_error``.
    # NOTE(review): for multi-output targets this is the root of the averaged
    # MSE — confirm only 1-D targets are passed.
    if squared:
        error_label, error_value = 'MSE', mse
    else:
        error_label, error_value = 'RMSE', mse ** 0.5

    print(f'R²: {r2}')
    print(f'MAE: {mae}')
    print(f'MAPE: {mape}')
    print(f'{error_label}: {error_value}')

In [49]:
from sklearn import metrics

# Report hold-out metrics for the linear regression (RMSE, not MSE).
metrics_regression(y_test=y_test, y_pred=y_pred, squared=False)

R²: 0.2542097152285159
MAE: 166846.74095583672
MAPE: 0.5284714299128194
RMSE: 255348.63731458155


In [50]:
def cross_validate_custom(estimator, scoring_dict, X_train, y_train, cv=10, return_train_score=False):
    """Cross-validate ``estimator`` and return scikit-learn's result dict.

    All arguments are forwarded to ``sklearn.model_selection.cross_validate``
    (``scoring_dict`` as ``scoring``). The keys of the result are printed
    before the dict is returned.
    """
    from sklearn.model_selection import cross_validate

    cv_options = dict(X=X_train, y=y_train, scoring=scoring_dict, cv=cv,
                      return_train_score=return_train_score)
    results = cross_validate(estimator, **cv_options)
    print(results.keys())
    return results

In [51]:
def scores_statistics(estimator, scoring_dict, X_train, y_train, cv=10, return_train_score=False, time_info=False):
    """Cross-validate ``estimator`` and print mean/std of every test score.

    Parameters
    ----------
    estimator : estimator object
        Model or pipeline passed to ``cross_validate``.
    scoring_dict : dict
        Maps a display name to a scorer; forwarded as ``scoring``. Results
        are read back from the ``'test_<name>'`` entries.
    X_train, y_train
        Training features and target.
    cv : int, default 10
        Forwarded to ``cross_validate`` as ``cv``.
    return_train_score : bool, default False
        Forwarded to ``cross_validate``.
    time_info : bool, default False
        If True, also print mean/std of the fit and score times.

    Returns
    -------
    dict
        The raw dictionary returned by ``cross_validate``.
    """
    # Local import so the function works regardless of which cells ran first.
    from sklearn.model_selection import cross_validate

    scores = cross_validate(estimator, X=X_train, y=y_train, scoring=scoring_dict, cv=cv,
                            return_train_score=return_train_score)

    if time_info:
        for timing_key in ('fit_time', 'score_time'):
            print(f'{timing_key} mean:', np.mean(scores[timing_key]))
            print(f'{timing_key} std:', np.std(scores[timing_key]))

    for key in scoring_dict:
        result_key = f'test_{key}'
        if result_key not in scores:
            # Skip names without a test result, but let real errors surface
            # instead of hiding them behind a bare ``except``.
            continue
        print(key, 'mean:', np.mean(scores[result_key]))
        print(key, 'std:', np.std(scores[result_key]), '\n')
    return scores

In [53]:
from sklearn.model_selection import cross_validate

# Scorers evaluated in each fold. The 'neg_*' scorers report negated errors,
# which is why the MAE / MAPE / RMSE values printed below are negative.
scoring = {'r2': 'r2', 
           'MAE': 'neg_mean_absolute_error', 
           'MAPE': 'neg_mean_absolute_percentage_error', 
           'RMSE': 'neg_root_mean_squared_error'}

# Rebind `lr` to a new pipeline object with the same steps for cross-validation.
lr = Pipeline(steps=[('preprocessor', preprocessor),
                     ('regressor', LinearRegression())
                    ])

# 5-fold cross-validation on the training split; the returned score dict is
# the cell's displayed output.
scores_statistics(estimator=lr, scoring_dict=scoring, 
                  X_train=X_train, y_train=y_train, cv=5, return_train_score=False)

r2 mean: 0.29015336342152354
r2 std: 0.009215464095573198 

MAE mean: -167712.81995603567
MAE std: 4528.833112565178 

MAPE mean: -0.536058383358235
MAPE std: 0.016745884098919032 

RMSE mean: -258771.62969879303
RMSE std: 10770.034783947762 



{'fit_time': array([0.0052197 , 0.00537157, 0.00509667, 0.00530553, 0.00535297]),
 'score_time': array([0.00369596, 0.00329375, 0.0032599 , 0.00376558, 0.00315285]),
 'test_r2': array([0.27794005, 0.28555833, 0.29736269, 0.28611264, 0.30379311]),
 'test_MAE': array([-175624.64366511, -167044.72492564, -164488.1871388 ,
        -162460.31667128, -168946.22737934]),
 'test_MAPE': array([-0.54578378, -0.56332638, -0.5260799 , -0.51549673, -0.52960513]),
 'test_RMSE': array([-275243.84501202, -259215.61307947, -258637.00188986,
        -241226.37643073, -259535.31208189])}