In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

import os
import wandb
from dotenv import load_dotenv

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score

import joblib
import pickle

from sklearn.impute import SimpleImputer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
    PolynomialFeatures,
    FunctionTransformer,
)

# Seed value shared by the stochastic steps of the notebook:
random_state = 42

# Use a 12 pt serif font in matplotlib figures:
plt.rcParams.update({'font.family': 'serif', 'font.size': 12})

In [2]:
# Data ingestion: read the evaluation CSV(s) found under DATASET_LOC.
DATASET_LOC = '../data/'
file0 = 'test_file.csv'
# file1 = 'data1.csv'  # optional second file; enable together with the df1 lines below

df0 = pd.read_csv(os.path.join(DATASET_LOC, file0))
# df1 = pd.read_csv(os.path.join(DATASET_LOC, file1))

# ignore_index=True gives the combined frame a fresh, unique RangeIndex, so row
# labels stay unambiguous when more than one file is concatenated.
df = pd.concat([df0], ignore_index=True)
# df = pd.concat([df0, df1], ignore_index=True)

# Seed the preview so the displayed rows are the same on every run.
df.sample(10, random_state=random_state)

FileNotFoundError: [Errno 2] No such file or directory: '../data/test_file.csv'

In [49]:
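# Load preparation parameters (the transformer class below is defined first; presumably the pickled pipeline needs it in scope to load - confirm)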
class MergeElectricToOther(BaseEstimator, TransformerMixin):
    """Transformer that relabels 'Electric' entries of one column as 'Other'.

    Parameters
    ----------
    column_name : label of the column whose values are rewritten.
    """

    def __init__(self, column_name):
        self.column_name = column_name

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose target column has 'Electric' replaced by 'Other'."""
        target = self.column_name
        # Work on a copy so the caller's frame is left untouched.
        relabelled = X.copy()
        relabelled[target] = relabelled[target].replace(to_replace='Electric', value='Other')
        return relabelled
        
# Locations of the saved artifacts, all relative to DATASET_LOC.
preparation_file = os.path.join(DATASET_LOC, 'preparation', 'preparation_params.pkl')  # preparation parameters
preprocessor_file = os.path.join(DATASET_LOC, 'model', 'preprocessor.pkl')  # preprocessing pipeline
model_file = os.path.join(DATASET_LOC, 'model', 'model.pkl')  # trained model

# NOTE(review): joblib.load unpickles its input and can run arbitrary code;
# only load artifacts that come from a trusted source.
prep_params = joblib.load(preparation_file)

# Load preprocessing and model pipelines
preprocessor = joblib.load(preprocessor_file)
model = joblib.load(model_file)

In [50]:
# Build the evaluation frame from the raw data using the saved preparation parameters.
# Rows are selected with one boolean mask instead of label-based in-place drops:
# the mask is positional, so it stays correct when index labels repeat, and it
# leaves no partially mutated frame behind if the cell is re-run.

# Remove the columns that the preparation step discards.
test_df = df.drop(columns=prep_params['columns_to_drop'])

# Maximum number of missing values a row may contain.
max_nulls_allowed = prep_params['max_nulls_allowed']
# Number of missing values in each row.
nulls_per_row = test_df.isnull().sum(axis=1)

# Rows to reject: engine size below the stored minimum, or year above the stored
# maximum. A comparison with NaN is False, so rows with a missing engineSize or
# year are kept here, exactly as the label-based drops kept them.
engine_too_small = test_df['engineSize'] < prep_params["min_engineSize"]
year_too_late = test_df['year'] > prep_params["max_year"]

keep_rows = (nulls_per_row <= max_nulls_allowed) & ~engine_too_small & ~year_too_late
# .copy() makes test_df an independent frame rather than a slice of the filtered data.
test_df = test_df.loc[keep_rows].copy()

# Apply the saved preparation pipeline (the original note says it combines columns).
test_df_transformed = prep_params["pipeline"].transform(test_df)

In [51]:
# Display the frame produced by the preparation pipeline for a visual check.
test_df_transformed

Unnamed: 0,brand_model,year,price,transmission,mileage,fuelType,tax,milesPerGallon,engineSize
0,Ford_Focus,2017,11498,Manual,18150,Petrol,20,60.1,1.0
1,VW_Touareg,2019,40995,Semi-Auto,1000,Diesel,145,34.5,3.0
2,VW_Tiguan,2017,23495,Semi-Auto,14839,Diesel,145,57.6,2.0
3,VW_Touareg,2018,37381,Semi-Auto,19718,Diesel,145,34.0,3.0
4,VW_Up,2017,7488,Manual,28449,Petrol,150,64.2,1.0
...,...,...,...,...,...,...,...,...,...
6620,VW_Golf,2019,20980,Semi-Auto,5398,Petrol,145,43.5,1.5
6621,Ford_Focus,2019,17699,Manual,5443,Petrol,145,60.1,1.0
6622,VW_Golf,2019,23940,Automatic,9535,Diesel,145,49.6,2.0
6623,Ford_EcoSport,2018,14500,Manual,15644,Petrol,145,54.3,1.0


In [52]:
# Select the columns listed under preprocessor["cols"] from the filtered frame.
# NOTE(review): X_test is reassigned in the following cell from test_df_transformed,
# so this selection looks unused afterwards - confirm whether it is still needed.
X_test = test_df[preprocessor["cols"]]
# Target values: the "price" column of the filtered frame.
# NOTE(review): the target comes from test_df while the features come from
# test_df_transformed; this assumes the preparation pipeline keeps the same rows
# in the same order - TODO confirm.
y_test = test_df["price"]

In [53]:
# Run the loaded preprocessor's transform on the prepared frame; this rebinds X_test
# to the transformed features that are passed to the model.
X_test = preprocessor["preprocessor"].transform(test_df_transformed)

In [59]:
# Report the loaded model's score and mean absolute error on the test data.
# NOTE(review): the model label is hard-coded; it is not read from the loaded object.
print("Model: Gradient boosting")
r2_value = model.score(X_test, y_test)
print(f'R^2: {r2_value}')
mae_value = mean_absolute_error(y_test, model.predict(X_test))
print(f'MAE: {mae_value}')
print('\n')

Model: Gradient boosting
R^2: 0.9699334356710758
MAE: 798.6473551525968




In [63]:
# Side-by-side view of true prices and model predictions.
# Bound to its own name so the raw dataset held in `df` is not overwritten;
# rebinding `df` here would make the preparation cell fail when re-run.
predictions_df = pd.DataFrame({'y_test': y_test, 'y_pred': model.predict(X_test)})
predictions_df

Unnamed: 0,y_test,y_pred
0,11498,10757.904938
1,40995,41974.687354
2,23495,22082.550870
3,37381,35503.576731
4,7488,7233.115601
...,...,...
6620,20980,21278.574743
6621,17699,16835.172585
6622,23940,25101.448791
6623,14500,13583.196652
