# 1. Library importation
The libraries needed for data manipulation, modelling, and evaluation metrics are imported here.

In [None]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

# 2. Data preprocessing

In [None]:
# Feature columns, grouped by the preprocessing branch each group is routed to.
numerical_cols = [
    'year',
    'odometer',
    'lat',
    'long',
]
# Categorical column handled by the ordinal-encoding branch.
categoricalOrdinal_cols = ['model']
# Categorical columns handled by the one-hot-encoding branch.
categoricalHot_cols = [
    'manufacturer',
    'fuel',
    'drive',
    'type',
]

## 2.1 Data reading
The CSV file is loaded into a pandas DataFrame, reading only the feature columns and the target column (`price`).


In [None]:
# Read only the feature columns plus the target; every other CSV column is skipped.
cols_to_keep = [
    *numerical_cols,
    *categoricalOrdinal_cols,
    *categoricalHot_cols,
    'price',
]
df = pd.read_csv('csvs/vehicles.csv', usecols=cols_to_keep)

## 2.2 Price filtering
Listings with unreasonably low or high prices (500 or less, or 100000 or more) are excluded from the dataframe in order to avoid inaccurate predictions.

In [None]:
# Keep only the rows whose price lies strictly between 500 and 100000.
df = df[df['price'].gt(500) & df['price'].lt(100000)]

## 2.3 Target and features
The target and features columns are split up.

In [None]:
# Target: the listing price the model learns to predict.
y = df.loc[:, 'price']


In [None]:
# Features: every input column, leaving out the target.
X = df[[*numerical_cols, *categoricalHot_cols, *categoricalOrdinal_cols]]

## 2.4 Imputer
A `SimpleImputer` with its default settings is created to fill missing values in the numerical columns with each column's mean.

In [None]:
# Numerical branch: fill missing values with the column mean.
# 'mean' is SimpleImputer's default strategy; it is spelled out here for clarity.
imputer = SimpleImputer(strategy='mean')

## 2.5 Encoders
Pipelines for categorical data are created. Both first use a `SimpleImputer` to replace missing values with the most frequent value of the column. Then either a `OneHotEncoder` (for columns with a lower variety of values) or an `OrdinalEncoder` (for columns with a higher variety of values) is applied to make the categorical values usable by the model.

In [None]:
# One-hot branch: fill gaps with the most frequent value, then expand each
# category into its own column of a dense array. Categories that were not seen
# during fitting are encoded as all zeros (handle_unknown='ignore').
hot_encoder = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (
            'hot_encoder',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
        ),
    ]
)

In [None]:
# Ordinal branch: fill gaps with the most frequent value, then map each
# category to an integer code. Categories that were not seen during fitting
# are encoded as -1.
ordinal_encoder = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (
            'ordinal_encoder',
            OrdinalEncoder(
                handle_unknown='use_encoded_value',
                unknown_value=-1,
            ),
        ),
    ]
)

## 2.6 Preprocessor
A preprocessor using `ColumnTransformer` is created in order to apply the specific transformations simultaneously, concatenating their results into a final dataset ready for the model.

In [None]:
# Route each column group through its own transformer; the outputs are
# concatenated in the order the transformers are listed.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', imputer, numerical_cols),
        ('ordinal', ordinal_encoder, categoricalOrdinal_cols),
        ('hot', hot_encoder, categoricalHot_cols),
    ]
)

## 2.7 Splitting data
`train_test_split` is used in order to randomly select data for training and validation.

In [None]:
# Reproducible random split. test_size=0.25 is train_test_split's default when
# neither size is given; it is spelled out here for clarity.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

## 2.8 Data transformation
The preprocessor is fitted on the training data only and is then used to transform both the training and validation features.

In [None]:
# Fit the preprocessor on the training features only, so nothing is learned
# from the validation data, then apply the fitted transformation to both sets.
X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)

# 3. Model training
The `XGBRegressor` model is created with the most optimized hyperparameters.

In [None]:
# Gradient-boosted regressor. n_estimators is only an upper bound on the
# number of trees, since early stopping is enabled.
final_model = XGBRegressor(
    n_estimators=5000,
    learning_rate=0.03,
    early_stopping_rounds=50,
    max_depth=12,
    min_child_weight=3,
    subsample=0.7,
    colsample_bytree=0.7,
    random_state=1,
    n_jobs=-1,
)

## 3.1 Model fitting
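The model is trained on the training data, with the validation data passed as its `eval_set`. Because `early_stopping_rounds` is set, training stops once the validation score has not improved for that many rounds.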

In [None]:
# Train on the training split. The validation split is passed as the
# evaluation set, and per-round evaluation logging is silenced.
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
)

# 4. Results
Both the `mean_absolute_error` and `mean_absolute_percentage_error` of the train and validation data are calculated so it is easier to see if any overfitting happened.

In [None]:
# Compute MAE and MAPE on both splits so overfitting is easy to spot.
# scikit-learn metrics take (y_true, y_pred). The order does not matter for
# MAE, but MAPE divides by |y_true|, so the real prices must come first;
# otherwise the error is normalised by the predictions instead.
predictions = final_model.predict(X_valid)
mae = mean_absolute_error(y_valid, predictions)
train_preds = final_model.predict(X_train)
mae_train = mean_absolute_error(y_train, train_preds)
print(f'mae valid: {mae}')
print(f'mae train: {mae_train}')
# MAPE on the validation split, then on the training split.
mape = mean_absolute_percentage_error(y_valid, predictions)
print(mape)
mape_train = mean_absolute_percentage_error(y_train, train_preds)
print(mape_train)


# 5. Exporting
The fitted preprocessor and the trained model are saved to disk so they can be used in FastAPI.

In [None]:
# Persist the fitted preprocessor and the trained model as separate files.
joblib.dump(value=preprocessor, filename='car_preprocessor.pkl')
joblib.dump(value=final_model, filename='car_price_model.pkl')