In [155]:
# bibliotecas
import pickle
import time

import pandas as pd
# funções 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# modelos
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

|             | Random Forest (n_estimators=11, max_depth=27)   | Random Forest (n_estimators=100, max_depth=27)  | Linear Regression (fit_intercept=True, n_jobs=-1)   |
|-------------|------------|------------|------------|
| train_score     | 🟨0.9698145941866823    | 🟩0.975808533590373    | 🟥0.720336930225542    |
| test_score     | 🟨0.8547403645319819    | 🟩0.8656546507386057    | 🟥0.7211501035967206    |
| MAE (\$)  | 🟨1872.6269980729571    | 🟩1785.8674366448959    | 🟥2838.1486088502406    |
| time (s)     | 🟨8.10574460029602    | 🟥66.94056940078735    | 🟩0.4054858684539795    |

# Random Forest
legenda: treino, teste, erro, tempo<br><br>
n_estimators=11, max_depth=27 -> 0.9557926155884157 0.8334794802644814 3931.966016272742 8.193240880966187<br> <br>
n_estimators=100, max_depth=27 -> 0.9650129200912126 0.8408169218841756 3873.6263369273256 79.8025176525116<br>
# Linear Regression
legenda: treino, teste, erro<br><br>
fit_intercept=True, n_jobs=-1 -> 0.6463556388680558 0.5734454636073181 6866.667619648236<br>

### Célula usada para encontrar os melhores hiperparâmetros da Random Forest

```python
best_error = float('inf')
best_est = 0
best_depth = 0

for est in range(10, 101, 5):
    for depth in range(20, 31):
        model = RandomForestRegressor(random_state=1506, n_estimators=est, max_depth=depth)
        model.fit(features_train, target_train)
        predictions_test = model.predict(features_test)
        error = mean_squared_error(target_test, predictions_test)**0.5
        print(f"Validação de REQM para n_estimators={est}, depth={depth} é {error}")
        if error < best_error:
            best_error = error
            best_est = est
            best_depth = depth

print(f"Melhor REQM de teste: {best_error} com n_estimators={best_est} e max_depth={best_depth}")
```

# Melhor modelo

In [156]:
# Load the car dataset from the project-relative CSV file
car_data = pd.read_csv(filepath_or_buffer='datasets/car_data.csv')

In [157]:
# Discard the columns that will not be used
unused_columns = ['date_posted', 'days_listed', 'year_posted', 'car_age']
car_data = car_data.drop(columns=unused_columns)

In [158]:
# Locate price outliers with the IQR rule: anything further than
# 1.5 * IQR below the first quartile or above the third quartile.
prices = car_data['price']
Q1, Q3 = prices.quantile(0.25), prices.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Keep only the rows that are not flagged on either side
is_outlier = (prices < lower_fence) | (prices > upper_fence)
car_data = car_data[~is_outlier]

In [159]:
# Turn the `condition` column into a number by giving each condition a weight.
# The position in this list is the weight: "salvage" -> 0 up to "new" -> 5.
conditions_by_weight = ["salvage", "fair", "good", "excellent", "like new", "new"]
condition_mapping = {
    label: weight for weight, label in enumerate(conditions_by_weight)
}

# Write the numeric weights back into the dataset
# (Series.map leaves NaN for any label that is not a key of the mapping)
car_data['condition'] = car_data['condition'].map(condition_mapping)

In [160]:
# Separate the target (price) from the features (every other column)
target = car_data['price']
features = car_data.drop(columns='price')

In [161]:
# Apply one-hot encoding to the remaining categorical variables
categorical_columns = ['model', 'fuel', 'transmission', 'type', 'paint_color', 'brand']
features_encoded = pd.get_dummies(features, columns=categorical_columns)

In [162]:
# Split into train and test sets: 20% of the rows are held out for testing,
# shuffled with a fixed seed so the split is reproducible.
features_train, features_test, target_train, target_test = train_test_split(
    features_encoded,
    target,
    test_size=0.2,
    random_state=1506,
    shuffle=True,
)

In [163]:
# Random forest configured with 11 trees and a maximum depth of 27;
# n_jobs=-1 asks scikit-learn to use all available processors.
model = RandomForestRegressor(
    n_estimators=11,
    max_depth=27,
    random_state=1506,
    n_jobs=-1,
)

In [164]:
# Time the training step. perf_counter() is a monotonic, high-resolution clock
# meant for measuring durations, so the result is not distorted if the system
# wall clock is adjusted while the model is fitting (time.time() can be).
start = time.perf_counter()
model.fit(features_train, target_train)
end = time.perf_counter()
tempo = end - start  # elapsed training time, in seconds
tempo

5.704869747161865

In [165]:
# Score the fitted model on the training data and on the held-out test data,
# then print both values on one line (train first, test second).
train_score = model.score(features_train, target_train)
test_score = model.score(features_test, target_test)
print(train_score, test_score)

0.9699817221823089 0.8545331007661833


In [166]:
# Predict the target for every row of the test features and show the result
predictions = model.predict(X=features_test)
print(predictions)

[19900.         28466.61043674  4698.45454545 ...  8359.08431477
  5525.14146465 13956.73783287]


In [167]:
# Show the true test-set prices (the values the predictions are compared against)
print(target_test)

33882    19900
14809    34000
32496     1800
28582     2900
7332     25900
         ...  
36037    12000
49324     5200
22230     9900
49193     4900
38880    12500
Name: price, Length: 9759, dtype: int64


In [168]:
# Error metrics on the test set: RMSE is the square root of the mean squared
# error, MAE is the mean absolute error.
mse = mean_squared_error(target_test, predictions)
rmse = mse ** 0.5
mae = mean_absolute_error(target_test, predictions)
print(f' RMSE: {rmse} \n MAE: {mae}')

 RMSE: 2913.364558864547 
 MAE: 1795.9145966752583


# Salvando Dataset

In [169]:
# Write the processed dataset to disk (the DataFrame index is written too,
# because to_csv keeps it by default).
car_data.to_csv(path_or_buf='model.csv')

# Salvando Modelo

In [170]:
# Serialize the trained model to a pickle file.
# Requires `import pickle`, which the notebook's import cell must provide.
with open("predict.pkl", "wb") as model_file:  # "wb": pickle writes bytes
    pickle.dump(model, model_file)  # write the model object into the open file

In [171]:
# Deserialize the model from the pickle file, then use it as you normally would.
# NOTE: only unpickle files you created yourself; pickle.load can execute
# arbitrary code when given an untrusted file.
with open("predict.pkl", "rb") as model_file:
    model_pkl = pickle.load(model_file)

prediction = model_pkl.predict(features_test)  # predict with the reloaded model
print("O carro custa", prediction[0])  # show the first predicted price

O carro custa 19900.0
