In [2]:
# bibliotecas
import pandas as pd
import time
import pickle

# funções 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# modelos
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

|             | Random Forest(est= 11, depth=27)   | Random Forest(est=100, depth=27)  | LinearRegression(fit_intercept, jobs=-1)   |
|-------------|------------|------------|------------|
| train_score     | 🟨0.9699817221823089    | 🟩0.9764383475241958    | 🟥0.720336930225542    |
| test_score     | 🟨0.8545331007661833    |🟩0.8642377492706509    | 🟥0.7211501035967206    |
| MAE($)  | 🟨1795.9145966752583    | 🟩1730.1057586683519    | 🟥2838.1486088502406    |
| time(s)     | 🟨8.10574460029602    | 🟥66.94056940078735    | 🟩0.4054858684539795    |

### Célula usada para encontrar os melhores hiperparâmetros da Random Forest

```best_error = float('inf')
best_est = 0
best_depth = 0

for est in range(10, 101, 5):
    for depth in range(20, 31):
        model = RandomForestRegressor(random_state=1506, n_estimators=est, max_depth=depth)
        model.fit(features_train, target_train)
        predictions_test = model.predict(features_test)
        error = mean_squared_error(target_test, predictions_test)**0.5
        print(f"Validação de REQM para n_estimators={est}, depth={depth} é {error}")
        if error < best_error:
            best_error = error
            best_est = est
            best_depth = depth

print(f"Melhor REQM de teste: {best_error} com n_estimators={best_est} e max_depth={best_depth}")
```

# Melhor modelo

In [3]:
# carregando dataset
car_data = pd.read_csv('datasets/car_data.csv')

In [4]:
# retirando colunas que não serão utilizadas
car_data = car_data.drop(['date_posted', 'days_listed', 'year_posted', 'car_age'] , axis=1)

In [5]:
# Identificar outliers usando IQR
Q1 = car_data['price'].quantile(0.25)
Q3 = car_data['price'].quantile(0.75)
IQR = Q3 - Q1

# Filtrar outliers
car_data = car_data[~((car_data['price'] < (Q1 - 1.5 * IQR)) | (car_data['price'] > (Q3 + 1.5 * IQR)))]

In [6]:
# trocar todos os espaços por underlines
car_data.columns = car_data.columns.str.replace(' ', '_') 
# trocar todos os espaços por underlines nas linhas
car_data['model'] = car_data['model'].str.replace(' ', '_')


In [7]:
# transformando a coluna condition em numérica colocando "pesos" para cada condição
condition_mapping = {
    "new": 5,
    "like new": 4,
    "excellent": 3,
    "good": 2,
    "fair": 1,
    "salvage": 0
}

# atribuindo os valores numéricos ao dataset
car_data['condition'] = car_data['condition'].map(condition_mapping)

In [8]:
# separando features e target
features = car_data.drop('price', axis=1)
target = car_data['price']

In [9]:
# aplicando One-Hot Encoding nas variáveis categóricas restantes
features_encoded = pd.get_dummies(features, columns=['model', 'fuel', 'transmission', 'type', 'paint_color', 'brand'])

In [10]:
# separando dados de treino e teste
features_train, features_test, target_train, target_test = train_test_split(features_encoded, target, test_size=0.2, random_state=1506, shuffle=True)

In [11]:
model = RandomForestRegressor(random_state=1506, n_estimators=11, max_depth=27, n_jobs=-1)

In [12]:
start = time.time()
model.fit(features_train, target_train)
end = time.time()
tempo = end - start
tempo

3.8985562324523926

In [13]:
print(model.score(features_train, target_train),
model.score(features_test, target_test))

0.9699817221823089 0.8545331007661833


In [14]:
predictions = model.predict(features_test)
print(predictions)

[19900.         28466.61043674  4698.45454545 ...  8359.08431477
  5525.14146465 13956.73783287]


In [15]:
print(target_test)

33882    19900
14809    34000
32496     1800
28582     2900
7332     25900
         ...  
36037    12000
49324     5200
22230     9900
49193     4900
38880    12500
Name: price, Length: 9759, dtype: int64


In [16]:
rmse = mean_squared_error(target_test, predictions)**0.5
mae = mean_absolute_error(target_test, predictions)
print(f' RMSE: {rmse} \n MAE: {mae}')

 RMSE: 2913.364558864547 
 MAE: 1795.9145966752583


# Salvando Dataset

In [17]:
car_data = car_data.drop(['price'], axis=1)

# Salvando Modelo

In [18]:
#car_data.to_csv('model.csv', index=False)

In [19]:
df_structure = features_encoded.drop(features_encoded.index)

In [20]:
#df_structure.to_csv('structure.csv', index=False)

In [21]:
# Saving model to pickle file
with open("predict.pkl", "wb") as file: # file is a variable for storing the newly created file, it can be anything.
    pickle.dump(model, file) # Dump function is used to write the object into the created file in byte format.

In [22]:
features_train.head()

Unnamed: 0,model_year,condition,cylinders,odometer,is_4wd,model_acura_tl,model_bmw_x5,model_buick_enclave,model_cadillac_escalade,model_chevrolet_camaro,...,brand_gmc,brand_honda,brand_hyundai,brand_jeep,brand_kia,brand_nissan,brand_ram,brand_subaru,brand_toyota,brand_volkswagen
36479,2011,3,8,181318,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
30942,2018,4,6,37650,0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
19752,2009,2,6,133000,0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
32121,2001,1,4,200000,0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
38071,2017,3,8,50000,0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [23]:
dft = features_encoded.drop(features_encoded.index)
dft.head()

Unnamed: 0,model_year,condition,cylinders,odometer,is_4wd,model_acura_tl,model_bmw_x5,model_buick_enclave,model_cadillac_escalade,model_chevrolet_camaro,...,brand_gmc,brand_honda,brand_hyundai,brand_jeep,brand_kia,brand_nissan,brand_ram,brand_subaru,brand_toyota,brand_volkswagen


In [24]:
input_model = ["bmw","bmw x5",2011,0,6,"gas","automatic","SUV","unknown",1,10000]

In [25]:
columns_user = [f'brand_{input_model[0]}', f'model_{input_model[1]}', 'model_year', 'condition', 'cylinders', f'fuel_{input_model[5]}', f'transmission_{input_model[6]}', f'type_{input_model[7]}', f'paint_color_{input_model[8]}', 'is_4wd', 'odometer']
columns_user

['brand_bmw',
 'model_bmw x5',
 'model_year',
 'condition',
 'cylinders',
 'fuel_gas',
 'transmission_automatic',
 'type_SUV',
 'paint_color_unknown',
 'is_4wd',
 'odometer']

In [26]:
dftt = pd.DataFrame([input_model], columns=columns_user)
df_final = pd.concat([dft, dftt], axis=0)
df_final = df_final.fillna(False)
df_final.head()

  df_final = df_final.fillna(False)


Unnamed: 0,model_year,condition,cylinders,odometer,is_4wd,model_acura_tl,model_bmw_x5,model_buick_enclave,model_cadillac_escalade,model_chevrolet_camaro,...,brand_honda,brand_hyundai,brand_jeep,brand_kia,brand_nissan,brand_ram,brand_subaru,brand_toyota,brand_volkswagen,model_bmw x5
0,2011,0,6,10000,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,bmw x5
