In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import math

In [60]:
#funciones auxiliares

def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    return(res)

def split_vals(a,n):
    return a[:n],a[n:]

def mape(Y_Predicted, Y_actual):
    mape = np.mean(np.abs((Y_actual - Y_Predicted)/Y_actual))*100
    return mape

def print_score(m):
    print("mape train {}".format(mape(m.predict(X_train),y_train)))
    print("mape x_valid {}".format(mape(m.predict(X_valid),y_valid)))
    print("mape test {}".format(mape(m.predict(X_test),y_test.values)))

def feat_importance(m,df_train):
    importance = m.feature_importances_
    importance = pd.DataFrame(importance,index=df_train.columns,columns=["Importance"])
    return importance.sort_values(by=['Importance'],ascending=False)

In [72]:
df = pd.read_csv('/content/gol.csv')#https://drive.google.com/file/d/143sUmCgyWaYd6BQ-byYzGpqdhsUKDZ5S/view?usp=sharing

In [62]:
# --- filtrado de datos

print("Registros antes del filtrado: {}".format(len(df)))

df = df[df.year>1993]

# quito los km extremos
df = df[(df.km > 1111) & (df.km < 800000)]
df[df.km != 111111]

# quito los que estan en pesos que hay algunos que ponen $1, otros que se equivocaron de moneda, etc
df = df[df.currency == 'USD']
print("Registros luego del filtrado: {}".format(len(df)))

# quito los precios extremos
df = df[(df.price > 1500) & (df.price < 96000)]

df.fillna(value={'transmission':'Manual'},inplace=True)



Registros antes del filtrado: 405
Registros luego del filtrado: 383


In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 383 entries, 0 to 404
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   383 non-null    int64  
 1   price                383 non-null    int64  
 2   currency             383 non-null    object 
 3   used                 383 non-null    object 
 4   engine_displacement  383 non-null    float64
 5   year                 383 non-null    int64  
 6   brand                383 non-null    object 
 7   model                383 non-null    object 
 8   doors                383 non-null    int64  
 9   traction_control     227 non-null    object 
 10  power                383 non-null    float64
 11  fuel_type            383 non-null    object 
 12  km                   383 non-null    float64
 13  transmission         383 non-null    object 
 14  trim                 383 non-null    object 
 15  permalink            383 non-null    obj

In [64]:
df_human = df.copy()

In [65]:
# --- entrenamiento del modelo

df = df[['price','engine_displacement','year','doors','km']]

df_raw_train, df_raw_test = train_test_split(df)

n_valid = 100
n_train = len(df_raw_train)-n_valid
X_train,X_valid = split_vals(df_raw_train.drop('price',axis=1),n_train)
y_train,y_valid = split_vals(df_raw_train['price'],n_train)
X_test = df_raw_test
y_test = X_test[['price']]
X_test = X_test.drop('price',axis=1)

m = RandomForestRegressor(n_jobs=1, oob_score=True)

m.fit(X_train, y_train)

print_score(m)

mape train 3.285896142675943
mape x_valid 7.376225197410554
mape test 40.76088144517076


In [66]:
# exploracion de datos

importance = feat_importance(m,X_train)
importance[:]

Unnamed: 0,Importance
year,0.870602
km,0.084196
doors,0.026642
engine_displacement,0.018561


In [67]:
df_human['predicted'] = m.predict(df.drop(columns='price'))

In [68]:
df_human['predicted'] = df_human.predicted.apply(lambda x: round(x))
df_human['abs_diff'] = df_human.apply(lambda row: round(abs(row['price'] - row['predicted'])), axis=1)
df_human['diff'] = df_human.apply(lambda row: round(row['predicted'] - row['price']), axis=1)

In [69]:
f = df_human.sort_values(by='diff',ascending=False)
f[['brand', 'model','doors','km','price', 'predicted','year','diff','permalink']].head(100)

Unnamed: 0,brand,model,doors,km,price,predicted,year,diff,permalink
274,Volkswagen,Gol,5,97000.0,8500,11009,2013,2509,https://auto.mercadolibre.com.uy/MLU-480982010...
30,Volkswagen,Gol,5,120000.0,7900,10231,2013,2331,https://auto.mercadolibre.com.uy/MLU-480411436...
170,Volkswagen,Gol,3,11111.0,4000,6309,2000,2309,https://auto.mercadolibre.com.uy/MLU-480810783...
70,Volkswagen,Gol,3,225993.0,3690,5650,1997,1960,https://auto.mercadolibre.com.uy/MLU-480708588...
150,Volkswagen,Gol,5,40000.0,4200,6138,1998,1938,https://auto.mercadolibre.com.uy/MLU-480438450...
...,...,...,...,...,...,...,...,...,...
349,Volkswagen,Gol,5,130000.0,9000,9387,2013,387,https://auto.mercadolibre.com.uy/MLU-480280902...
244,Volkswagen,Gol,5,81000.0,11800,12185,2016,385,https://auto.mercadolibre.com.uy/MLU-480758216...
22,Volkswagen,Gol,3,98000.0,7990,8369,2009,379,https://auto.mercadolibre.com.uy/MLU-480642911...
340,Volkswagen,Gol,5,115776.0,12690,13057,2017,367,https://auto.mercadolibre.com.uy/MLU-479634932...


In [70]:
f[['brand', 'model','doors','km','price', 'predicted','year','diff','permalink']].head(100).to_csv('oportunidades.csv',index=False)

In [71]:
f.head(15).permalink.values

array(['https://auto.mercadolibre.com.uy/MLU-480982010-volkswagen-gol-16-101cv-2013-_JM',
       'https://auto.mercadolibre.com.uy/MLU-480411436-volkswagen-gol-14-power-psac-83cv-2013-_JM',
       'https://auto.mercadolibre.com.uy/MLU-480810783-volkswagen-gol-2000-10-gl-_JM',
       'https://auto.mercadolibre.com.uy/MLU-480708588-volkswagen-gol-16-nafta-buen-estado-48-cuotas-_JM',
       'https://auto.mercadolibre.com.uy/MLU-480438450-volkswagen-gol-1998-10-gl-5-p-_JM',
       'https://auto.mercadolibre.com.uy/MLU-478790829-volkswagen-gol-16-power-101cv-2015-_JM',
       'https://auto.mercadolibre.com.uy/MLU-481112923-volkswagen-gol-16-pack-i-101cv-2012-_JM',
       'https://auto.mercadolibre.com.uy/MLU-480426308-volkswagen-gol-16-i-power-601-2008-_JM',
       'https://auto.mercadolibre.com.uy/MLU-480831214-volkswagen-gol-16-ano-2013-sedan-full-al-dia-7900-dolares-_JM',
       'https://auto.mercadolibre.com.uy/MLU-481093742-volkswagen-gol-16-power-101cv-2017-_JM',
       'https://auto.