## 0.0. Imports

In [172]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as pp
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import  mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import seaborn as sns
import plotly.express as px
import sweetviz as sv
import pickle
from boruta import BorutaPy
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR




# Let pandas display up to 100 rows and 100 columns before truncating.
pd.set_option('display.max_rows', 100, 'display.max_columns', 100)

# Load the two CSV inputs; paths are relative to the working directory.
df_raw = pd.read_csv('../data/treino.csv')
df_test = pd.read_csv('../data/teste.csv')

In [203]:
# 5-fold shuffled cross validation (plain KFold, not stratified)
def cross_validation(model_name, model, x_train, y_train):
    """Return the model's mean absolute error over 5 shuffled K-fold splits.

    Both the predictions and the held-out targets are passed through
    ``np.expm1`` before scoring, i.e. the inverse of a ``log1p`` transform
    is applied so the error is measured on the untransformed scale.

    Args:
        model_name: Not used by the function; kept so existing call sites
            keep working.
        model: Estimator exposing ``fit`` and ``predict``. It is refit in
            place on every fold, so after the call it holds the fit of the
            last fold.
        x_train: Feature table, sliced positionally with ``.iloc``.
        y_train: Target values, sliced positionally with ``.iloc``.

    Returns:
        The mean of the per-fold MAE values (each rounded to 2 decimals),
        itself rounded to 2 decimals.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=5)
    fold_maes = []

    for fit_idx, val_idx in folds.split(x_train, y_train):
        # model training on the fold's training rows
        model.fit(x_train.iloc[fit_idx], y_train.iloc[fit_idx])

        # prediction and targets, both mapped back with expm1 before scoring
        predicted = np.expm1(model.predict(x_train.iloc[val_idx]))
        actual = np.expm1(y_train.iloc[val_idx])

        fold_maes.append(np.round(mean_absolute_error(actual, predicted), 2))

    return np.round(np.mean(fold_maes), 2)

def simple_model_test(model, nome):
    """Fit ``model`` on the notebook-level split and print its hold-out errors.

    Reads the module-level names ``X_train``, ``y_train``, ``X_test`` and
    ``y_test_``. Predictions are passed through ``np.expm1`` before being
    compared with ``y_test_``.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; fitted in place.
        nome: Label printed on the first line of the report.

    Returns:
        The mean absolute error, rounded to 2 decimals.
    """
    model.fit(X_train, y_train)

    # Apply expm1 to the raw predictions before scoring against y_test_.
    predictions = np.expm1(model.predict(X_test))

    mae = np.round(mean_absolute_error(y_test_, predictions), 2)
    mape = np.round(mean_absolute_percentage_error(y_test_, predictions), 2)
    rmse = np.round(np.sqrt(mean_squared_error(y_test_, predictions)), 2)

    report = '{}\n MAE: {}\n MAPE: {}\n RMSE: {}'.format(nome, mae, mape, rmse)
    print(report)
    return mae

## 1.0. Data Description

In [204]:
# Work on a copy so df_raw stays untouched.
df1 = df_raw.copy()

# rename columns (positional: the list must match the CSV's column order)
df1.columns = ['id', 'num_fotos', 'marca', 'modelo', 'versao', 'ano_de_fabricacao',
       'ano_modelo', 'odometro', 'cambio', 'num_portas', 'tipo', 'blindado',
       'cor', 'tipo_vendedor', 'cidade_vendedor', 'estado_vendedor',
       'tipo_anuncio', 'entrega_delivery', 'troca', 'elegivel_revisao',
       'aceita_troca', 'dono_unico',
       'todas_revisoes_concessionaria',
       'ipva_pago', 'licenciado',
       'garantia_de_fabrica',
       'todas_revisoes_agenda',
       'alienado', 'preco']

# fill na
df1['num_fotos'] = df1['num_fotos'].fillna(0)

df1 = df1.drop('alienado', axis=1)

# The seven columns right before the last one become 0/1 flags:
# missing -> 0, any present value -> 1.
na_cols = df1.columns[-8:-1]

for col in na_cols:
    df1[col] = np.where(df1[col].isna(), 0, 1)

# Cast the last eight columns to int64. The labels are read straight from the
# column index instead of transposing the whole frame (df1.T.tail(8).index),
# which selects the same columns without copying every row.
# NOTE(review): the last eight columns include 'preco', so the target is cast
# to int64 here as well - confirm that truncating it is intended.
df1[df1.columns[-8:]] = df1[df1.columns[-8:]].astype('int64')

df1['ano_modelo'] = df1['ano_modelo'].astype('int64')
df1['num_fotos'] = df1['num_fotos'].astype('int64')

## 2.0. Data Filtering

In [205]:
import sweetviz as sv
# my_report = sv.analyze(df1, target_feat='preco')
# my_report.show_html() # Default arguments will generate to "SWEETVIZ_REPORT.html"

# Remove the 'elegivel_revisao' column from the working frame.
df1 = df1.drop(columns=['elegivel_revisao'])

In [206]:
# plt.figure(figsize=(15,7))
# sns.barplot(data=df1[df1['cor'].isin(['Branco', 'Preto', 'Cinza', 'Prata'])], y='preco', x='estado_vendedor', hue='cor', estimator=np.median);

In [207]:
# sns.barplot(data=df1, y='preco', x='estado_vendedor', hue='num_portas', estimator=np.mean);

In [208]:
# sns.barplot(data=df1, y='preco', x='estado_vendedor', hue='ipva_pago', estimator=np.mean);

## 3.0. Data Transformation


In [209]:
# Every column whose dtype is not 'object'.
num_attributes = df1.select_dtypes(exclude=['object'])



### 3.1. Rescaling

In [210]:
# Columns rescaled with a min-max scaler.
min_max_cols = [
    'num_fotos',
    'ano_de_fabricacao',
    'ano_modelo',
    'odometro',
    'num_portas',
]

mms = pp.MinMaxScaler()

for column in min_max_cols:
    # The single scaler object is refit for each column and pickled right
    # after the fit, so every file stores that column's own parameters.
    # NOTE(review): the fit uses every row of df1, i.e. it happens before the
    # train/test split made later in the notebook.
    df1[column] = mms.fit_transform(df1[[column]].values)

    # Context manager so the file handle is flushed and closed deterministically.
    with open(f'../parameters/{column}_scaler.pkl', 'wb') as scaler_file:
        pickle.dump(mms, scaler_file)

# The target is modelled as log1p(preco).
df1['preco'] = np.log1p(df1['preco'])

### 3.2. Encoding

In [211]:
df1 = df1.drop(['cidade_vendedor', 'tipo_anuncio'], axis=1)

# get uf: keep the two characters that precede the last character of the value
# (presumably values end in "(UF)" - verify against the raw data)
df1['estado_vendedor'] = df1['estado_vendedor'].apply(lambda x: x[-3:-1])

# Colours outside this mapping become NaN when mapped below.
map_cor = {'Preto':'preto', 'Branco':'branco', 'Prata':'prata', 'Cinza':'cinza', 'Dourado':'outros', 'Vermelho':'outros', 'Azul':'outros',
       'Verde':'outros'}

# State (UF) -> region. Only referenced by the commented-out lines below.
# 'AM' (Amazonas) belongs to the North region, so it maps to 'norte'.
map_regiao = {
    'SP': 'sudeste',
    'RS': 'sul',
    'MG': 'sudeste',
    'PR': 'sul',
    'RJ': 'sudeste',
    'MA': 'nordeste',
    'SC': 'sul',
    'AL': 'nordeste',
    'BA': 'nordeste',
    'GO': 'centro_oeste',
    'RN': 'nordeste',
    'PE': 'nordeste',
    'MT': 'centro_oeste',
    'PA': 'norte',
    'CE': 'nordeste',
    'AM': 'norte',
    'ES': 'sudeste',
    'RO': 'norte',
    'PB': 'nordeste',
    'TO': 'norte',
    'AC': 'norte',
    'SE': 'nordeste',
    'MS': 'centro_oeste',
    'RR': 'norte',
    'PI': 'nordeste',
}


# Ordinal code per gearbox type.
map_cambio = {'Automática': 2, 'Manual' :0, 'CVT' :2, 'Automatizada': 2, 'Semi-automática': 0,
       'Automatizada DCT' : 2, 'Automática Sequencial' : 1}

# aux = df1[['estado_vendedor', 'preco']].groupby('estado_vendedor').mean().reset_index().sort_values('preco')
# aux['regiao'] = aux['estado_vendedor'].map(map_regiao)



# plt.figure(figsize=(15,7))
# sns.barplot(data=aux, x='estado_vendedor', y='preco', hue='regiao')



# target encoders: replace each category by the mean of 'preco' for that
# category and persist the category -> mean table.
# NOTE(review): the means are computed on every row of df1, i.e. before the
# train/test split made later in the notebook.
for column in ['marca',  'versao',  'estado_vendedor', 'tipo', 'modelo']:
    target = df1.groupby(column)['preco'].mean()
    df1[column] = df1[column].map(target)

    # Context manager so the file handle is flushed and closed deterministically.
    with open(f'../parameters/{column}_encode.pkl', 'wb') as encoder_file:
        pickle.dump(target, encoder_file)


# frequency encoders
# for column in [ ]:
#     frequency = df1.groupby(column)['preco'].count() / len(df1)
#     df1[column] = df1[column].map(frequency)
# #     pickle.dump(frequency, open(f'../parameters/{column}_encode.pkl', 'wb'))

# binary: 'N' -> 0, anything else -> 1
df1['blindado'] = np.where(df1['blindado']=='N', 0, 1)

#map
df1['cor'] = df1['cor'].map(map_cor)
# df1['estado_vendedor'] = df1['estado_vendedor'].map(map_regiao)
df1['cambio'] = df1['cambio'].map(map_cambio)

# one hot encoding
# dtype is explicit because pandas >= 2.0 emits bool dummies by default, which
# the int32/uint8 cast below would not pick up; this yields int64 on any version.
df1 = pd.get_dummies(df1, columns=['cor',  'tipo_vendedor'], dtype=np.int64)


# Widen any remaining narrow integer columns to int64.
for column in df1.select_dtypes(['int32', 'uint8']).columns:
    df1[column] = df1[column].astype('int64')

## 4.0. Feature Selection

In [183]:
# 'id' is an identifier, not a feature, so it is left out of the modelling frame.
df2 = df1.drop('id', axis=1).copy()

y = df2['preco']
X = df2.drop('preco', axis=1)

# 80/20 hold-out split. The seed is fixed (same value as the KFold seed used in
# cross_validation) so that re-running the notebook yields the same split and
# comparable metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=5)

# # #training and test dataset for Boruta


## Model Training

### Linear Regression

In [184]:
# Hold-out target with expm1 applied (the inverse of the log1p applied to
# 'preco' earlier); simple_model_test scores predictions against this.
y_test_ = np.expm1(y_test)




## Cross Validation

In [185]:
def send_model(model):
    """Fit ``model`` on the selected training columns and pickle it to disk.

    Reads the module-level names ``X_train``, ``cols_selected`` and
    ``y_train``. The fitted estimator is written to
    ``../parameters/model.pkl``, replacing any existing file.

    Args:
        model: Estimator exposing ``fit``; fitted in place.

    Returns:
        None.
    """
    # model fit
    model.fit(X_train[cols_selected], y_train)

    # Context manager so the file handle is flushed and closed deterministically.
    with open('../parameters/model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    print('Model submited')
    return None

## Hyperparameter Tuning

### LightGBM

In [186]:
# Feature columns passed to the model in cross-validation and in send_model.
cols_selected = [
    'versao',
    'cambio',
    'marca',
    'odometro',
    'tipo',
    'ano_modelo',
    'ano_de_fabricacao',
    'modelo',
]

# cols_selected = ['versao', 'cambio', 'marca', 'odometro', 'tipo', 'ano_modelo',
#        'ano_de_fabricacao']

In [191]:
# model = LGBMRegressor(learning_rate = 0.074,num_leaves = 80,max_depth = 70 ,
#                          n_estimators = 100, boosting_type='gbdt', min_child_weight = 0.001)

# send_model(model)


# LightGBM regressor with the hyperparameters under evaluation.
model = LGBMRegressor(
    boosting_type='gbdt',
    n_estimators=100,
    learning_rate=0.07,
    num_leaves=90,
    max_depth=80,
    min_child_weight=0.001,
)

# Left as the cell's last expression so the notebook displays the returned MAE.
cross_validation('lgbm', model, X_train[cols_selected], y_train)

24985.79

In [192]:
# Refit the model on X_train[cols_selected] and write it to ../parameters/model.pkl.
send_model(model)

Model submited
