In [None]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Feature groups, split by the preprocessing each group receives.

# Numeric features.
numerical_cols = [
    'year',
    'odometer',
    'lat',
    'long',
]

# Categorical feature handled by ordinal encoding.
categoricalOrdinal_cols = ['model']

# Categorical features handled by one-hot encoding.
categoricalHot_cols = [
    'manufacturer',
    'fuel',
    'drive',
    'type',
]

In [None]:
# Load only the feature columns plus the 'price' target from the CSV.
cols_to_keep = [
    *numerical_cols,
    *categoricalOrdinal_cols,
    *categoricalHot_cols,
    'price',
]
df = pd.read_csv('csvs/vehicles.csv', usecols=cols_to_keep)

In [None]:
# Keep only rows whose price lies strictly between 500 and 100000.
df = df.loc[df['price'].gt(500) & df['price'].lt(100000)]

In [None]:
# Regression target: the price column.
y = df['price']


In [None]:
# Feature matrix: numeric, one-hot and ordinal columns, in that order.
X = df[[*numerical_cols, *categoricalHot_cols, *categoricalOrdinal_cols]]

In [None]:
# Imputer for the numeric columns; 'mean' is SimpleImputer's default
# strategy, spelled out here for readability.
imputer = SimpleImputer(strategy='mean')

In [None]:
# One-hot branch: fill missing categories with the most frequent value,
# then one-hot encode into a dense array. Categories not seen during fit
# are ignored at transform time (handle_unknown='ignore').
hot_encoder = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (
            'hot_encoder',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
        ),
    ]
)

In [None]:
# Ordinal branch: fill missing categories with the most frequent value,
# then map each category to an integer code. Categories not seen during
# fit are encoded as -1.
ordinal_encoder = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (
            'ordinal_encoder',
            OrdinalEncoder(
                unknown_value=-1,
                handle_unknown='use_encoded_value',
            ),
        ),
    ]
)

In [None]:
# Route each column group to its own preprocessing branch. The output
# columns follow the order of the transformers listed here.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', imputer, numerical_cols),
        ('ordinal', ordinal_encoder, categoricalOrdinal_cols),
        ('hot', hot_encoder, categoricalHot_cols),
    ]
)

In [None]:
# Gradient-boosted regressor. n_estimators is an upper bound on the number
# of boosting rounds: early_stopping_rounds=50 stops training once the
# eval-set metric has not improved for 50 consecutive rounds.
final_model = XGBRegressor(
    # Boosting schedule
    n_estimators=5000,
    learning_rate=0.03,
    early_stopping_rounds=50,
    # Tree shape
    max_depth=12,
    min_child_weight=3,
    # Row / column subsampling
    subsample=0.7,
    colsample_bytree=0.7,
    # Reproducibility and parallelism
    random_state=1,
    n_jobs=-1,
)

In [None]:
# Hold out a validation split; random_state=1 makes the shuffle repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=1,
)
# Disabled experiment: log1p-transform of both targets. NOTE(review): `np`
# is not imported in the visible import cell, so re-enabling these lines
# would also need `import numpy as np`.
#y_train=np.log1p(y_train)
#y_valid=np.log1p(y_valid)

In [None]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformer to both splits.
# NOTE: X_train / X_valid are rebound to the transformed output here, so
# re-running this cell without first re-running the split cell would feed
# already-transformed data back into the preprocessor.
preprocessor.fit(X_train)
X_train, X_valid = (
    preprocessor.transform(split) for split in (X_train, X_valid)
)

In [None]:
# Train with the validation split as the early-stopping eval set.
# NOTE(review): the same validation split is also used for the reported
# metrics, so those numbers are likely somewhat optimistic; a separate
# hold-out split would give an unbiased estimate.
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
)

In [None]:
# Score the fitted model on the validation and training splits.
#
# scikit-learn metrics take their arguments as (y_true, y_pred). MAE is
# symmetric, so the order does not change its value, but MAPE divides by
# |y_true|: passing the predictions first would normalise the error by the
# predicted price instead of the actual price.
predictions = final_model.predict(X_valid)
mae = mean_absolute_error(y_valid, predictions)

train_preds = final_model.predict(X_train)
mae_train = mean_absolute_error(y_train, train_preds)

print(f'mae valid: {mae}')
print(f'mae train: {mae_train}')

# Validation MAPE, as a fraction (not a percentage) of the true price.
mape = mean_absolute_percentage_error(y_valid, predictions)
print(mape)


In [None]:
# Mean price over the whole (filtered) frame — presumably shown as a
# reference scale for the MAE values; confirm intent.
df.loc[:, 'price'].mean()