## **Running the Model with Label Encoders and Exporting**

In [1]:
import numpy as np 
from sklearn.preprocessing import LabelEncoder
import pandas as pd 
import warnings
# Install a blanket "ignore" filter so warnings raised by later cells are not shown.
# NOTE(review): this hides every warning category (deprecations included);
# consider narrowing it to the specific category/module that is noisy.
warnings.filterwarnings("ignore")

In [2]:
# Read the used-car listings and drop the 'Unnamed: 0' column
# (presumably a row index saved into the CSV -- TODO confirm), then preview.
df = pd.read_csv('used_cars3.csv').drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,Price,Year,Mileage,State,Country of Manufacture,Base Model,Features,Division
0,10995,2014,20403,KS,US,Focus,Regular,West North Central
1,11997,2014,18520,WA,US,Focus,Regular,Pacific
2,11990,2014,17738,WA,US,Focus,Regular,Pacific
3,10889,2014,29057,WA,US,Focus,Regular,Pacific
4,11995,2014,20335,ID,US,Focus,Regular,Mountain


In [3]:
# Learn the distinct 'Year' values, then replace the column with their integer codes.
# The fitted encoder is kept in `le_year` so the same mapping can be reused later.
le_year = LabelEncoder().fit(df['Year'])
df['Year'] = le_year.transform(df['Year'])
df['Year'].unique()

array([17, 16, 19, 15, 12, 18, 13, 14, 10,  9, 11,  7, 20,  8,  6,  5,  2,
        4,  3,  1, 21,  0], dtype=int64)

In [4]:
# Learn the distinct 'State' values, then replace the column with their integer codes.
# The fitted encoder is kept in `le_state` so the same mapping can be reused later.
le_state = LabelEncoder().fit(df['State'])
df['State'] = le_state.transform(df['State'])
df['State'].unique()

array([16, 47, 13,  3,  4, 10, 39, 43,  9, 48, 27, 26, 42, 37,  5, 31, 17,
       34, 11, 44, 15, 20, 35, 45, 19, 36, 14, 22,  8,  6, 30, 23, 40, 24,
       38, 21, 25, 32,  2, 12,  1, 28, 49, 29, 46, 33, 41, 18,  0, 50,  7])

In [5]:
# Learn the distinct 'Country of Manufacture' values, then replace the column
# with their integer codes. The fitted encoder is kept in `le_manuf` for reuse.
le_manuf = LabelEncoder().fit(df['Country of Manufacture'])
df['Country of Manufacture'] = le_manuf.transform(df['Country of Manufacture'])
df['Country of Manufacture'].unique()

array([1, 0])

In [6]:
# Learn the distinct 'Base Model' values, then replace the column with their integer codes.
# The fitted encoder is kept in `le_model` so the same mapping can be reused later.
le_model = LabelEncoder().fit(df['Base Model'])
df['Base Model'] = le_model.transform(df['Base Model'])
df['Base Model'].unique()

array([15, 18, 12,  0,  6, 19,  5, 11, 10, 22, 21,  4, 24, 14,  9, 20, 13,
       17, 16,  3, 23,  8, 26, 25,  7,  2,  1])

In [7]:
# Learn the distinct 'Features' values, then replace the column with their integer codes.
# The fitted encoder is kept in `le_feats` so the same mapping can be reused later.
le_feats = LabelEncoder().fit(df['Features'])
df['Features'] = le_feats.transform(df['Features'])
df['Features'].unique()

array([38, 40, 39, 46, 51, 27, 10, 16, 17,  5, 41, 14, 32, 55, 34, 52,  9,
       48, 42, 23, 29, 18, 50, 28, 49, 30, 53, 22, 24, 47, 12,  6, 54, 43,
       26, 45, 36, 13, 31,  4,  7, 20,  0, 11,  8,  2, 33, 44, 19, 35, 37,
        3,  1, 25, 21, 15])

In [8]:
# Learn the distinct 'Division' values, then replace the column with their integer codes.
# The fitted encoder is kept in `le_divs` so the same mapping can be reused later.
le_divs = LabelEncoder().fit(df['Division'])
df['Division'] = le_divs.transform(df['Division'])
df['Division'].unique()

array([7, 5, 3, 6, 4, 8, 0, 1, 2])

In [9]:
# Preview the first five rows now that the categorical columns hold integer codes.
df.head(n=5)

Unnamed: 0,Price,Year,Mileage,State,Country of Manufacture,Base Model,Features,Division
0,10995,17,20403,16,1,15,38,7
1,11997,17,18520,47,1,15,38,5
2,11990,17,17738,47,1,15,38,5
3,10889,17,29057,47,1,15,38,5
4,11995,17,20335,13,1,15,38,3


In [10]:
# import required libraries
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [11]:
# The regression target is the natural log of 'Price', stored as an extra column.
target_col = 'log_Price'
df[target_col] = np.log(df['Price'])

# Target vector, and a feature matrix with both the raw and the logged price
# removed so the target cannot leak into the features.
y = df[target_col]
X = df.drop(columns=['Price', target_col])

# 70/30 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [12]:
from xgboost import XGBRegressor

# Initializing the XGBoost regressor (squared-error objective); the target it
# is trained on is log(Price), see the cell that builds y.
xgb_reg = XGBRegressor(objective ='reg:squarederror', 
                       n_estimators=1000, 
                       max_depth=7, eta=0.1, 
                       subsample=0.7, colsample_bytree=0.8)

# Fitting the regressor to the training data
xgb_reg.fit(X_train, y_train)

# Making predictions on the testing set (predictions are log-prices)
y_pred = xgb_reg.predict(X_test)


# Evaluating the model.
# NOTE: y_test and y_pred are both log(Price), so MAE / MSE / RMSE below are
# expressed in log units, not in currency.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Displaying the evaluation metrics
# (MAE is the Mean *Absolute* Error, matching mean_absolute_error above)
print('The Mean Absolute Error(MAE) is: ', mae)
print('The Mean Square Error(MSE) is: ', mse)
print('The Root Mean Square Error(RMSE) is: ', rmse)
print('The R Square is: ', r2)

The Mean Average Error(MAE) is:  0.11916367917206468
The Mean Square Error(MSE) is:  0.028914962045794784
The Root Mean Square Error(RMSE) is:  0.17004400032284228
The R Square is:  0.9043964982346274


In [13]:
# Persist the trained regressor together with every fitted label encoder in a
# single pickle file, so a consumer can load one object and get both.
import pickle

# NOTE(review): `mileage` is not placed in the exported bundle below and is not
# otherwise used in this cell -- confirm whether it is still needed.
mileage = df['Mileage']

# Bundle layout: the model under "model", each encoder under its variable name.
data = dict(
    model=xgb_reg,
    le_year=le_year,
    le_state=le_state,
    le_manuf=le_manuf,
    le_model=le_model,
    le_feats=le_feats,
    le_divs=le_divs,
)

with open('price_predictor.pkl', 'wb') as file:
    pickle.dump(data, file)