In [43]:
import pandas as pd

In [44]:
# Read the used-car dataset from its Parquet file into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
usedcars_df = pd.read_parquet(path='data/transformed_data.parquet')

In [45]:
from sklearn.model_selection import train_test_split

# Two-stage hold-out split, seeded so the same partitions come back on every run.
split_seed = 1

# Stage 1: keep 60% of the rows for training and set the other 40% aside.
train_df, temp_df = train_test_split(usedcars_df, test_size=0.4, random_state=split_seed)

# Stage 2: halve the 40% remainder into validation and test
# (each ends up with roughly 20% of the original rows).
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=split_seed)

In [46]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Keep an untouched snapshot of every split (target column still attached)
# before the feature frames are modified below.
full_train_df, full_valid_df, full_test_df = (
    train_df.copy(),
    valid_df.copy(),
    test_df.copy(),
)

# pop() removes the target column from the frame in place and returns it,
# so each *_df is left holding features only while y_* holds the prices.
# Because the frames are mutated, this cell cannot be re-run on its own:
# re-run the split cell first to restore the 'price_in_euro' column.
y_train = train_df.pop('price_in_euro').values
y_valid = valid_df.pop('price_in_euro').values
y_test = test_df.pop('price_in_euro').values

In [47]:
# Columns fed to the model, grouped by how they are meant to be treated.
# NOTE(review): "registration_date" and "year" are listed as categorical; how they
# are actually encoded downstream depends on their dtype (strings vs numbers) — confirm.
categorical_columns = [
    "brand",
    "model",
    "color",
    "registration_date",
    "year",
    "transmission_type",
    "fuel_type",
]

# NOTE(review): "power_kw" and "power_ps" look like the same quantity in two
# units — verify whether both are needed.
numerical_columns = [
    "power_kw",
    "power_ps",
    "fuel_consumption_l_100km",
    "mileage_in_km",
]

In [48]:
# Sanity check: (rows, columns) of the selected feature columns for each split,
# printed on one line in train / validation / test order.
print(*(
    split[categorical_columns + numerical_columns].shape
    for split in (train_df, valid_df, test_df)
))

(132719, 11) (44240, 11) (44240, 11)


In [49]:
# Turn each split into a list of {column: value} records restricted to the
# selected feature columns; this is the input format DictVectorizer expects.
feature_columns = categorical_columns + numerical_columns
train_dict = train_df[feature_columns].to_dict(orient='records')
valid_dict = valid_df[feature_columns].to_dict(orient='records')
test_dict = test_df[feature_columns].to_dict(orient='records')

# sparse=False makes the vectorizer return dense arrays.
# NOTE(review): with high-cardinality string columns the dense matrix may get
# large — confirm memory use is acceptable.
dv = DictVectorizer(sparse=False)

# The feature mapping is learned from the training records only; validation and
# test are encoded with that same fitted mapping via transform().
X_train = dv.fit_transform(train_dict)
X_valid = dv.transform(valid_dict)
X_test = dv.transform(test_dict)

In [50]:
# from sklearn.linear_model import LogisticRegression

# model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)

# model.fit(X_train[:50000], y_train[:50000])

In [51]:
# import xgboost as xgb
# from sklearn.model_selection import GridSearchCV, train_test_split

# # Assuming your data is already prepared and split into X_train, X_valid, y_train, y_valid

# # Define the parameter grid
# param_grid = {
#     'colsample_bytree': [0.3, 0.5, 0.7],
#     'learning_rate': [0.05, 0.1, 0.15],
#     'max_depth': [3, 5, 7],
#     'alpha': [5, 10, 15],
#     'n_estimators': [10]
# }

# # Initialize the XGBoost Regressor
# xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# # Set up GridSearchCV
# grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, 
#                            scoring='neg_mean_squared_error', cv=3, verbose=1)

# # Fit the model
# grid_search.fit(X_train, y_train)

# # Best performing model
# best_model = grid_search.best_estimator_

# # Predictions on the validation set
# y_pred = best_model.predict(X_valid)

# # Calculate percentage differences
# percentage_differences = [(abs(a - p) / a) * 100 if a != 0 else 0 for a, p in zip(y_valid, y_pred)]

# # Compute the mean absolute percentage error (named "average precision" below, but lower is better)
# average_precision = sum(percentage_differences) / len(percentage_differences)
# print(f"Average Precision: {average_precision:.2f}%")

# # You can also retrieve the best parameters like this
# print("Best parameters found: ", grid_search.best_params_)


In [52]:
import xgboost as xgb

# Fit a gradient-boosted regressor on the vectorized training features.
# NOTE(review): 10 boosting rounds at learning_rate=0.1 is probably too few for
# the model to converge — consider more rounds or early stopping on the
# validation split; confirm before relying on this model.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=10,
)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_valid)

# Absolute percentage error per validation row. Rows whose actual price is 0
# cannot be expressed as a percentage, so they are recorded as 0 (which still
# counts them in the mean's denominator).
percentage_differences = [
    abs(actual - predicted) / actual * 100 if actual != 0 else 0
    for actual, predicted in zip(y_valid, y_pred)
]

# This is a mean absolute percentage error (MAPE): lower is better. It was
# previously printed as "Average Precision", which reads as if higher were better.
mean_abs_pct_error = sum(percentage_differences) / len(percentage_differences)
average_precision = mean_abs_pct_error  # legacy name, kept in case later cells read it
print(f"Mean Absolute Percentage Error: {mean_abs_pct_error:.2f}%")

Average Precision: 73.41%


In [53]:
import os
import pickle

# Persist the fitted regressor so it can be loaded outside this notebook.
# NOTE(review): only the model is saved here; the fitted DictVectorizer (`dv`) is
# needed to build inputs in the same feature layout — confirm it is stored elsewhere.
# NOTE(review): pickle files execute code on load; only load this file from a trusted source.
model_path = r'model/best_xgboost_model.pkl'

# open(..., 'wb') does not create missing parent folders, so make sure the
# target directory exists first (no-op when it is already there).
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Save the model to a file
with open(model_path, 'wb') as file:
    pickle.dump(xgb_model, file)
