In [None]:
# Import libraries
import numpy as np
import pandas as pd
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR

# Read the book-sales table from disk
df = pd.read_csv("Books_Data_Clean.csv")

# Print column names, dtypes and non-null counts
df.info()

# Pre-processing
# Remove the columns that are not carried forward into modelling
columns_to_drop = ['index', 'Book Name', 'gross sales', 'publisher revenue', 'sales rank']
df = df.drop(columns=columns_to_drop)

# Report which of the remaining columns contain at least one missing value
has_missing = df.isna().any()
nan_columns = has_missing[has_missing].index.tolist()
print("Columns with NaN values:", nan_columns)

# `Author_Rating` is loaded as an object column (see df.info()), i.e. it holds
# values that did not parse as numbers. Coercing with errors='coerce' would
# silently replace every such value with NaN, and the median imputer would
# then overwrite them with one constant (or discard the column entirely when
# nothing parsed). Only use the numeric form when the conversion loses
# nothing; otherwise keep the labels and one-hot encode them.
author_rating_as_number = pd.to_numeric(df['Author_Rating'], errors='coerce')
author_rating_is_numeric = (
    author_rating_as_number.isna().sum() == df['Author_Rating'].isna().sum()
)

# Define feature columns and target column
numeric_features = ['Publishing Year', 'Book_average_rating', 'Book_ratings_count', 'sale price']
categorical_features = ['Author', 'language_code', 'genre']
if author_rating_is_numeric:
    df['Author_Rating'] = author_rating_as_number
    # Keep the column in its original position among the numeric features
    numeric_features.insert(1, 'Author_Rating')
else:
    categorical_features.append('Author_Rating')
target = 'units sold'

# Split features and target
X = df[numeric_features + categorical_features]
y = df[target]

# Numeric columns: fill gaps with the column median, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode (categories unseen at fit time are ignored at transform time)
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns through its own transformer
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Hold out 20% of the rows as a test split (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define objective functions for each model using Bayesian Optimisation

# 1. Linear Regression
def lr_cv():
    """Cross-validated score for the linear-regression baseline.

    Scores the pipeline with 5-fold cross-validation on the training split
    only, so the held-out test split is left untouched until the final
    evaluation.

    Returns:
        Negative mean RMSE across the folds (higher is better).
    """
    lr_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', LinearRegression())])
    # cross_val_score fits clones of the pipeline, so the shared
    # `preprocessor` object is not refitted in place.
    fold_scores = cross_val_score(
        lr_pipeline, X_train, y_train,
        cv=5, scoring='neg_root_mean_squared_error')
    return fold_scores.mean()  # Already negative RMSE

# 2. Random Forest
def rf_cv(n_estimators, max_depth, min_samples_split):
    """Bayesian-optimisation objective for the random forest.

    Scores a candidate configuration with 5-fold cross-validation on the
    training split only. Tuning against the test split would leak it into
    model selection and make the final test metrics optimistic.

    Args:
        n_estimators: Number of trees; truncated to int because the
            optimiser proposes floats.
        max_depth: Maximum tree depth; truncated to int.
        min_samples_split: Minimum samples needed to split a node;
            truncated to int.

    Returns:
        Negative mean RMSE across the folds (the optimiser maximises).
    """
    rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('model', RandomForestRegressor(
                                      n_estimators=int(n_estimators),
                                      max_depth=int(max_depth),
                                      min_samples_split=int(min_samples_split),
                                      random_state=42))])
    # cross_val_score fits clones of the pipeline, so the shared
    # `preprocessor` object is not refitted in place.
    fold_scores = cross_val_score(
        rf_pipeline, X_train, y_train,
        cv=5, scoring='neg_root_mean_squared_error')
    return fold_scores.mean()  # Already negative RMSE

# 3. XGBoost
def xgb_cv(n_estimators, max_depth, learning_rate, subsample, colsample_bytree):
    """Bayesian-optimisation objective for XGBoost.

    Scores a candidate configuration with 5-fold cross-validation on the
    training split only. Tuning against the test split would leak it into
    model selection and make the final test metrics optimistic.

    Args:
        n_estimators: Number of boosting rounds; truncated to int because
            the optimiser proposes floats.
        max_depth: Maximum tree depth; truncated to int.
        learning_rate: Passed through to XGBRegressor unchanged.
        subsample: Passed through to XGBRegressor unchanged.
        colsample_bytree: Passed through to XGBRegressor unchanged.

    Returns:
        Negative mean RMSE across the folds (the optimiser maximises).
    """
    xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('model', xgb.XGBRegressor(
                                       n_estimators=int(n_estimators),
                                       max_depth=int(max_depth),
                                       learning_rate=learning_rate,
                                       subsample=subsample,
                                       colsample_bytree=colsample_bytree,
                                       random_state=42,
                                       objective='reg:squarederror'))])
    # cross_val_score fits clones of the pipeline, so the shared
    # `preprocessor` object is not refitted in place.
    fold_scores = cross_val_score(
        xgb_pipeline, X_train, y_train,
        cv=5, scoring='neg_root_mean_squared_error')
    return fold_scores.mean()  # Already negative RMSE

# 4. SVM
def svm_cv(C, epsilon):
    """Bayesian-optimisation objective for the support vector regressor.

    Scores a candidate configuration with 5-fold cross-validation on the
    training split only. Tuning against the test split would leak it into
    model selection and make the final test metrics optimistic.

    Args:
        C: Passed through to SVR unchanged.
        epsilon: Passed through to SVR unchanged.

    Returns:
        Negative mean RMSE across the folds (the optimiser maximises).
    """
    svm_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('model', SVR(C=C, epsilon=epsilon))])
    # cross_val_score fits clones of the pipeline, so the shared
    # `preprocessor` object is not refitted in place.
    fold_scores = cross_val_score(
        svm_pipeline, X_train, y_train,
        cv=5, scoring='neg_root_mean_squared_error')
    return fold_scores.mean()  # Already negative RMSE

# Search ranges handed to the Bayesian optimiser, one mapping per model.
# Each entry maps a hyperparameter name to its (lower, upper) bound.

# Random Forest
param_bounds_rf = dict(
    n_estimators=(50, 200),
    max_depth=(5, 20),
    min_samples_split=(2, 10),
)

# XGBoost
param_bounds_xgb = dict(
    n_estimators=(50, 200),
    max_depth=(3, 10),
    learning_rate=(0.01, 0.3),
    subsample=(0.5, 1.0),
    colsample_bytree=(0.5, 1.0),
)

# SVM
param_bounds_svm = dict(
    C=(0.1, 10),
    epsilon=(0.01, 1.0),
)

# Run a Bayesian search for every tunable model. Linear Regression is
# skipped because it has no hyperparameters to tune.

def _run_bayes_search(objective, bounds):
    """Build an optimiser for `objective` over `bounds`, run it, and return it."""
    search = BayesianOptimization(
        f=objective,
        pbounds=bounds,
        random_state=42,
        verbose=2,
    )
    # 5 random starting points followed by 20 guided iterations
    search.maximize(init_points=5, n_iter=20)
    return search


# Random Forest
optimizer_rf = _run_bayes_search(rf_cv, param_bounds_rf)
best_params_rf = optimizer_rf.max['params']

# XGBoost
optimizer_xgb = _run_bayes_search(xgb_cv, param_bounds_xgb)
best_params_xgb = optimizer_xgb.max['params']

# SVM
optimizer_svm = _run_bayes_search(svm_cv, param_bounds_svm)
best_params_svm = optimizer_svm.max['params']

# Final evaluation: fit every model on the training split with its best
# hyperparameters and score it on the test split.

def _fit_and_score(pipeline):
    """Fit `pipeline` on the training split; return (test predictions, RMSE, R²)."""
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    return predictions, rmse, r2


# 1. Linear Regression
lr_model = LinearRegression()
lr_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', lr_model)])
y_pred_lr, lr_rmse, lr_r2 = _fit_and_score(lr_pipeline)

# 2. Random Forest (the optimiser returns floats, so integer settings are cast)
rf_model = RandomForestRegressor(
    n_estimators=int(best_params_rf['n_estimators']),
    max_depth=int(best_params_rf['max_depth']),
    min_samples_split=int(best_params_rf['min_samples_split']),
    random_state=42,
)
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', rf_model)])
y_pred_rf, rf_rmse, rf_r2 = _fit_and_score(rf_pipeline)

# 3. XGBoost
xgb_model = xgb.XGBRegressor(
    n_estimators=int(best_params_xgb['n_estimators']),
    max_depth=int(best_params_xgb['max_depth']),
    learning_rate=best_params_xgb['learning_rate'],
    subsample=best_params_xgb['subsample'],
    colsample_bytree=best_params_xgb['colsample_bytree'],
    random_state=42,
    objective='reg:squarederror',
)
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', xgb_model)])
y_pred_xgb, xgb_rmse, xgb_r2 = _fit_and_score(xgb_pipeline)

# 4. Support Vector Regressor (SVM)
svm_model = SVR(C=best_params_svm['C'], epsilon=best_params_svm['epsilon'])
svm_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', svm_model)])
y_pred_svm, svm_rmse, svm_r2 = _fit_and_score(svm_pipeline)

# Report RMSE and R² for every model
print(f"Linear Regression RMSE: {lr_rmse:.4f}, R²: {lr_r2:.4f}")
print(f"Random Forest RMSE: {rf_rmse:.4f}, R²: {rf_r2:.4f}")
print(f"XGBoost RMSE: {xgb_rmse:.4f}, R²: {xgb_r2:.4f}")
print(f"SVM RMSE: {svm_rmse:.4f}, R²: {svm_r2:.4f}")

# Pick the model with the lowest RMSE (the first listed wins a tie)
candidates = [
    ("Linear Regression", lr_rmse, lr_r2),
    ("Random Forest", rf_rmse, rf_r2),
    ("XGBoost", xgb_rmse, xgb_r2),
    ("SVM", svm_rmse, svm_r2),
]
best_model = min(candidates, key=lambda entry: entry[1])

print(f"\nBest Model: {best_model[0]} with RMSE: {best_model[1]:.4f} and R²: {best_model[2]:.4f}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   index                1070 non-null   int64  
 1   Publishing Year      1069 non-null   float64
 2   Book Name            1047 non-null   object 
 3   Author               1070 non-null   object 
 4   language_code        1017 non-null   object 
 5   Author_Rating        1070 non-null   object 
 6   Book_average_rating  1070 non-null   float64
 7   Book_ratings_count   1070 non-null   int64  
 8   genre                1070 non-null   object 
 9   gross sales          1070 non-null   float64
 10  publisher revenue    1070 non-null   float64
 11  sale price           1070 non-null   float64
 12  sales rank           1070 non-null   int64  
 13  Publisher            1070 non-null   object 
 14  units sold           1070 non-null   int64  
dtypes: float64(5), int64(4), object(6)
mem