In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.decomposition import PCA

import joblib , hyperopt

  from pandas.core import (


In [3]:
df1=pd.read_csv("missing_value_imputation.csv")
df=df1.copy()

In [4]:
df.drop(columns=["hdd","thickness_num","weight_num","weight", 'usb2', 'ppi_type',"everyday_use","performance","vga","multi_card_reader","quality_type","antiglare","fingerprint_sensor","ethernet","hdmi","display_port","usb3"],inplace=True)

In [7]:
df.columns

Index(['brand', 'price', 'thickness', 'screen_size', 'ppi', 'threads', 'ram',
       'touch_screen', 'cores', 'battery_capacity', 'battery_cell',
       'thunderbolt', 'backlit', 'typec', 'processor_gen', 'processor_brand',
       'processor_model', 'graphics_brand', 'graphics_capacity',
       'graphics_model', 'business', 'gaming', 'ssd', 'popularity'],
      dtype='object')

In [51]:
X=df.drop("price",axis=1)
y=df["price"]
y_transformed=np.log1p(y)

In [52]:
ohe_columns=["brand","graphics_brand","processor_brand","touch_screen","backlit","business","gaming"]
oe_columns=["thickness","typec","screen_size","processor_gen","processor_model","popularity","graphics_model"]
std=["ppi","battery_capacity","ssd","threads","ram","cores","graphics_capacity","battery_cell"]

In [53]:
all_columns=ohe_columns+oe_columns

In [54]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), std),
        ('cat', OrdinalEncoder(), oe_columns),
        ('cat1',OneHotEncoder(drop="first",sparse_output=False),ohe_columns)
    ], 
    remainder='passthrough'
)

In [55]:
transformed_data=preprocessor.fit_transform(X)

In [56]:
transformed_df = pd.DataFrame(transformed_data, columns=preprocessor.get_feature_names_out())

In [57]:
transformed_df.shape

(798, 53)

In [58]:
X_train, X_test, y_train, y_test = train_test_split(transformed_df,y_transformed,test_size=0.2,random_state=42)

In [59]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'xgboost':XGBRegressor()
}


In [60]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=43)
    scores = cross_val_score(model, transformed_df, y_transformed, cv=5, scoring='r2')
    
    output.append(scores.mean())
    
    model.fit(X_train,y_train)
    
    y_pred = model.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [61]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [62]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.892254,9781.13963
0,linear_reg,0.886382,10440.301861
7,gradient boosting,0.899856,10743.634546
5,random forest,0.887682,10885.12584
2,ridge,0.888134,10927.682681
9,xgboost,0.900602,11660.098535
1,svr,0.868812,11843.203826
8,adaboost,0.82023,14011.566134
4,decision tree,0.786335,15434.227196
3,LASSO,0.22169,26834.82439


In [17]:
!pip install xgboost hyperopt --user



In [20]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK


In [37]:
def objective(params):
    # Convert parameters to the correct types
    params['max_depth'] = int(params['max_depth'])
    params['n_estimators'] = int(params['n_estimators'])
    
    # Create the XGBoost model
    model = XGBRegressor(**params, objective='reg:squarederror')
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    preds = model.predict(X_test)
    y_pred = np.expm1(preds)
    
    # Calculate the mean squared error
    mae = mean_absolute_error(np.expm1(y_test),y_pred)
    
    # Return the loss and the status
    return {'loss': mae, 'status': STATUS_OK}


In [38]:
space = {
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'n_estimators': hp.quniform('n_estimators', 50, 300, 1),
    'gamma': hp.uniform('gamma', 0, 0.5),
    'reg_alpha': hp.loguniform('reg_alpha', -3, 1),
    'reg_lambda': hp.loguniform('reg_lambda', -3, 1),
    'subsample': hp.uniform('subsample', 0.6, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1)
}

In [39]:
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print("Best hyperparameters:", best)

100%|██████████████████████████████████████████████| 100/100 [00:23<00:00,  4.18trial/s, best loss: 10007.931689453128]
Best hyperparameters: {'colsample_bytree': 0.7031793148625255, 'gamma': 0.09157016069704034, 'learning_rate': 0.1670284960875044, 'max_depth': 6.0, 'n_estimators': 92.0, 'reg_alpha': 0.06608986272090762, 'reg_lambda': 0.3140668068210218, 'subsample': 0.6411710852008716}


In [42]:
# Convert the best hyperparameters to the correct types
best['max_depth'] = int(best['max_depth'])
best['n_estimators'] = int(best['n_estimators'])

# Create the XGBoost model with the best hyperparameters
best_model = XGBRegressor(**best, objective='reg:squarederror')

# Train the model
best_model.fit(X_train, y_train)

# Make predictions
final_preds = best_model.predict(X_test)
y_pred = np.expm1(final_preds)


# Evaluate the model
final_mse = mean_absolute_error(np.expm1(y_test), y_pred)
final_r2 = r2_score(np.expm1(y_test), y_pred)
print("Final MSE:", final_mse)
print("Final R2:", final_r2)


Final MSE: 10007.931689453128
Final R2: 0.8593794226821186


In [66]:
# K-fold cross-validation
model=LinearRegression()
kfold = KFold(n_splits=10, shuffle=True, random_state=43)
scores = cross_val_score(model, transformed_df, y_transformed, cv=5, scoring='r2')

model.fit(X_train,y_train)

y_pred = model.predict(X_test)

y_pred = np.expm1(y_pred)

mae=mean_absolute_error(np.expm1(y_test),y_pred)
final_r2 = r2_score(np.expm1(y_test), y_pred)
print(mae)
print(final_r2)




10440.301860591426
0.882078002857849


In [67]:
def objective(params):
    # Convert parameters to the correct types
    params['n_estimators'] = int(params['n_estimators'])
    params['max_depth'] = int(params['max_depth']) if params['max_depth'] is not None else None
    params['min_samples_split'] = int(params['min_samples_split'])
    params['min_samples_leaf'] = int(params['min_samples_leaf'])

    # Create the Extra Trees model
    model = ExtraTreesRegressor(**params, random_state=42)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    preds = model.predict(X_test)
    preds=np.expm1(preds)
    
    # Calculate the mean squared error
    mse = mean_absolute_error(np.expm1(y_test), preds)
    
    # Return the loss and the status
    return {'loss': mse, 'status': STATUS_OK}


In [68]:
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 300, 1),
    'max_features': hp.choice('max_features', [None, 'sqrt', 'log2']),
    'max_depth': hp.quniform('max_depth', 5, 50, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 20, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 20, 1),
    'bootstrap': hp.choice('bootstrap', [False, True])
}


In [69]:
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print("Best hyperparameters:", best)


100%|███████████████████████████████████████████████| 100/100 [00:48<00:00,  2.08trial/s, best loss: 9960.488299031618]
Best hyperparameters: {'bootstrap': 0, 'max_depth': 19.0, 'max_features': 0, 'min_samples_leaf': 2.0, 'min_samples_split': 4.0, 'n_estimators': 181.0}


In [73]:
# Convert the best hyperparameters to the correct types
best['n_estimators'] = int(best['n_estimators'])
best['max_depth'] = int(best['max_depth']) if best['max_depth'] is not None else None
best['min_samples_split'] = int(best['min_samples_split'])
best['min_samples_leaf'] = int(best['min_samples_leaf'])

# Handle None type for max_features and bootstrap correctly
if best['max_features'] is not None:
    best['max_features'] = [None, 'sqrt', 'log2'][int(best['max_features'])]

best['bootstrap'] = [False, True][int(best['bootstrap'])]

# Create the Extra Trees model with the best hyperparameters
best_model = ExtraTreesRegressor(**best, random_state=42)

# Train the model
best_model.fit(X_train, y_train)

# Make predictions
final_preds = best_model.predict(X_test)
final_preds = np.expm1(final_preds)

# Evaluate the model
final_mae = mean_absolute_error(np.expm1(y_test), final_preds)
final_r2 = r2_score(np.expm1(y_test), final_preds)

print("Final MAE:", final_mae)
print("Final R²:", final_r2)



Final MAE: 9960.488299031618
Final R²: 0.8549586765675898


In [76]:
# Export the model using joblib
joblib_file = "model.pkl"
joblib.dump(best_model, joblib_file)

print(f"Model saved to {joblib_file}")


Model saved to model.pkl


In [95]:
# Load the model from the file
loaded_model = joblib.load(joblib_file)

# Use the loaded model to make predictions
loaded_preds = loaded_model.predict(X_test)
loaded_preds=np.expm1(loaded_preds)

# Evaluate the loaded model
loaded_mse = mean_absolute_error(np.expm1(y_test), loaded_preds)
loaded_r2 = r2_score(np.expm1(y_test), loaded_preds)

print("Loaded Model MSE:", loaded_mse)
print("Loaded Model R²:", loaded_r2)

Loaded Model MSE: 9960.488299031618
Loaded Model R²: 0.8549586765675898
