# Computer price Notebook

### Introduction

Put simply, we want to predict the retail price of laptops and desktops based on their hardware specifications.

This model will help us detect if a computer is valued fairly or not.

In [None]:
import os

import kagglehub
import numpy as np
import pandas as pd

# Fetch the latest dataset version; `path` is the local directory it was placed in
path = kagglehub.dataset_download("paperxd/all-computer-prices")

print(f"Path to dataset files: {path}")

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def adjusted_r2_score(y_true, y_pred, n_features):
    """Return the adjusted R2 of a regression fit.

    Adjusted R2 penalises plain R2 for the number of predictors used:

        1 - (1 - R2) * (n - 1) / (n - p - 1)

    where ``n`` is the number of samples and ``p`` the number of predictors.

    Args:
        y_true: Observed target values (must support ``len()``).
        y_pred: Predicted target values, aligned with ``y_true``.
        n_features: Number of predictors the model was trained on (``p``).

    Returns:
        The adjusted R2 as a float.

    Raises:
        ValueError: If ``len(y_true) - n_features - 1`` is not positive, in
            which case the statistic is undefined.
    """
    n_samples = len(y_true)
    residual_dof = n_samples - n_features - 1
    if residual_dof <= 0:
        raise ValueError(
            "adjusted R2 needs more samples than n_features + 1 "
            f"(got {n_samples} samples and {n_features} features)"
        )
    r2 = r2_score(y_true, y_pred)
    # The numerator uses the sample count (n - 1), not the feature count
    return 1 - (1 - r2) * (n_samples - 1) / residual_dof

def print_errors(actual, pred, n_feat):
    """Print R2, adjusted R2, MSE and MAE for a set of predictions.

    Each metric is written on its own tab-indented line, in that order.

    Args:
        actual: Observed target values.
        pred: Predicted target values.
        n_feat: Number of features, forwarded to ``adjusted_r2_score``.
    """
    def emit(label, value):
        print("\t" + label + ": " + str(value))

    emit("R2", r2_score(actual, pred))
    emit("Adjusted R2", adjusted_r2_score(actual, pred, n_feat))
    emit("MSE", mean_squared_error(actual, pred))
    emit("MAE", mean_absolute_error(actual, pred))

In [None]:
path

In [None]:
print(os.listdir(path))

In [None]:
csv_path = f"{path}/computer_prices_all.csv"
df = pd.read_csv(csv_path)

In [None]:
df.head(10)

# EDA

To make the EDA easier, we will use `ProfileReport` from `ydata_profiling` to generate an HTML overview of the dataset.

In [None]:
from ydata_profiling import ProfileReport

In [None]:
# Build the profiling report for the raw dataframe
computer_report = ProfileReport(df)

In [None]:
# Write the report to an HTML file (relative path)
computer_report.to_file('computer_report.html')

Checking for duplicates

In [None]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

Checking for missing (NaN) values

In [None]:
# Missing-value count per column
df.isna().sum()

In [None]:
numerical_cols = df_1.select_dtypes(include=['int64', 'float64']).columns

In [None]:
sns.heatmap(df_1[numerical_cols].corr(), cmap='coolwarm', fmt='.2f', vmin=-1, vmax=1)
plt.show()

In [None]:
corr_matrix = df_1[numerical_cols].corr()
corr_series = corr_matrix.unstack()
corr_series = corr_series[corr_series.index.get_level_values(0) != corr_series.index.get_level_values(1)]
corr_series = corr_series.abs().sort_values(ascending=False)
corr_series = corr_series[~corr_series.index.duplicated(keep='first')]
corr_df = pd.DataFrame(corr_series).reset_index()
corr_df.columns = ['Feature_1', 'Feature_2', 'Correlation']

In [None]:
corr_df

In [None]:
corr_df[corr_df['Correlation'] >= 0.85]

# Feature Engineering

I will replace the release year column with a `years_since_release` column.
I will work on a copy of the original df to keep it intact.

In [None]:
df_1 = df.copy()

In [None]:
df_1['years_since_release'] = 2025 - df_1['release_year']

In [None]:
df_1.drop(columns='release_year', inplace = True)

In [None]:
# CPU specification columns for a single example cpu_model
cpu_spec_cols = ['cpu_model', 'cpu_tier', 'cpu_cores', 'cpu_threads', 'cpu_base_ghz', 'cpu_boost_ghz']
df_1.loc[df_1["cpu_model"] == 'Intel i5-11129', cpu_spec_cols]

After checking this example, I noticed that the cpu_model column might not be significant for our model, so we will drop it. We can assume the same for the gpu_model column. The other CPU and GPU columns already capture the specifications.

I will also remove the general model column

In [None]:
# Drop the CPU/GPU model names and the general model name in one pass
df_1 = df_1.drop(columns=['cpu_model', 'gpu_model', 'model'])

Checking for all the categorical columns left 

In [None]:
# Columns still stored with object/category dtype
categorical_cols = df_1.select_dtypes(include=["object", "category"]).columns
categorical_cols

The resolution can be broken down into width and height

In [None]:
# Split "<width>x<height>" strings on 'x'; the first part is the width, the second the height
resolution_parts = df_1['resolution'].str.split('x', expand=True).astype(int)
df_1['width'] = resolution_parts[0]
df_1['height'] = resolution_parts[1]

Now we can drop the resolution column

In [None]:
df_1 = df_1.drop(columns=['resolution'])

Using these two numbers we can create two new columns, total_pixels and aspect_ratio 

In [None]:
# Both features are derived from the same width/height pair
df_1 = df_1.assign(
    total_pixels=df_1['width'] * df_1['height'],
    aspect_ratio=df_1['width'] / df_1['height'],
)

Now we drop the width and height columns

In [None]:
df_1 = df_1.drop(columns=['width', 'height'])

The bluetooth feature also acts as a categorical value 

In [None]:
df_1['bluetooth'].value_counts()

In [None]:
df_1['bluetooth'] = df_1['bluetooth'].astype(str)

In [None]:
df_1_enhanced = df_1.copy()

Now we add some extra features

In [None]:
# Extra engineered features. Every denominator has its zeros replaced by NaN first,
# so a zero divisor produces NaN instead of +/-inf.
cpu_cores = df_1_enhanced["cpu_cores"]
# Boost clock x core count is needed twice below, so compute it once
boost_times_cores = df_1_enhanced["cpu_boost_ghz"] * cpu_cores

# NOTE: despite their names, these two columns are clock x core count (an aggregate), not per-core values
df_1_enhanced["performance_per_core"] = df_1_enhanced["cpu_base_ghz"] * cpu_cores
df_1_enhanced["boost_per_core"] = boost_times_cores
df_1_enhanced["ram_per_core"] = df_1_enhanced["ram_gb"] / cpu_cores.replace(0, np.nan)
df_1_enhanced["storage_per_drive"] = df_1_enhanced["storage_gb"] / df_1_enhanced["storage_drive_count"].replace(0, np.nan)
df_1_enhanced["battery_efficiency"] = df_1_enhanced["battery_wh"] / df_1_enhanced["weight_kg"].replace(0, np.nan)
# Square root of the pixel count divided by the display size
df_1_enhanced["ppi"] = (df_1_enhanced["total_pixels"] ** 0.5) / df_1_enhanced["display_size_in"].replace(0, np.nan)

# extras
df_1_enhanced["total_performance_tier"] = df_1_enhanced["cpu_tier"] + df_1_enhanced["gpu_tier"]
# 1 + age keeps the denominator non-zero when years_since_release is 0
df_1_enhanced["performance_age_ratio"] = boost_times_cores / (1 + df_1_enhanced["years_since_release"])

# Turn any remaining inf into NaN, then fill every NaN in the frame with 0
df_1_enhanced = df_1_enhanced.replace([np.inf, -np.inf], np.nan).fillna(0)

For the rest of these columns, we can onehot encode

In [None]:
# Recompute the categorical columns now that resolution was dropped and bluetooth was cast to str
categorical_cols = df_1.select_dtypes(include=["object", "category"]).columns

In [None]:
categorical_cols

First we create our train and test 

In [None]:
# Features: every column except the target
X = df_1_enhanced.drop(columns='price')

In [None]:
# Target: the price column
y = df_1_enhanced['price']

In [None]:
# Skewness of the target
y.skew()

In [None]:
# Numeric (int64/float64) feature columns; reused later to pick the columns to scale
num_cols = X.select_dtypes(include=['int64', 'float64']).columns

In [None]:
# Skewness of every numeric feature, largest first
skew_values = X[num_cols].skew().sort_values(ascending=False)
skew_values

In [None]:
num_cols

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,)

We proceed to One Hot Encode

In [None]:
encoder = OneHotEncoder(drop='first')
encoded_train = encoder.fit_transform(X_train[categorical_cols]).toarray()
encoded_test = encoder.transform(X_test[categorical_cols]).toarray()

In [None]:
df_transformed_train = pd.DataFrame(encoded_train, columns=encoder.get_feature_names_out())
df_transformed_test = pd.DataFrame(encoded_test, columns=encoder.get_feature_names_out())

In [None]:
X_train_encoded = pd.concat([X_train.drop(columns=categorical_cols).reset_index(), df_transformed_train], axis=1).drop(columns='index')
X_test_encoded = pd.concat([X_test.drop(columns=categorical_cols).reset_index(), df_transformed_test], axis=1).drop(columns='index')

In [None]:
X_train_encoded

In [None]:
scaler = StandardScaler()

In [None]:
# Scale copies so the unscaled encoded frames stay available for the models marked 'scaled': False
X_train_scaled = X_train_encoded.copy()
X_test_scaled = X_test_encoded.copy()

In [None]:
# Fit the scaler on the training split only and reuse it on the test split.
# Only the columns in num_cols are scaled; the one-hot columns are left as they are.
X_train_scaled[num_cols] = scaler.fit_transform(X_train_encoded[num_cols])
X_test_scaled[num_cols] = scaler.transform(X_test_encoded[num_cols])

# Applying some models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor  

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score


In [None]:
# Candidate models for the grid search below. Each entry holds:
#   'model'  - the estimator instance
#   'scaled' - True to train on the standardised features, False for the plain encoded ones
#   'params' - the hyper-parameter grid explored by GridSearchCV
# Every estimator that accepts a seed gets random_state=42 (the same seed used for
# the train/test split) so that repeated runs produce the same scores.
models = {
    'lin': {
        'model': LinearRegression(),
        'scaled': True,
        'params': {},
    },
    'gb': {
        'model': GradientBoostingRegressor(random_state=42),
        'scaled': False,
        'params': {
            'n_estimators': [100, 300],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5],
            'subsample': [0.8, 1.0],
        },
    },
    'dt': {
        'model': DecisionTreeRegressor(random_state=42),
        'scaled': False,
        'params': {
            'max_depth': [7, 10, 15],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
        },
    },
    'rf': {
        'model': RandomForestRegressor(random_state=42),
        'scaled': False,
        'params': {
            'max_depth': [7, 10, 15],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2],
        },
    },
    'xgb': {
        'model': XGBRegressor(n_jobs=-1, random_state=42),
        'scaled': False,
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.03, 0.1, 0.3],
            'max_depth': [3, 5, 7, 10],
            'subsample': [0.7, 1.0],
            'colsample_bytree': [0.7, 1.0],
        },
    },
    'lgbm': {
        'model': LGBMRegressor(random_state=42),
        'scaled': False,
        'params': {
            'n_estimators': [300, 500],
            'max_depth': [5, 10],
            'learning_rate': [0.05, 0.1],
            'num_leaves': [31, 63],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
        },
    },
}

In [None]:
# Grid-search every candidate model and collect one summary record per model
final_models = []

for name, details in models.items():
    # Pick the feature matrices this model asks for via its 'scaled' flag
    if details['scaled']:
        X_train_final, X_test_final = X_train_scaled, X_test_scaled
    else:
        X_train_final, X_test_final = X_train_encoded, X_test_encoded

    gscv = GridSearchCV(details['model'], details['params'], cv=3, scoring='r2', n_jobs=-1)
    gscv.fit(X_train_final, y_train)

    final_models.append({
        'model': name,
        'train_score': gscv.score(X_train_final, y_train),
        'test_score': gscv.score(X_test_final, y_test),
        'best_score': gscv.best_score_,   # mean cross-validated score of the best parameter set
        'best_params': gscv.best_params_,
    })

In [None]:
# One row per model: train/test score, best cross-validation score and best parameters
final_models_df = pd.DataFrame(final_models)
final_models_df

Let's try to find the best hyperparameters for XGB

In [None]:
# Candidate values per XGBRegressor hyper-parameter; the randomized search below
# draws combinations from these lists. Single-element lists pin that setting.
param_distributions = {
    'n_estimators': [100, 200, 400, 600],          
    'max_depth': [3, 4, 5, 6, 8, 10],              
    'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.2], 
    'subsample': [0.6, 0.8, 1.0],                  
    'colsample_bytree': [0.6, 0.8, 1.0],           
    'gamma': [0, 0.1, 0.3, 0.5, 1],                
    'min_child_weight': [1, 3, 5, 7],              
    'reg_alpha': [0, 0.001, 0.01, 0.1, 1, 10],     
    'reg_lambda': [0.1, 1, 5, 10, 20],             
    'booster': ['gbtree'],                         
    'tree_method': ['hist'],           
    'random_state': [42]
}

In [None]:
# Base estimator handed to the randomized search below
xgb = XGBRegressor()

In [None]:
# Try 40 sampled parameter combinations, each scored by 3-fold cross-validated R2
random_cv = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    n_iter=40,            
    scoring='r2',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

In [None]:
# Run the search on the unscaled, one-hot encoded training data
random_cv.fit(X_train_encoded, y_train)

In [None]:
print("Best parameters:", random_cv.best_params_)
print("Best cross-val R2:", random_cv.best_score_)

In [None]:
# Rebuild the regressor with the best parameters found by the search
xgb = XGBRegressor(**random_cv.best_params_)

In [None]:
# Train the tuned regressor on the full training split
xgb.fit(X_train_encoded, y_train)

In [None]:
# The model is trained on the one-hot encoded matrix, so its column count (not the
# pre-encoding X_train's) is the number of predictors that adjusted R2 must use
n_features = X_train_encoded.shape[1]

print("** XGB Regressor TRAIN **")
y_train_predicted_xgb = xgb.predict(X_train_encoded)
print_errors(y_train, y_train_predicted_xgb, n_features)

print("** XGB Regressor TEST **")
y_test_predicted_xgb = xgb.predict(X_test_encoded)
print_errors(y_test, y_test_predicted_xgb, n_features)

In [None]:
xgb.score(X_train_encoded, y_train)

In [None]:
print(f"\nXGB Cross-Validation R2 Score: {xgb_scores.mean():.3f} ± {xgb_scores.std():.3f}")

###  Stacking Regressor

We will use LinearRegression as our final estimator

In [None]:
# Best grid-search parameters per model name, taken from the results table
best_params_by_model = final_models_df.set_index('model')['best_params']

# Each base learner is built with ITS OWN tuned parameters; XGB uses the
# parameters found by the dedicated randomized search instead of the grid search
estimators = [
    ('lin', LinearRegression(**best_params_by_model['lin'])),
    ('gb', GradientBoostingRegressor(**best_params_by_model['gb'])),
    ('xgb', XGBRegressor(**random_cv.best_params_)),
    ('lgbm', LGBMRegressor(**best_params_by_model['lgbm'])),
]

stacking_model = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())
stacking_model.fit(X_train_encoded, y_train)

In [None]:
y_train_predicted = stacking_model.predict(X_train_encoded)
y_test_predicted = stacking_model.predict(X_test_encoded)

In [None]:
print("** Stacking Regressor TRAIN **")
n_features = X_train.shape[1]
y_train_predicted = stacking_model.predict(X_train_encoded)
print_errors(y_train, y_train_predicted, n_features)

print("** Stacking Regressor TEST **")
y_test_predicted = stacking_model.predict(X_test_encoded)
print_errors(y_test, y_test_predicted, n_features)

### Cross-validation for the stacking model

In [None]:
# 3-fold cross-validated R2 of the stacking model on the training split
stacking_scores = cross_val_score(stacking_model, X_train_encoded, y_train, cv=3, scoring='r2', n_jobs=-1)

In [None]:
# Report the mean and standard deviation across the folds
print(f"\nStacking Model Cross-Validation R2 Score: {stacking_scores.mean():.3f} ± {stacking_scores.std():.3f}")

Plot the actual price vs. predicted price chart

In [None]:
results_df = pd.DataFrame({
    'Actual Price': y_test,
    'Predicted Price': y_test_predicted
})

In [None]:
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Actual Price', y='Predicted Price', data=results_df, alpha=0.6)
sns.lineplot(x=[results_df['Actual Price'].min(), results_df['Actual Price'].max()],
             y=[results_df['Actual Price'].min(), results_df['Actual Price'].max()],
             color='red', lw=2)
plt.title("Actual vs. Predicted Price")
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()

In [None]:
# With Stacking, we get a slightly better result

# Feature Importance and Interpretability

In [None]:
feature_importance = pd.DataFrame({
    'Feature': X_train_encoded.columns,
    'Importance': xgb.feature_importances_
}).sort_values(by='Importance', ascending=False)
print("\nFeature Importance (XGB):")
print(feature_importance)

feature_importance_top = feature_importance.sort_values(by='Importance', ascending=False).head(16)

In [None]:
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_top)
plt.title('XGB Feature Importance')
plt.show()

In [None]:
import shap

In [None]:
# Explain a 10% sample of the test rows (fixed seed) with the tuned XGB model
X_test_sample = X_test_encoded.sample(frac=0.1, random_state=42)
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X_test_sample)
plt.figure(figsize=(5, 6))
# show=False so the title can be added before the figure is displayed
shap.summary_plot(shap_values, X_test_sample, feature_names=X_test_encoded.columns, show=False)
plt.title('SHAP Feature Importance for XGB')
plt.show()

Both graphs indicate that a laptop’s overall performance and hardware capacity significantly influence its predicted price. The total performance tier and RAM size stand out as the top contributors, meaning faster processors and more memory strongly increase value. 

Apple products and macOS also add a clear brand premium, placing those laptops consistently at higher price points. Higher-end displays (OLED or Mini-LED) and powerful GPUs further signal premium devices, while lower pixel density or basic panels correspond to more affordable models.

Secondary features, such as battery efficiency, ultrabook form factor, and larger or faster storage, add smaller but still positive effects. Conversely, older models or those with weaker performance tiers slightly reduce the predicted price. Overall, price is driven mainly by performance, memory, and brand, with the remaining features making smaller adjustments.

Let's now save the model

In [None]:
import joblib

In [None]:
# Save into a project-relative folder rather than a user-specific absolute path,
# so the notebook runs unchanged on any machine
output_dir = "outputs"
os.makedirs(output_dir, exist_ok=True)
joblib.dump(
    stacking_model,
    os.path.join(output_dir, "stacking_model_computers.pkl")
)

In [None]:
# Actual vs. predicted test prices from the stacking model
pred_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_test_predicted
})
# Write into a project-relative folder rather than a user-specific absolute path,
# so the notebook runs unchanged on any machine
output_dir = "outputs"
os.makedirs(output_dir, exist_ok=True)
pred_df.to_csv(
    os.path.join(output_dir, "predictions.csv"),
    index=False
)

# Please upvote my notebook if you find it useful 

![](https://static.wikia.nocookie.net/smiling-friends/images/e/e6/Glep_%28SF%29.png)