In [None]:
import pandas as pd
import numpy as np

In [None]:
import re

# Load the raw laptop listings from CSV; the cells below add and drop columns on this frame.
df = pd.read_csv("laptop_price_php.csv")

def calculate_ppi(x_res, y_res, inches):
    """Return the pixel density (pixels per inch) of a display.

    The diagonal length in pixels, sqrt(x_res^2 + y_res^2), is divided by
    the diagonal size in inches. A size of exactly 0 yields NaN instead of
    attempting the division.
    """
    if inches != 0:
        diagonal_px = (x_res ** 2 + y_res ** 2) ** 0.5
        return diagonal_px / inches
    return np.nan

def convert_to_gb(memory):
    """Parse a storage description into total capacity and drive types.

    Args:
        memory: Text such as "256GB SSD + 1TB HDD"; individual drives are
            separated by "+".

    Returns:
        A ``(total_gb, types)`` tuple. ``total_gb`` is the summed capacity
        in GB as an int (1 TB counts as 1024 GB). ``types`` is the
        recognised drive types ("SSD", "HDD", "Flash Storage", "Hybrid")
        joined with " + " in the order the drives appear.
    """
    total_gb = 0
    drive_types = []

    # Split on the bare "+" so irregular spacing around it cannot merge two drives into one part.
    for part in memory.split("+"):
        # Accept decimal sizes: with a bare \d+ the text "1.0TB" matched only
        # "0TB" and the drive silently contributed 0 GB.
        size_match = re.search(r'(\d+(?:\.\d+)?)\s*(GB|TB)', part)
        if size_match:
            size = float(size_match.group(1))
            if size_match.group(2) == "TB":
                size *= 1024
            # Keep the total an int, as it was before decimals were supported.
            total_gb += int(round(size))

        if "SSD" in part:
            drive_types.append("SSD")
        elif "HDD" in part:
            drive_types.append("HDD")
        elif "Flash" in part:
            drive_types.append("Flash Storage")
        elif "Hybrid" in part:
            drive_types.append("Hybrid")

    return total_gb, " + ".join(drive_types)

def categorize_gpu_brand(row):
    """Label a GPU as "Dedicated" or "Integrated".

    ``row`` must provide "Gpu_Brand" and "Gpu_Model" strings; matching is
    case-insensitive. Nvidia is always dedicated and Intel always
    integrated. AMD is dedicated only when the model name contains one of
    the markers below. Any other brand is treated as integrated.
    """
    vendor = row["Gpu_Brand"].lower()
    model_name = row["Gpu_Model"].lower()

    if "nvidia" in vendor:
        return "Dedicated"
    if "intel" in vendor:
        return "Integrated"

    amd_dedicated_markers = ("radeon rx", "firepro", "vega", "pro")
    is_dedicated_amd = "amd" in vendor and any(
        marker in model_name for marker in amd_dedicated_markers
    )
    return "Dedicated" if is_dedicated_amd else "Integrated"

def classify_cpu_series(model):
    """Map a CPU description to a tier label.

    Keywords are matched case-insensitively and tiers are checked from the
    top down, so the first tier with a matching keyword wins. Anything
    without a match is "Entry-Level".
    """
    cpu_name = model.lower()
    tier_keywords = (
        ("High-End", ("i9", "ryzen 9", "xeon")),
        ("Mid-Range", ("i7", "ryzen 7")),
    )
    for tier, keywords in tier_keywords:
        if any(keyword in cpu_name for keyword in keywords):
            return tier
    return "Entry-Level"

def categorize_os(os):
    """Group an operating-system name into a coarse family.

    Matching is case-insensitive and uses substring checks in the order
    Windows, Mac, then Linux/Chrome. Names matching none of these are
    "Others".
    """
    os_name = os.lower()
    families = (
        ("Windows", ("windows",)),
        ("MacOS", ("mac",)),
        ("Linux/Chrome", ("linux", "chrome")),
    )
    for family, keywords in families:
        if any(keyword in os_name for keyword in keywords):
            return family
    return "Others"

def categorize_storage_type(storage):
    """Collapse a drive-type string into a coarse storage category.

    Args:
        storage: Drive types such as "SSD", "HDD" or "SSD + HDD"
            (matched case-insensitively).

    Returns:
        "Dual Storage" when both an SSD and an HDD are present,
        "SSD-Based" for SSD, flash or hybrid storage,
        "HDD-Based" when only HDDs are present,
        or None when no known drive type is found.
    """
    storage = storage.lower()
    has_ssd = "ssd" in storage
    has_hdd = "hdd" in storage

    # This combination must be tested first: the SSD check used to run before
    # it, so the "Dual Storage" branch could never be reached.
    if has_ssd and has_hdd:
        return "Dual Storage"
    if has_ssd or "flash" in storage or "hybrid" in storage:
        return "SSD-Based"
    if has_hdd:
        # Also covers multi-HDD setups ("hdd + hdd"), which previously fell
        # through every branch and returned None.
        return "HDD-Based"
    return None

# Lookup from laptop type name (lower-case keys) to market segment,
# built from one (segment, type names) group per segment.
type_mapping = {
    type_name: segment
    for segment, type_names in (
        ("Budget", ("notebook", "netbook")),
        ("Premium", ("ultrabook", "2 in 1 convertible")),
        ("Performance", ("gaming", "workstation")),
    )
    for type_name in type_names
}

# Feature engineering on `df`.
# NOTE: this cell rebinds and mutates `df` and finally drops its own input columns,
# so re-running it without re-reading the CSV raises a KeyError.

# Encode each brand by the number of rows it has in the dataset, then compress with log1p.
brand_counts = df["Company"].value_counts()
df["Company_Freq"] = df["Company"].map(brand_counts)
df["Company_LogFreq"] = np.log1p(df["Company_Freq"])

# Normalise the type labels, collapse them through type_mapping (unmapped -> "Other"), one-hot encode.
df["TypeName"] = df["TypeName"].str.strip().str.lower()
df["TypeName_Category"] = df["TypeName"].map(type_mapping).fillna("Other")
df = pd.get_dummies(df, columns=["TypeName_Category"], drop_first=False)

# Take the digits directly before and after the "x" in the resolution text as width and height,
# then derive pixel density row by row.
df["X_res"] = df["ScreenResolution"].str.extract(r'(\d+)x')[0].astype(float)
df["Y_res"] = df["ScreenResolution"].str.extract(r'x(\d+)')[0].astype(float)
df["PPI"] = df.apply(lambda row: calculate_ppi(row["X_res"], row["Y_res"], row["Inches"]), axis=1)

# Replace the raw CPU string with its tier label and one-hot encode it (Cpu_<tier> columns).
df["Cpu"] = df["Cpu"].apply(classify_cpu_series)
df = pd.get_dummies(df, columns=["Cpu"], drop_first=False)

# Strip the unit suffixes so RAM and weight become numeric; the new names carry the unit.
df["Ram"] = df["Ram"].str.replace("GB", "").astype(int)
df["Weight"] = df["Weight"].str.replace("kg", "").astype(float)
df.rename(columns={"Ram": "Ram_GB", "Weight": "Weight_Kg"}, inplace=True)

# Split the memory description into total capacity and a drive-type string, then categorise the type.
df[["Storage_Size_GB", "Storage_Type"]] = df["Memory"].apply(lambda x: pd.Series(convert_to_gb(x)))
df["Storage_Category"] = df["Storage_Type"].apply(categorize_storage_type)
df = pd.get_dummies(df, columns=["Storage_Category"], drop_first=False)

# The first whitespace-separated token of the GPU string is taken as the brand, the rest as the model.
df["Gpu_Brand"] = df["Gpu"].apply(lambda x: x.split()[0])
df["Gpu_Model"] = df["Gpu"].apply(lambda x: " ".join(x.split()[1:]))
df["Gpu_group"] = df.apply(categorize_gpu_brand, axis=1)
df = pd.get_dummies(df, columns=["Gpu_group"], drop_first=False)

# Reduce the operating system to a coarse family and one-hot encode it.
df["OpSys_Category"] = df["OpSys"].apply(categorize_os)
df = pd.get_dummies(df, columns=["OpSys_Category"], drop_first=False)

# Drop the identifier, the raw text columns and the intermediate helpers; Company_Freq is kept.
df.drop(columns=['laptop_ID', 'Company', 'Product', 'TypeName', 'OpSys', 'ScreenResolution', "X_res", "Y_res", 'Gpu_Model', 'Gpu_Brand','Gpu', 'Memory', 'Storage_Type'], inplace=True)
# Reorder so the target column Price_php comes last.
df = df[[col for col in df.columns if col != "Price_php"] + ["Price_php"]]


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

def preprocess(dframe):
    """Derive model features from the cleaned laptop frame.

    Works on a deep copy, so ``dframe`` itself is left untouched.

    Steps, in order:
      1. Fill missing ``Weight_Kg`` with the median, bucket it into
         ``Weight_Category`` (1-5) and drop the raw weight.
      2. Build ``Cpu_Score`` and ``Gpu_Score`` from the one-hot tier columns.
      3. Derive ``Storage_Impact``, ``Performance_Index``,
         ``Performance_to_Weight`` and ``Display_Quality`` from the raw
         (unscaled) values.
      4. Standardise ``PPI``, ``Ram_GB``, ``Storage_Size_GB`` and
         ``Company_LogFreq``.

    Args:
        dframe: DataFrame holding the columns named above plus the
            ``Cpu_*``, ``Gpu_group_*`` and ``Storage_Category_*`` one-hot
            columns and ``Inches``.

    Returns:
        A new DataFrame with the derived columns added, ``Weight_Kg``
        removed and the four numerical columns standardised.

    Note:
        The scaler is fitted on whatever frame is passed in. When that is
        the full dataset, validation/test rows influence the scaling
        statistics.
    """
    test_df = dframe.copy(deep=True)

    numerical_features = ["PPI", "Ram_GB", "Storage_Size_GB", "Company_LogFreq"]
    cpu_benchmarks = {"Entry-Level": 3000, "Mid-Range": 9000, "High-End": 18000}
    gpu_benchmarks = {"Integrated": 2000, "Dedicated": 15000}

    def _benchmark_score(frame, prefix, benchmarks):
        # One-hot encoding only creates a column for categories that occur in
        # the data, so an absent category contributes nothing instead of
        # raising a KeyError.
        return sum(
            frame[prefix + level] * points
            for level, points in benchmarks.items()
            if prefix + level in frame.columns
        )

    median_weight = test_df["Weight_Kg"].median()
    test_df["Weight_Kg"] = test_df["Weight_Kg"].fillna(median_weight)
    test_df["Weight_Category"] = pd.cut(
        test_df["Weight_Kg"],
        # Open-ended top bin: weights above 5 kg used to fall outside every
        # bin and became NaN.
        bins=[0, 1.5, 2.0, 2.5, 3.0, np.inf],
        labels=[1, 2, 3, 4, 5]
    ).astype(float)
    test_df = test_df.drop(columns=["Weight_Kg"])

    test_df["Cpu_Score"] = _benchmark_score(test_df, "Cpu_", cpu_benchmarks)
    test_df["Gpu_Score"] = _benchmark_score(test_df, "Gpu_group_", gpu_benchmarks)

    # Derived features are computed BEFORE standardisation. Previously they
    # were built from z-scores, so below-average storage had a negative
    # "impact" that the SSD multiplier pushed further down, and RAM was
    # negligible next to the benchmark scores.
    ssd_flag = test_df.get("Storage_Category_SSD-Based", 0)
    test_df["Storage_Impact"] = test_df["Storage_Size_GB"] * (ssd_flag * 1.5 + 1)

    test_df["Performance_Index"] = (
        (test_df["Cpu_Score"] * 5.0) +
        (test_df["Gpu_Score"] * 5.0) +
        (test_df["Ram_GB"] * 2.0) +
        test_df["Storage_Impact"]
    )

    # The +1 keeps the divisor at 2 or more for the lightest category.
    test_df["Performance_to_Weight"] = test_df["Performance_Index"] / (test_df["Weight_Category"] + 1)

    test_df["Display_Quality"] = test_df["PPI"] * test_df["Inches"]

    # Standardise last so it only rescales the base columns themselves.
    scaler = StandardScaler()
    test_df[numerical_features] = scaler.fit_transform(test_df[numerical_features])

    return test_df

# Build the model-ready frame; preprocess() works on a copy, so `df` keeps the cleaned values.
dcdone = preprocess(df)

In [None]:
# Persist the cleaned frame. Note this writes `df` (before preprocess), not `dcdone`.
df.to_csv("cleaned_data.csv", index=False)

In [None]:
# Shuffle once with a fixed seed so the 70/15/15 split is identical on every
# Restart & Run All; without it each run trains and evaluates on different rows.
shuffled = dcdone.sample(frac=1, random_state=42)
trainsize = int(len(shuffled) * 0.70)
validsize = int(len(shuffled) * 0.15)

# Contiguous, non-overlapping slices; the test set takes the remainder so no
# row is lost to integer rounding.
df_train = shuffled.iloc[:trainsize, :]
df_valid = shuffled.iloc[trainsize:(trainsize+validsize), :]
df_test = shuffled.iloc[(trainsize+validsize):, :]

In [None]:
# Write each split to its own CSV, without the index column.
for split_frame, split_path in (
    (df_train, "train_data.csv"),
    (df_valid, "valid_data.csv"),
    (df_test, "test_data.csv"),
):
    split_frame.to_csv(split_path, index=False)

In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Use the df_train / df_valid / df_test frames produced by the split cell above.
# They were previously overwritten here with three copies of the full dataset,
# so early stopping and every reported metric were computed on training rows.
assert df_train.index.intersection(df_test.index).empty, "train/test rows overlap"
assert df_train.index.intersection(df_valid.index).empty, "train/valid rows overlap"

X_train, y_train = df_train.drop(columns=["Price_php"]), df_train["Price_php"]
X_valid, y_valid = df_valid.drop(columns=["Price_php"]), df_valid["Price_php"]
X_test, y_test = df_test.drop(columns=["Price_php"]), df_test["Price_php"]

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "learning_rate": 0.03,
    "max_depth": 5,
    "min_child_weight": 1,
    "subsample": 0.8,
    "colsample_bytree": 0.6,
    "gamma": 0.5,
    "reg_alpha": 3.0,
    "reg_lambda": 4.0,
}

# Filled by xgb.train with the per-round metric for each entry in `evals`.
evals_result = {}

# Early stopping monitors the LAST entry in `evals`, i.e. the validation set.
xgb_model = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=[(dtrain, "train"), (dvalid, "valid")],
    early_stopping_rounds=50,
    evals_result=evals_result,
    verbose_eval=True
)

# The returned booster still holds the rounds trained after the best one, so
# restrict prediction to the best early-stopped iteration.
y_pred = xgb_model.predict(dtest, iteration_range=(0, xgb_model.best_iteration + 1))

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# y_pred is a plain array in the same row order as y_test, so it lines up positionally.
results_df = pd.DataFrame({"Actual Price": y_test, "Predicted Price": y_pred})
results_df["Error"] = results_df["Actual Price"] - results_df["Predicted Price"]
results_df["Absolute Error"] = results_df["Error"].abs()

print("\nModel Performance:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")

print("\nSummary Statistics:")
print("Actual Price:", results_df["Actual Price"].describe())
print("\nPredicted Price:", results_df["Predicted Price"].describe())
print("\nAbsolute Error:", results_df["Absolute Error"].describe())

# plot_importance creates its own figure unless an Axes is passed, which left
# the figure opened by plt.figure() empty; draw on an explicit Axes instead.
fig, ax = plt.subplots(figsize=(10, 6))
xgb.plot_importance(xgb_model, max_num_features=10, ax=ax, title="Feature Importance")
plt.show()

plt.figure(figsize=(10,6))
plt.plot(evals_result["train"]["rmse"], label="Train RMSE", linestyle="dashed", linewidth=2, marker="o", markersize=4, color="blue")
plt.plot(evals_result["valid"]["rmse"], label="Validation RMSE", linewidth=2, marker="x", markersize=4, color="orange")
plt.xlabel("Number of Boosting Rounds")
plt.ylabel("RMSE")
plt.title("XGBoost Learning Curve")
plt.legend()
plt.grid()
plt.show()

plt.figure(figsize=(10,6))
sns.scatterplot(x=results_df["Actual Price"], y=results_df["Predicted Price"], alpha=0.6)
# Red dashed diagonal marks perfect predictions (predicted == actual).
plt.plot([0, results_df["Actual Price"].max()], [0, results_df["Actual Price"].max()], color="red", linestyle="dashed")
plt.xlabel("Actual Price (PHP)")
plt.ylabel("Predicted Price (PHP)")
plt.title("Actual vs. Predicted Laptop Prices")
plt.show()

plt.figure(figsize=(10, 5))
sns.histplot(results_df["Error"], bins=50, kde=True)
plt.axvline(0, color="red", linestyle="dashed")
plt.xlabel("Prediction Error (PHP)")
plt.ylabel("Frequency")
plt.title("Distribution of Prediction Errors")
plt.show()
