In [99]:
import pandas as pd 
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import torch
from torch.utils.data import Dataset, DataLoader, Subset

In [100]:
df = pd.read_csv('laptop_clean_dataset.csv')
print(df.columns)
print(df.dtypes)

Index(['Company', 'TypeName', 'Inches', 'Weight', 'screen_resolution',
       'display_type', 'processor_speed', 'processor_brand', 'processor_type',
       'ssd_storage', 'hdd_storage', 'flash_storage', 'ram', 'os', 'Price'],
      dtype='object')
Company               object
TypeName              object
Inches               float64
Weight                object
screen_resolution     object
display_type          object
processor_speed      float64
processor_brand       object
processor_type        object
ssd_storage            int64
hdd_storage           object
flash_storage          int64
ram                    int64
os                    object
Price                float64
dtype: object


In [101]:
print(df.head())

  Company   TypeName  Inches Weight screen_resolution display_type  \
0   Apple  Ultrabook    13.3   1.37         2560x1600      Quad HD   
1   Apple  Ultrabook    13.3   1.34          1440x900           HD   
2      HP   Notebook    15.6   1.86         1920x1080      Full HD   
3   Apple  Ultrabook    15.4   1.83         2880x1800      Quad HD   
4   Apple  Ultrabook    13.3   1.37         2560x1600      Quad HD   

   processor_speed processor_brand  processor_type  ssd_storage hdd_storage  \
0              2.3           Intel         Core i5          128           0   
1              1.8           Intel         Core i5            0           0   
2              2.5           Intel   Core i5 7200U          256           0   
3              2.7           Intel         Core i7          512           0   
4              3.1           Intel         Core i5          256           0   

   flash_storage  ram     os        Price  
0              0    8  macos   71378.6832  
1            128

Weight and hdd storage have to converted to float data type. And one hot encoding has to performed on processor_type, typename, display_type, processor_brand, ram etc.

In [102]:
df['Weight']= pd.to_numeric(df['Weight'], errors='coerce').astype('float64')
df['Weight']=df['Weight'].fillna(1.18)

In [103]:
df['hdd_storage'].unique()

array(['0', '500', '1TB', '2TB', '1TB 1TB', '32', '128', '0TB'],
      dtype=object)

In [104]:
import re

def convert_hdd_to_gb(val):
    if pd.isna(val):  # handle NaN
        return 0
    # convert TB to GB
    if 'TB' in val:
        nums = re.findall(r'(\d+)TB', val)
        return sum(int(n)*1024 for n in nums)  # multiple TBs
    # Handle GB
    elif 'GB' in val:
        nums = re.findall(r'(\d+)GB', val)
        return sum(int(n) for n in nums)
    elif val.isdigit():
        return int(val)
    return 0

df['hdd_storage'] = df['hdd_storage'].astype(str).apply(convert_hdd_to_gb)


In [105]:
print(df['hdd_storage'].dtype)

int64


In [106]:

df['display_type'].unique()

array(['Quad HD', 'HD', 'Full HD', '4K'], dtype=object)

In [107]:

df['processor_brand'].unique()
#df['screen_resolution'].unique()

array(['Intel', 'AMD', 'Samsung'], dtype=object)

In [108]:
df['processor_type'].unique()

array([' Core i5', ' Core i5 7200U', ' Core i7', ' A9-Series 9420',
       ' Core i7 8550U', ' Core i5 8250U', ' Core i3 6006U', ' Core M m3',
       ' Core i7 7500U', ' Core i3 7100U', ' Core i5 7300HQ',
       ' E-Series E2-9000e', ' Core i7 8650U', ' Atom x5-Z8300',
       ' E-Series E2-6110', ' A6-Series 9220', ' Celeron Dual Core N3350',
       ' Core i3 7130U', ' Core i7 7700HQ', ' Ryzen 1700',
       ' Pentium Quad Core N4200', ' Celeron Dual Core N3060',
       ' FX 9830P', ' Core i7 7560U', ' E-Series 6110', ' Core i5 6200U',
       ' Core M 6Y75', ' Core i5 7500U', ' Core i7 6920HQ',
       ' Core i5 7Y54', ' Core i7 7820HK', ' Xeon E3-1505M V6',
       ' Core i7 6500U', ' E-Series 9000e', ' A10-Series A10-9620P',
       ' A6-Series A6-9220', ' Core i7 6600U', ' Celeron Dual Core 3205U',
       ' Core i7 7820HQ', ' A10-Series 9600P', ' Core i7 7600U',
       ' A8-Series 7410', ' Celeron Dual Core 3855U',
       ' Pentium Quad Core N3710', ' A12-Series 9720P', ' Core i5 7300U'

In [109]:
df['Company'].unique()

array(['Apple', 'HP', 'Acer', 'Asus', 'Dell', 'Lenovo', 'Chuwi', 'MSI',
       'Microsoft', 'Toshiba', 'Huawei', 'Xiaomi', 'Razer', 'Mediacom',
       'Samsung', 'Google', 'Fujitsu', 'LG'], dtype=object)

In [110]:
df['TypeName'].unique()

array(['Ultrabook', 'Notebook', 'Gaming', '2 in 1 Convertible',
       'Workstation'], dtype=object)

In [111]:
categorical_cols = ['Company','TypeName','display_type','processor_brand','os']
numeric_cols = ['Inches','Weight','processor_speed','ssd_storage','hdd_storage','flash_storage','ram','Price']


Y = df['Price']
X = pd.get_dummies(df, columns=categorical_cols, dtype=int)
X.drop(columns=['Price'], inplace=True)
print(X.shape)
print(X)
print(X.shape, ' ', Y.shape)

(1243, 45)
      Inches  Weight screen_resolution  processor_speed  \
0       13.3    1.37         2560x1600              2.3   
1       13.3    1.34          1440x900              1.8   
2       15.6    1.86         1920x1080              2.5   
3       15.4    1.83         2880x1800              2.7   
4       13.3    1.37         2560x1600              3.1   
...      ...     ...               ...              ...   
1238    15.6    2.20          1366x768              2.5   
1239    14.0    1.80         1920x1080              2.5   
1240    13.3    1.30         3200x1800              2.5   
1241    14.0    1.50          1366x768              1.6   
1242    15.6    2.19          1366x768              2.5   

                processor_type  ssd_storage  hdd_storage  flash_storage  ram  \
0                      Core i5          128            0              0    8   
1                      Core i5            0            0            128    8   
2                Core i5 7200U          

In [112]:
def resolution_to_pixels(res):
    if pd.isna(res): return 0
    try:
        w, h = res.split('x')
        return int(w) * int(h)
    except:
        return 0

df['total_pixels'] = df['screen_resolution'].apply(resolution_to_pixels)
df['total_pixels'].unique()

array([4096000, 1296000, 2073600, 5184000, 1049088, 3317760, 5760000,
       3393024, 8294400, 3110400, 1440000, 3686400, 4990464, 3840000,
       2304000])

In [113]:
sr = df['screen_resolution']
X.drop(columns=['screen_resolution'], inplace=True)
X = pd.concat([X, df['total_pixels']], axis=1)

In [114]:
print(X.columns)
print(X['total_pixels'].head())

Index(['Inches', 'Weight', 'processor_speed', 'processor_type', 'ssd_storage',
       'hdd_storage', 'flash_storage', 'ram', 'Company_Acer', 'Company_Apple',
       'Company_Asus', 'Company_Chuwi', 'Company_Dell', 'Company_Fujitsu',
       'Company_Google', 'Company_HP', 'Company_Huawei', 'Company_LG',
       'Company_Lenovo', 'Company_MSI', 'Company_Mediacom',
       'Company_Microsoft', 'Company_Razer', 'Company_Samsung',
       'Company_Toshiba', 'Company_Xiaomi', 'TypeName_2 in 1 Convertible',
       'TypeName_Gaming', 'TypeName_Notebook', 'TypeName_Ultrabook',
       'TypeName_Workstation', 'display_type_4K', 'display_type_Full HD',
       'display_type_HD', 'display_type_Quad HD', 'processor_brand_AMD',
       'processor_brand_Intel', 'processor_brand_Samsung', 'os_chrome',
       'os_linux', 'os_mac', 'os_macos', 'os_no', 'os_windows',
       'total_pixels'],
      dtype='object')
0    4096000
1    1296000
2    2073600
3    5184000
4    4096000
Name: total_pixels, dtype: int64


In [115]:
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

#processor_type -> integer indices (reserve 0 for unknown)

proc_series = df['processor_type'].fillna('Unknown').astype(str)
unique_procs = proc_series.unique().tolist()  
#get a list of all unique processor types

proc2idx = {p: i+1 for i, p in enumerate(unique_procs)}   # index 0 = unknown
#Build a dictionary that maps each unique processor type to an integer ID starting from 1

df['proc_idx'] = proc_series.map(lambda x: proc2idx.get(x, 0)).astype(int)

num_proc_categories = len(proc2idx) + 1  # +1 for index 0

In [116]:
X.drop(columns=['processor_type'], inplace=True)

In [117]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [118]:
class LaptopDataset(Dataset):
    def __init__(self, numeric_array, proc_idx_array, y_array):
        self.numeric = torch.from_numpy(numeric_array).float()
        self.proc_idx = torch.from_numpy(proc_idx_array).long()
        self.y = torch.from_numpy(y_array).float()
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.numeric[idx], self.proc_idx[idx], self.y[idx]


#dataset is a protocol (like an interface). I have to implement _len_ and _getitem_
#self.numeric converts numpy array into a pytorch FloatTensor 

In [119]:
class TabularEmbeddingModel(nn.Module):
    def __init__(self, num_proc_categories, emb_dim, num_numeric, hidden_layers=[128,64], dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(num_proc_categories, emb_dim, padding_idx=0)
        input_dim = emb_dim + num_numeric
        layers = []
        in_dim = input_dim
        for h in hidden_layers:
            layers.append(nn.Linear(in_dim, h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            in_dim = h
        layers.append(nn.Linear(in_dim, 1))
        self.net = nn.Sequential(*layers)
    def forward(self, numeric, proc_idx):
        emb = self.embedding(proc_idx)
        x = torch.cat([numeric, emb], dim=1)
        return self.net(x)


In [120]:
# K-Fold parameters
k = 8
kf = KFold(n_splits=k, shuffle=True, random_state=2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [121]:
# Convert DataFrame and columns to numpy arrays (safe copies)
X_values = X.values.astype(np.float32)           # shape (N, num_features)
proc_idx_array = df['proc_idx'].values.astype(np.int64)  # (N,)
y_array = df['Price'].values.astype(np.float32).reshape(-1, 1)  # (N,1)

In [122]:
def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0.0
    for X_num, proc_idx_batch, y_batch in loader:
        X_num, proc_idx_batch, y_batch = X_num.to(device), proc_idx_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        preds = model(X_num, proc_idx_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * X_num.size(0)
    return total_loss / len(loader.dataset)


In [123]:
def evaluate_model(model, val_dl, loss_fn, device):
    model.eval()
    total_loss = 0
    preds_all, y_all = [], []
    with torch.no_grad():
        for numeric, proc_idx, y in val_dl:
            numeric, proc_idx, y = numeric.to(device), proc_idx.to(device), y.to(device)
            preds = model(numeric, proc_idx).squeeze(1)
            loss = loss_fn(preds, y)
            total_loss += loss.item() * len(y)

            preds_all.extend(preds.cpu().numpy())
            y_all.extend(y.cpu().numpy())

    avg_loss = total_loss / len(val_dl.dataset)
    rmse = np.sqrt(avg_loss)
    return rmse, np.array(preds_all), np.array(y_all)

In [124]:
from torch.utils.data import Subset


fold_rmse_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"\n===== Fold {fold+1}/{k} =====")

    train_subset = LaptopDataset(X_train.iloc[train_idx].to_numpy(),
                                 proc_train[train_idx],
                                 Y_train[train_idx])
    val_subset = LaptopDataset(X_train.iloc[val_idx].to_numpy(),
                               proc_train[val_idx],
                               Y_train[val_idx])

    train_dl = DataLoader(train_subset, batch_size=32, shuffle=True)
    val_dl = DataLoader(val_subset, batch_size=32, shuffle=False)

    # Fresh model per fold
    model = TabularEmbeddingModel(num_proc_categories=num_proc_categories,
                                  emb_dim=8,   # embedding dimension choice
                                  num_numeric=X_train.shape[1],
                                  hidden_layers=[128, 64],
                                  dropout=0.2).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    best_val = float("inf")
    patience, no_improve = 20, 0
    best_state = None

    for epoch in range(500):
        train_loss = train_one_epoch(model, train_dl, optimizer, loss_fn, device)
        val_rmse, _, _ = evaluate_model(model, val_dl, loss_fn, device)

        if val_rmse < best_val - 1e-4:
            best_val = val_rmse
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= patience:
                break

    # Load best weights
    if best_state is not None:
        model.load_state_dict(best_state)

    print(f"Fold {fold+1} RMSE: {best_val:.4f}")
    fold_rmse_scores.append(best_val)

print("\n==== Final Cross-Validation Results ====")
for i, score in enumerate(fold_rmse_scores, 1):
    print(f"Fold {i}: {score:.4f}")
print(f"Average RMSE: {np.mean(fold_rmse_scores):.4f}")



===== Fold 1/8 =====


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Fold 1 RMSE: 53033.8941

===== Fold 2/8 =====
Fold 2 RMSE: 47121.8141

===== Fold 3/8 =====
Fold 3 RMSE: 48899.9691

===== Fold 4/8 =====
Fold 4 RMSE: 52523.3511

===== Fold 5/8 =====
Fold 5 RMSE: 51561.5905

===== Fold 6/8 =====
Fold 6 RMSE: 42753.4144

===== Fold 7/8 =====
Fold 7 RMSE: 46591.1017

===== Fold 8/8 =====
Fold 8 RMSE: 42732.5182

==== Final Cross-Validation Results ====
Fold 1: 53033.8941
Fold 2: 47121.8141
Fold 3: 48899.9691
Fold 4: 52523.3511
Fold 5: 51561.5905
Fold 6: 42753.4144
Fold 7: 46591.1017
Fold 8: 42732.5182
Average RMSE: 48152.2067
