In [385]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
import copy
import seaborn as sns
import tensorflow as tf
from sklearn.linear_model import LinearRegression
import re

### Dataset:
IRON WOLF. (2024). *Laptop Price - dataset* [Data set]. 
Kaggle. https://www.kaggle.com/datasets/ironwolf437/laptop-price-dataset?resource=download


In [386]:
dataset_cols = [
    "company",
    "product",
    "type",
    "inches",
    "screen_res",
    "cpu_company",
    "cpu_type",
    "cpu_speed",
    "ram",
    "memory",
    "gpu_company",
    "gpu_type",
    "weight",
    "price"
]
df = pd.read_csv("data/laptop_price - dataset.csv").drop(["OpSys"], axis=1)

In [387]:
df.columns = dataset_cols

In [388]:
df.head()

Unnamed: 0,company,product,type,inches,screen_res,cpu_company,cpu_type,cpu_speed,ram,memory,gpu_company,gpu_type,weight,price
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel,Core i5,2.3,8,128GB SSD,Intel,Iris Plus Graphics 640,1.37,1339.69
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel,Core i5,1.8,8,128GB Flash Storage,Intel,HD Graphics 6000,1.34,898.94
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel,Core i5 7200U,2.5,8,256GB SSD,Intel,HD Graphics 620,1.86,575.0
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel,Core i7,2.7,16,512GB SSD,AMD,Radeon Pro 455,1.83,2537.45
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel,Core i5,3.1,8,256GB SSD,Intel,Iris Plus Graphics 650,1.37,1803.6


In [389]:
# remove unpredictory data
df = df.drop(['product'], axis=1)

### Clean Data and preprocessing

#### numerical and categoricals splitting values for screen_res

In [390]:
def clean_screen_resolution(df):
    
    res = df['screen_res'].str.extract(r'(\d+)\s*x\s*(\d+)', expand=True)

    # makes x and y res for numerical sake for better model
    df['x_res'] = res[0].astype(float)
    df['y_res'] = res[1].astype(float)

    # pixels per inch
    df['ppi'] = np.sqrt(df['x_res']**2 + df['y_res']**2) / df['inches']

    df['touchscreen'] = df['screen_res'].apply(
        lambda x: 1 if 'touch' in x.lower() else 0
    )

    # 4️⃣ Extract panel/display type
    #    We look for specific keywords and assign a simplified label.
    def get_panel_type(text):
        if not isinstance(text, str):
            return 'ips'
        text = text.lower()
        if 'ips' in text:
            return 'ips'
        elif 'retina' in text:
            return 'retina'
        elif 'oled' in text:
            return 'oled'
        else:
            return 'ips'

    df['panel_type'] = df['screen_res'].apply(get_panel_type)

    df = df.drop(["screen_res"], axis=1)    

    return df


In [391]:

df = clean_screen_resolution(df)

In [392]:
df.sample(5)

Unnamed: 0,company,type,inches,cpu_company,cpu_type,cpu_speed,ram,memory,gpu_company,gpu_type,weight,price,x_res,y_res,ppi,touchscreen,panel_type
444,HP,Gaming,15.6,Intel,Core i7 7700HQ,2.8,8,128GB SSD + 1TB HDD,Nvidia,GeForce GTX 1050,2.2,1099.0,1920.0,1080.0,141.211998,0,ips
526,Lenovo,Notebook,15.6,Intel,Core i7 7500U,2.7,4,1TB HDD,Nvidia,GeForce 920MX,2.2,799.0,1920.0,1080.0,141.211998,0,ips
1257,Dell,Notebook,15.6,Intel,Core i3 6006U,2.0,4,500GB HDD,Intel,HD Graphics 520,2.29,490.0,1366.0,768.0,100.45467,0,ips
217,HP,Notebook,14.0,Intel,Core i7 8550U,1.8,8,256GB SSD,Nvidia,GeForce 930MX,1.63,1031.0,1920.0,1080.0,157.350512,0,ips
392,Acer,Notebook,15.6,Intel,Core i5 8250U,1.6,12,1TB HDD,Nvidia,GeForce MX130,2.2,693.99,1366.0,768.0,100.45467,0,ips


#### numerical values for memory

In [393]:

def clean_memory(df):
    df['memory'] = df['memory'].str.lower()

    # tb to gb
    df['memory'] = df['memory'].str.replace(
        r'(\d+(?:\.\d+)?)\s*tb',
        lambda m: f"{int(float(m.group(1)) * 1024)}gb",
        regex=True
    )

    df['ssd_gb'] = (
        df['memory']
        .str.findall(r'(\d+)\s*gb\s*(?:ssd|flash)')
        .apply(lambda x: int(x[0]) if x else 0)
    )

    # --- HDD ---
    df['hdd_gb'] = (
        df['memory']
        .str.findall(r'(\d+)\s*gb\s*hdd')
        .apply(lambda x: int(x[0]) if x else 0)
    )

    df = df.drop(['memory'], axis=1)
    return df



In [394]:
df = clean_memory(df)

In [395]:
df['total_memory'] = df['ssd_gb'] + df['hdd_gb']

In [396]:
df.sample(5)

Unnamed: 0,company,type,inches,cpu_company,cpu_type,cpu_speed,ram,gpu_company,gpu_type,weight,price,x_res,y_res,ppi,touchscreen,panel_type,ssd_gb,hdd_gb,total_memory
548,HP,Notebook,15.6,Intel,Core i3 6006U,2.0,4,Intel,HD Graphics 520,1.86,397.0,1920.0,1080.0,141.211998,0,ips,0,500,500
1263,Acer,Notebook,15.6,Intel,Celeron Dual Core N3060,1.6,4,Intel,HD Graphics 400,2.4,289.0,1366.0,768.0,100.45467,0,ips,0,500,500
963,Toshiba,Ultrabook,12.5,Intel,Core i7 7500U,2.7,8,Intel,HD Graphics 620,1.1,1790.0,1920.0,1080.0,176.232574,1,ips,512,0,512
606,Toshiba,Notebook,15.6,Intel,Core i7 6500U,2.5,8,Intel,HD Graphics 520,2.2,1399.0,1920.0,1080.0,141.211998,0,ips,256,0,256
95,Acer,2 in 1 Convertible,13.3,Intel,Core i5 8250U,1.6,8,Intel,UHD Graphics 620,1.5,847.0,1920.0,1080.0,165.632118,1,ips,256,0,256


#### Clean cpu_type

In [397]:
df['cpu_type'] = df['cpu_type'].str.strip().str.title()
df['cpu_series'] = df['cpu_type'].str.extract(r'([A-Za-z0-9\-]+ ?[iI]?\d?)')
df = df.drop(['cpu_type'], axis=1)


#### clean gpu_type by only extracting numbers

In [398]:
df['gpu_model_num'] = df['gpu_type'].str.extract(r'(\d+)')
# if nan
df['gpu_model_num'].fillna(df['gpu_model_num'].median(), inplace=True)


TypeError: Cannot convert ['640' '6000' '620' ... nan '5' nan] to numeric

#### Encoding categories

In [None]:
df.head(10)

Unnamed: 0,company,type,inches,cpu_company,cpu_speed,ram,gpu_company,gpu_type,weight,price,x_res,y_res,ppi,touchscreen,panel_type,ssd_gb,hdd_gb,total_memory,cpu_series
0,Apple,Ultrabook,13.3,Intel,2.3,8,Intel,Iris Plus Graphics 640,1.37,1339.69,2560.0,1600.0,226.983005,0,ips,128,0,128,Core I5
1,Apple,Ultrabook,13.3,Intel,1.8,8,Intel,HD Graphics 6000,1.34,898.94,1440.0,900.0,127.67794,0,ips,128,0,128,Core I5
2,HP,Notebook,15.6,Intel,2.5,8,Intel,HD Graphics 620,1.86,575.0,1920.0,1080.0,141.211998,0,ips,256,0,256,Core I5
3,Apple,Ultrabook,15.4,Intel,2.7,16,AMD,Radeon Pro 455,1.83,2537.45,2880.0,1800.0,220.534624,0,ips,512,0,512,Core I7
4,Apple,Ultrabook,13.3,Intel,3.1,8,Intel,Iris Plus Graphics 650,1.37,1803.6,2560.0,1600.0,226.983005,0,ips,256,0,256,Core I5
5,Acer,Notebook,15.6,AMD,3.0,4,AMD,Radeon R5,2.1,400.0,1366.0,768.0,100.45467,0,ips,0,500,500,A9-Series 9
6,Apple,Ultrabook,15.4,Intel,2.2,16,Intel,Iris Pro Graphics,2.04,2139.97,2880.0,1800.0,220.534624,0,ips,256,0,256,Core I7
7,Apple,Ultrabook,13.3,Intel,1.8,8,Intel,HD Graphics 6000,1.34,1158.7,1440.0,900.0,127.67794,0,ips,256,0,256,Core I5
8,Asus,Ultrabook,14.0,Intel,1.8,16,Nvidia,GeForce MX150,1.3,1495.0,1920.0,1080.0,157.350512,0,ips,512,0,512,Core I7
9,Acer,Ultrabook,14.0,Intel,1.6,8,Intel,UHD Graphics 620,1.6,770.0,1920.0,1080.0,157.350512,0,ips,256,0,256,Core I5


In [None]:


# companies
df = pd.get_dummies(df, dtype=int, columns=['company'], drop_first=True)

#type
df = pd.get_dummies(df, dtype=int, columns=['type'], drop_first=True)

#cpu companies
df = pd.get_dummies(df, dtype=int, columns=['cpu_company'], drop_first=True)

#gpu companies
df = pd.get_dummies(df, dtype=int, columns=['gpu_company'], drop_first=True)

# cpu series

In [None]:
df.head()


Unnamed: 0,inches,cpu_speed,ram,gpu_type,weight,price,x_res,y_res,ppi,touchscreen,...,type_Gaming,type_Netbook,type_Notebook,type_Ultrabook,type_Workstation,cpu_company_Intel,cpu_company_Samsung,gpu_company_ARM,gpu_company_Intel,gpu_company_Nvidia
0,13.3,2.3,8,Iris Plus Graphics 640,1.37,1339.69,2560.0,1600.0,226.983005,0,...,0,0,0,1,0,1,0,0,1,0
1,13.3,1.8,8,HD Graphics 6000,1.34,898.94,1440.0,900.0,127.67794,0,...,0,0,0,1,0,1,0,0,1,0
2,15.6,2.5,8,HD Graphics 620,1.86,575.0,1920.0,1080.0,141.211998,0,...,0,0,1,0,0,1,0,0,1,0
3,15.4,2.7,16,Radeon Pro 455,1.83,2537.45,2880.0,1800.0,220.534624,0,...,0,0,0,1,0,1,0,0,0,0
4,13.3,3.1,8,Iris Plus Graphics 650,1.37,1803.6,2560.0,1600.0,226.983005,0,...,0,0,0,1,0,1,0,0,1,0


In [None]:
df.iloc[2][[col for col in df.columns if 'company_' in col]]



company_Apple          0
company_Asus           0
company_Chuwi          0
company_Dell           0
company_Fujitsu        0
company_Google         0
company_HP             1
company_Huawei         0
company_LG             0
company_Lenovo         0
company_MSI            0
company_Mediacom       0
company_Microsoft      0
company_Razer          0
company_Samsung        0
company_Toshiba        0
company_Vero           0
company_Xiaomi         0
cpu_company_Intel      1
cpu_company_Samsung    0
gpu_company_ARM        0
gpu_company_Intel      1
gpu_company_Nvidia     0
Name: 2, dtype: object