In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
import copy
import seaborn as sns
import tensorflow as tf
from sklearn.linear_model import LinearRegression

### Dataset:
IRON WOLF. (2024). *Laptop Price - dataset* [Data set]. 
Kaggle. https://www.kaggle.com/datasets/ironwolf437/laptop-price-dataset?resource=download


In [102]:
dataset_cols = [
    "company",
    "product",
    "type",
    "inches",
    "screen_res",
    "cpu_company",
    "cpu_type",
    "cpu_speed",
    "ram",
    "memory",
    "gpu_company",
    "gpu_type",
    "weight",
    "price"
]
df = pd.read_csv("data/laptop_price - dataset.csv").drop(["OpSys"], axis=1)

In [103]:
df.columns = dataset_cols

In [104]:
df.head()

Unnamed: 0,company,product,type,inches,screen_res,cpu_company,cpu_type,cpu_speed,ram,memory,gpu_company,gpu_type,weight,price
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel,Core i5,2.3,8,128GB SSD,Intel,Iris Plus Graphics 640,1.37,1339.69
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel,Core i5,1.8,8,128GB Flash Storage,Intel,HD Graphics 6000,1.34,898.94
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel,Core i5 7200U,2.5,8,256GB SSD,Intel,HD Graphics 620,1.86,575.0
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel,Core i7,2.7,16,512GB SSD,AMD,Radeon Pro 455,1.83,2537.45
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel,Core i5,3.1,8,256GB SSD,Intel,Iris Plus Graphics 650,1.37,1803.6


### Clean Data

#### numerical and categoricals splitting values for screen_res

In [105]:
def clean_screen_resolution(df):
    
    res = df['screen_res'].str.extract(r'(\d+)\s*x\s*(\d+)', expand=True)

    # makes x and y res for numerical sake for better model
    df['x_res'] = res[0].astype(float)
    df['y_res'] = res[1].astype(float)

    # pixels per inch
    df['ppi'] = np.sqrt(df['x_res']**2 + df['y_res']**2) / df['inches']

    df['touchscreen'] = df['screen_res'].apply(
        lambda x: 1 if 'touch' in x.lower() else 0
    )

    # 4️⃣ Extract panel/display type
    #    We look for specific keywords and assign a simplified label.
    def get_panel_type(text):
        if not isinstance(text, str):
            return 'ips'
        text = text.lower()
        if 'ips' in text:
            return 'ips'
        elif 'retina' in text:
            return 'retina'
        elif 'oled' in text:
            return 'oled'
        else:
            return 'ips'

    df['panel_type'] = df['screen_res'].apply(get_panel_type)

    # get rid of the screen res since we put in other places
    df.drop('screen_res', axis=1)

    return df


In [106]:
df = clean_screen_resolution(df)

In [107]:
df.head()

Unnamed: 0,company,product,type,inches,screen_res,cpu_company,cpu_type,cpu_speed,ram,memory,gpu_company,gpu_type,weight,price,x_res,y_res,ppi,touchscreen,panel_type
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel,Core i5,2.3,8,128GB SSD,Intel,Iris Plus Graphics 640,1.37,1339.69,2560.0,1600.0,226.983005,0,ips
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel,Core i5,1.8,8,128GB Flash Storage,Intel,HD Graphics 6000,1.34,898.94,1440.0,900.0,127.67794,0,ips
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel,Core i5 7200U,2.5,8,256GB SSD,Intel,HD Graphics 620,1.86,575.0,1920.0,1080.0,141.211998,0,ips
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel,Core i7,2.7,16,512GB SSD,AMD,Radeon Pro 455,1.83,2537.45,2880.0,1800.0,220.534624,0,ips
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel,Core i5,3.1,8,256GB SSD,Intel,Iris Plus Graphics 650,1.37,1803.6,2560.0,1600.0,226.983005,0,ips


#### numerical values for memory