In [7]:
import pandas as pd
import numpy as np

# load the data
#UnicodeDecodeError, which means Python can't read the file using the default utf-8 encoding.

df = pd.read_csv('laptop_price.csv', encoding='latin1')


df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [11]:
# data cleaning and preprocssing

# check for missing values
print(df.isnull().sum())

# check dataset info and columns
print(df.info())

# check for duplicates
print("Duplicates: ",df.duplicated().sum())

# drop duplicates if any
df.drop_duplicates(inplace=True)

laptop_ID           0
Company             0
Product             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price_euros         0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   Product           1303 non-null   object 
 3   TypeName          1303 non-null   object 
 4   Inches            1303 non-null   float64
 5   ScreenResolution  1303 non-null   object 
 6   Cpu               1303 non-null   object 
 7   Ram               1303 non-null   object 
 8   Memory            1303 non-null   object 
 9   Gpu               1303 non-null   object 
 10  OpSys             1303 no

In [19]:
# feature transform.
# ram, cpu, memory are categorical, we need to convert them

# convert ram to numeric

# ensuring 'Ram' is string before replacing
df['Ram']=df['Ram'].astype(str).str.replace('GB','').astype(int)


In [23]:
#convert weight to float
# we need to remove Kg before converting
df['Weight']=df['Weight'].astype(str).str.replace('kg','').astype(float)

In [None]:
# converting price to int
df['Price_euros']=df['Price_euros'].astype(int)

In [30]:
# extract screen size from 'Screen Resolution (e.g: 1920X1080)

df['ScreenResolution']=df['ScreenResolution'].str.replace('Touchscreen','')
df['ScreenResolution']=df['ScreenResolution'].str.replace('IPS','')
df['ScreenResolution']=df['ScreenResolution'].str.strip()
#removes any leading or trailing whitespace (spaces, tabs, newline characters) from the values in the ScreenResolution column
# 'e.g'
#' 1920x1080' → has leading space
#'1920x1080 ' → has trailing space



In [41]:
# extract width and height from the resolution

import re

def extract_width(res):
    match = re.search(r'(\d+)x', res)
    return int(match.group(1)) if match else None

def extract_height(res):
    match = re.search(r'x(\d+)', res)  # <- fixed here
    return int(match.group(1)) if match else None

df['X_res'] = df['ScreenResolution'].apply(extract_width)
df['Y_res'] = df['ScreenResolution'].apply(extract_height)

#(\d+)x: captures digits before the x (i.e., width)

#x(\d+): captures digits after the x (i.e., height)

In [44]:
# calculate pixels per inch
df['PPI']=((df['X_res']**2+df['Y_res']**2)**0.5/df['Inches']).astype(float)

#Higher PPI = Sharper screen
#Lower PPI = More pixelated display
# PPI= root((X_res)2 + (Y_res)2) / Screen size in Inches
# It combines resolution and screen size into one powerful feature.

In [45]:
# drop screen resolution and Inches after extracting info (optional)
# df.drop(['ScreenResolution','Inches],axis=1,inplace=True)


In [53]:
# simplify CPU brand names

df['Cpu_brand']=df['Cpu'].apply(lambda x: x.split()[0])

df['Cpu_brand'].replace({'Intel':'Intel','AMD':'AMD','Celeron':'Intel','Pentium': 'Intel'},inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cpu_brand'].replace({'Intel':'Intel','AMD':'AMD','Celeron':'Intel','Pentium': 'Intel'},inplace=True)


In [55]:
def process_memory(mem):
    try:
        mem = str(mem).replace('GB', '').replace('TB', '000')  # Normalize to GB
        ssd = 0
        hdd = 0
        if '+' in mem:
            parts = mem.split('+')
            for part in parts:
                if 'SSD' in part:
                    ssd += int(re.findall(r'\d+', part)[0])
                elif 'HDD' in part:
                    hdd += int(re.findall(r'\d+', part)[0])
                elif 'Flash Storage' in part:
                    ssd += int(re.findall(r'\d+', part)[0])
        else:
            if 'SSD' in mem:
                ssd = int(re.findall(r'\d+', mem)[0])
            elif 'HDD' in mem:
                hdd = int(re.findall(r'\d+', mem)[0])
            elif 'Flash Storage' in mem:
                ssd = int(re.findall(r'\d+', mem)[0])
        return pd.Series([ssd, hdd])
    except:
        return pd.Series([0, 0])  # default if parsing fails

# Apply safely
df[['SSD', 'HDD']] = df['Memory'].apply(process_memory)
df.drop('Memory', axis=1, inplace=True)


In [64]:
# model training

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

X=df.drop('Price_euros',axis=1)
y=df['Price_euros']


In [65]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:

cols_to_drop = ['Company', 'Product', 'TypeName', 'ScreenResolution', 'Cpu', 'Gpu', 'OpSys', 'Cpu_brand']
existing_cols_to_drop = [col for col in cols_to_drop if col in df.columns]
df.drop(columns=existing_cols_to_drop, inplace=True)



In [122]:
X_train.dtypes
# all should be in int or float

laptop_ID             int64
Company              object
Product              object
TypeName             object
Inches              float64
ScreenResolution     object
Cpu                  object
Ram                   int64
Gpu                  object
OpSys                object
Weight              float64
X_res                 int64
Y_res                 int64
PPI                 float64
Cpu_brand            object
SSD                   int64
HDD                   int64
dtype: object

In [126]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Separate features and target
X = df.drop('Price_euros', axis=1)
y = df['Price_euros']

# Split train-test 80-20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print('MAE:', mean_absolute_error(y_test, y_pred))
print('RMSE:', mean_squared_error(y_test, y_pred))
print('R^2:', r2_score(y_test, y_pred))


MAE: 207.20914274656806
RMSE: 105316.32991113375
R^2: 0.7926485411568813


In [127]:
# Save the model

import pickle

# save model

with open('laptop_price_model.pkl','wb') as f:
    pickle.dump(model,f)
    
# save the dataframe columns
model_columns=list(X.columns)
with open('model_columns.pkl','wb') as f:
    pickle.dump(model_columns,f)