## Uncleaned Laptop Price dataset

In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
df = pd.read_csv('laptopData.csv')

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1273 non-null   float64
 1   Company           1273 non-null   object 
 2   TypeName          1273 non-null   object 
 3   Inches            1273 non-null   object 
 4   ScreenResolution  1273 non-null   object 
 5   Cpu               1273 non-null   object 
 6   Ram               1273 non-null   object 
 7   Memory            1273 non-null   object 
 8   Gpu               1273 non-null   object 
 9   OpSys             1273 non-null   object 
 10  Weight            1273 non-null   object 
 11  Price             1273 non-null   float64
dtypes: float64(2), object(10)
memory usage: 122.3+ KB
None


In [5]:
print(df.head())

   Unnamed: 0 Company   TypeName Inches                    ScreenResolution  \
0         0.0   Apple  Ultrabook   13.3  IPS Panel Retina Display 2560x1600   
1         1.0   Apple  Ultrabook   13.3                            1440x900   
2         2.0      HP   Notebook   15.6                   Full HD 1920x1080   
3         3.0   Apple  Ultrabook   15.4  IPS Panel Retina Display 2880x1800   
4         4.0   Apple  Ultrabook   13.3  IPS Panel Retina Display 2560x1600   

                          Cpu   Ram               Memory  \
0        Intel Core i5 2.3GHz   8GB            128GB SSD   
1        Intel Core i5 1.8GHz   8GB  128GB Flash Storage   
2  Intel Core i5 7200U 2.5GHz   8GB            256GB SSD   
3        Intel Core i7 2.7GHz  16GB            512GB SSD   
4        Intel Core i5 3.1GHz   8GB            256GB SSD   

                            Gpu  OpSys  Weight        Price  
0  Intel Iris Plus Graphics 640  macOS  1.37kg   71378.6832  
1        Intel HD Graphics 6000  macOS  

In [6]:
print(df.isnull().sum())

Unnamed: 0          30
Company             30
TypeName            30
Inches              30
ScreenResolution    30
Cpu                 30
Ram                 30
Memory              30
Gpu                 30
OpSys               30
Weight              30
Price               30
dtype: int64


In [7]:
print(df.describe())

        Unnamed: 0          Price
count  1273.000000    1273.000000
mean    652.674784   59955.814073
std     376.493027   37332.251005
min       0.000000    9270.720000
25%     327.000000   31914.720000
50%     652.000000   52161.120000
75%     980.000000   79333.387200
max    1302.000000  324954.720000


In [8]:
print(df.dtypes)

Unnamed: 0          float64
Company              object
TypeName             object
Inches               object
ScreenResolution     object
Cpu                  object
Ram                  object
Memory               object
Gpu                  object
OpSys                object
Weight               object
Price               float64
dtype: object


## Remove/Handle Missing Values

In [9]:
print("Missing values before cleaning:")
print(df.isnull().sum())

Missing values before cleaning:
Unnamed: 0          30
Company             30
TypeName            30
Inches              30
ScreenResolution    30
Cpu                 30
Ram                 30
Memory              30
Gpu                 30
OpSys               30
Weight              30
Price               30
dtype: int64


In [10]:
missingT = 0.7 * len(df)
df = df.dropna(thresh=missingT, axis=1)

In [11]:
numericalC = df.select_dtypes(include=['int64', 'float64']).columns
for col in numericalC:
    df[col].fillna(df[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [12]:
categoricalC = df.select_dtypes(include=['object']).columns
for col in categoricalC:
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [13]:
print("\nMissing values after cleaning:")
print(df.isnull().sum())


Missing values after cleaning:
Unnamed: 0          0
Company             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price               0
dtype: int64


## Remove Duplicated Values

In [14]:
print(f"Before removing duplicates: {df.shape}")

Before removing duplicates: (1303, 12)


In [15]:
df.drop_duplicates(inplace=True)

In [18]:
df.sort_values('TypeName', inplace=True)
df.drop_duplicates(subset=['TypeName', 'Price'], keep='first', inplace=True)

In [19]:
print(f"After removing duplicates: {df.shape}")

After removing duplicates: (934, 12)


## Drop Unnecessary Columns

In [20]:
print(df.columns)

Index(['Unnamed: 0', 'Company', 'TypeName', 'Inches', 'ScreenResolution',
       'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price'],
      dtype='object')


In [21]:
columnsDrop = ['Unnamed: 0', 'Company']
df.drop(columns=[col for col in columnsDrop if col in df.columns], inplace=True)

In [22]:
print(df.columns)

Index(['TypeName', 'Inches', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu',
       'OpSys', 'Weight', 'Price'],
      dtype='object')


## Encoding Categorical Variables

In [30]:
label_encoder = LabelEncoder()
ordinal_cols = ['Memory']  
for col in ordinal_cols:
    df[col+'_encoded'] = label_encoder.fit_transform(df[col])

In [31]:
nominal_cols = ['TypeName', 'Ram', 'Inches']
df = pd.get_dummies(df, columns=nominal_cols, drop_first=True)

In [32]:
print("\nEncoded columns:")
print(df.filter(regex='_encoded|_').columns)


Encoded columns:
Index(['Memory_encoded', 'TypeName_Gaming', 'TypeName_Netbook',
       'TypeName_Notebook', 'TypeName_Ultrabook', 'TypeName_Workstation',
       'Ram_16GB', 'Ram_24GB', 'Ram_2GB', 'Ram_32GB', 'Ram_4GB', 'Ram_64GB',
       'Ram_6GB', 'Ram_8GB', 'Inches_11.6', 'Inches_12', 'Inches_12.3',
       'Inches_12.5', 'Inches_13.3', 'Inches_13.5', 'Inches_13.9', 'Inches_14',
       'Inches_15', 'Inches_15.4', 'Inches_15.6', 'Inches_17', 'Inches_17.3',
       'Inches_18.4', 'Inches_24', 'Inches_25.6', 'Inches_31.6', 'Inches_35.6',
       'Inches_?'],
      dtype='object')


## Scaling Numerical Values

In [48]:
numerical_cols = ['Price', 'Weight']  

In [49]:
df['Weight'] = df['Weight'].str.replace('kg', '').astype(float)

ValueError: could not convert string to float: '?'

In [50]:
print("Unique values in Weight column before cleaning:")
print(df['Weight'].unique())

Unique values in Weight column before cleaning:
['1.28kg' '1.38kg' '1.71kg' '1.8kg' '1.3kg' '2.08kg' '0.69kg' '1.63kg'
 '1.4kg' '1.5kg' '2.19kg' '1.62kg' '1.22kg' '1.29kg' '2.26kg' '1.25kg'
 '1.45kg' '1.2kg' '1.42kg' '1.37kg' '2.77kg' '1.1kg' '1.55kg' '1.34kg'
 '1.39kg' '2.2kg' '1.24kg' '1.58kg' '1.36kg' '1.48kg' '1.27kg' '1.56kg'
 '1.74kg' '1.6kg' '1.16kg' '2.67kg' '2kg' '2.1kg' '2.3kg' '2.09kg' '2.8kg'
 '3kg' '2.9kg' '2.7kg' '2.59kg' '2.45kg' '2.34kg' '3.31kg' '2.43kg'
 '1.91kg' '2.4kg' '2.62kg' '2.65kg' '4.4kg' '3.2kg' '1.95kg' '2.56kg'
 '3.35kg' '4.42kg' '4.6kg' '4.14kg' '4.5kg' '3.58kg' '3.49kg' '2.5kg'
 '4.7kg' '3.6kg' '4.3kg' '3.78kg' '1.7kg' '4.2kg' '4.0kg' '3.52kg' '2.6kg'
 '2.591kg' '2.94kg' '2.73kg' '4kg' '3.25kg' '3.21kg' '4.36kg' '4.33kg'
 '1.99kg' '2.24kg' '1.26kg' '1.17kg' '0.97kg' '1.35kg' '1.59kg' '1.23kg'
 '1.86kg' '1.44kg' '1.49kg' '1.89kg' '2.71kg' '1.9kg' '2.13kg' '1.96kg'
 '2.02kg' '1.65kg' '2.0kg' '2.06kg' '1.90kg' '1.85kg' '2.03kg' '2.16kg'
 '5.4kg' '2.33kg' '2.

In [52]:
medianWeight = df.loc[df['Weight'] != '?', 'Weight'].str.replace('kg', '').astype(float).median()
df['Weight'] = df['Weight'].replace('?', f'{medianWeight}kg')

In [53]:
df['Weight'] = df['Weight'].str.replace('kg', '').astype(float)

In [54]:
print("\nWeight column after cleaning:")
print(df['Weight'].describe())


Weight column after cleaning:
count    934.000000
mean       2.072695
std        0.850494
min        0.000200
25%        1.482500
50%        2.000000
75%        2.310000
max       11.100000
Name: Weight, dtype: float64


In [58]:
X = pd.get_dummies(df.drop('Price', axis=1), drop_first=True)
y = df['Price']

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("--- Linear Regression Results ---")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

--- Linear Regression Results ---
Mean Squared Error (MSE): 316641571.51
R-squared (R2): 0.79
