In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_excel("cleaned_smartprix_laptop3.xlsx")

# Pre Processing

### Encode Categorical Variables

#### Label Encoding: Convert categories to numeric labels if needed.
#### One-Hot Encoding: Convert categorical variables into a set of binary variables.

In [4]:
# Scale Numerical Features

# Standardization: Scale features to have a mean of 0 and standard deviation of 1.
# Normalization: Scale features to a range between 0 and 1.

In [5]:
df.columns

Index(['productName', 'price', 'rating', 'specScore', 'features', 'brand',
       'processor', 'storage', 'cores', 'ram', 'screen_size', 'Warranty', 'OS',
       'graphics', 'storage_type', 'ram_type', 'resolution',
       'graphics_category', 'resolution_category'],
      dtype='object')

In [6]:
#removing unneccasary cols

df = df[['price', 'rating', 'specScore','brand', 'processor', 'storage', 'ram', 'screen_size', 'Warranty', 'OS', 'storage_type', 'ram_type', 'graphics_category', 'resolution_category']]

In [7]:
df.head()

Unnamed: 0,price,rating,specScore,brand,processor,storage,ram,screen_size,Warranty,OS,storage_type,ram_type,graphics_category,resolution_category
0,69990,4.6,69,Asus,Intel Core i5,512,16,16.0,12,Windows 11,SSD,DDR4,NVIDIA,Widescreen (16:10)
1,74990,4.05,71,Asus,Intel Core i5,1024,16,16.0,12,Windows 11,SSD,DDR4,NVIDIA,Widescreen (16:10)
2,28660,4.2,62,Lenovo,AMD Ryzen 3,512,8,15.6,12,Windows 11,SSD,DDR4,Other/Unknown,Standard HD (16:9)
3,57990,4.75,61,HP,Intel Core i5,512,16,15.6,12,Windows 11,SSD,DDR4,NVIDIA,Standard HD (16:9)
4,24990,4.05,54,Acer,Intel Core i3,512,8,14.0,12,Windows 11,SSD,DDR4,Intel,Standard HD (16:9)


In [8]:
#when there is order in categories : apply label else apply ohe

# brand - ohe
# processor - label
# OS - ohe
# storage_type - label
# ram_type - label
# graphics_category - label
# resolution_category - label

- When dealing with categorical variables in machine learning, it's essential to choose the right encoding technique based on whether the categories have an inherent order (ordinal) or not (nominal). Here’s a step-by-step approach to handling this:

    1. Identify Ordinal vs. Nominal Categories
    Ordinal: Categories with a meaningful order but no fixed interval between them (e.g., low, medium, high).
    Nominal: Categories with no meaningful order (e.g., brand, color).

    2. Apply Label Encoding for Ordinal Data
    Label encoding assigns a unique integer to each category and is suitable for ordinal data because it retains the order.

    3. Apply One-Hot Encoding for Nominal Data
    One-hot encoding converts categorical variables into binary columns, which is suitable for nominal data because it does not impose any ordinal relationships.

### OHE for brand & OS since they dont have any order

In [9]:
# Columns to encode
columns_to_encode = ['brand', 'OS']

# Separate columns to encode from the rest
df_to_encode = df[columns_to_encode]
df_rest = df.drop(columns=columns_to_encode)

# Apply One-Hot Encoding
df_encoded = pd.get_dummies(df_to_encode)

# Combine encoded columns with the rest of the DataFrame
df_final = pd.concat([df_rest, df_encoded], axis=1)

In [10]:
df_final.head(2)

Unnamed: 0,price,rating,specScore,processor,storage,ram,screen_size,Warranty,storage_type,ram_type,...,OS_Mac,OS_Mac 10.15.3,OS_Mac Catalina,OS_No,OS_Ubuntu,OS_Windows,OS_Windows 10,OS_Windows 10.1,OS_Windows 11,OS_Windows 11.1
0,69990,4.6,69,Intel Core i5,512,16,16.0,12,SSD,DDR4,...,0,0,0,0,0,0,0,0,1,0
1,74990,4.05,71,Intel Core i5,1024,16,16.0,12,SSD,DDR4,...,0,0,0,0,0,0,0,0,1,0


In [11]:
df_final.shape

(1018, 38)

### Label encode for the rest

In [12]:
df_final['processor'].unique()

array(['Intel Core i5', 'AMD Ryzen 3', 'Intel Core i3', 'AMD Ryzen 7',
       'Intel Core i7', 'Apple M Series', 'Intel Core i9', 'AMD Ryzen 5',
       'Others', 'AMD Ryzen 9', 'Intel Core Ultra'], dtype=object)

In [13]:
# Define the ranking order for processors
rank_order = [
    'Intel Core i5',
    'AMD Ryzen 3',
    'Intel Core i3',
    'AMD Ryzen 7',
    'Intel Core i7',
    'Apple M Series',
    'Intel Core i9',
    'AMD Ryzen 5',
    'Others',
    'AMD Ryzen 9',
    'Intel Core Ultra'
]

# Create a mapping dictionary
processor_mapping = {processor: idx for idx, processor in enumerate(rank_order)}

# Apply the mapping to the 'processor' column
df_final['processor_encoded'] = df_final['processor'].map(processor_mapping)

In [14]:
df_final.sample(5)

Unnamed: 0,price,rating,specScore,processor,storage,ram,screen_size,Warranty,storage_type,ram_type,...,OS_Mac 10.15.3,OS_Mac Catalina,OS_No,OS_Ubuntu,OS_Windows,OS_Windows 10,OS_Windows 10.1,OS_Windows 11,OS_Windows 11.1,processor_encoded
70,80990,4.55,61,Intel Core i5,512,16,15.6,12,SSD,DDR4,...,0,0,0,0,0,0,0,1,0,0
44,19999,4.2,50,Intel Core i5,512,8,14.1,12,SSD,DDR4,...,0,0,0,0,0,0,0,1,0,0
587,58491,4.05,63,Intel Core i5,512,16,15.6,12,SSD,DDR4,...,0,0,0,0,0,0,0,1,0,0
485,65990,4.7,59,AMD Ryzen 5,1024,16,15.6,12,SSD,DDR4,...,0,0,0,0,0,0,0,1,0,7
632,58990,4.65,57,Intel Core i5,512,8,14.0,12,SSD,DDR4,...,0,0,0,0,0,0,0,1,0,0


In [15]:
df_final['storage_type'].unique()

array(['SSD', 'hard disk', 'HDD', 'UFS'], dtype=object)

In [16]:
# Define the ranking order for storage types
storage_rank_order = [
    'SSD',        # Highest rank
    'UFS',        # Second highest
    'HDD',        # Third highest
    'hard disk'   # Lowest rank
]

# Create the mapping dictionary
storage_mapping = {storage: idx for idx, storage in enumerate(storage_rank_order)}

# Apply the mapping to the 'storage_type' column
df_final['storage_type_encoded'] = df_final['storage_type'].map(storage_mapping)

In [17]:
df_final

Unnamed: 0,price,rating,specScore,processor,storage,ram,screen_size,Warranty,storage_type,ram_type,...,OS_Mac Catalina,OS_No,OS_Ubuntu,OS_Windows,OS_Windows 10,OS_Windows 10.1,OS_Windows 11,OS_Windows 11.1,processor_encoded,storage_type_encoded
0,69990,4.60,69,Intel Core i5,512,16,16.0,12,SSD,DDR4,...,0,0,0,0,0,0,1,0,0,0
1,74990,4.05,71,Intel Core i5,1024,16,16.0,12,SSD,DDR4,...,0,0,0,0,0,0,1,0,0,0
2,28660,4.20,62,AMD Ryzen 3,512,8,15.6,12,SSD,DDR4,...,0,0,0,0,0,0,1,0,1,0
3,57990,4.75,61,Intel Core i5,512,16,15.6,12,SSD,DDR4,...,0,0,0,0,0,0,1,0,0,0
4,24990,4.05,54,Intel Core i3,512,8,14.0,12,SSD,DDR4,...,0,0,0,0,0,0,1,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,149990,4.50,77,Intel Core i7,1024,16,16.0,12,SSD,DDR5,...,0,0,0,0,0,0,1,0,4,0
1014,58990,4.10,61,AMD Ryzen 5,1024,16,15.6,12,SSD,DDR4,...,0,0,0,0,0,0,1,0,7,0
1015,33999,4.30,53,Intel Core i3,256,8,15.6,12,SSD,DDR4,...,0,0,0,0,0,0,1,0,2,0
1016,31890,4.25,51,AMD Ryzen 3,512,8,15.6,12,SSD,LPDDR5,...,0,0,0,0,0,0,1,0,1,0


In [18]:
df_final['ram_type'].unique()

array(['DDR4', 'DDR5', 'LPDDR5', 'LPDDR5X', nan, 'LPDDR4X', 'DDR6',
       'DDR5 SD', 'LPDDR4', 'Unified Memory', 'LPDDRx4', 'DDR3', 'LPDDR3',
       '\u200eLPDDR5X'], dtype=object)

In [22]:
# Define the ranking order for RAM types
ram_rank_order = [
    'DDR4',
    'DDR5',
    'LPDDR5',
    'LPDDR5X',
    'LPDDR4X',
    'DDR6',
    'DDR5 SD',
    'LPDDR4',
    'Unified Memory',
    'LPDDRx4',
    'DDR3',
    'LPDDR3'
]

# Normalize ram_type by replacing special characters and trimming spaces
df_final['ram_type_normalized'] = df_final['ram_type'].astype(str).str.replace(r'\u200e', '', regex=True).str.strip()

# Handle missing values in ranking
ram_rank_order.append('nan')  # Add a rank for missing values

# Create the mapping dictionary
ram_mapping = {ram: idx for idx, ram in enumerate(ram_rank_order)}

# Apply the mapping to the normalized 'ram_type' column
df_final['ram_type_encoded'] = df_final['ram_type_normalized'].map(lambda x: ram_mapping.get(x, ram_mapping['nan']))

In [23]:
df_final.sample(4)

Unnamed: 0,price,rating,specScore,processor,storage,ram,screen_size,Warranty,storage_type,ram_type,...,OS_Ubuntu,OS_Windows,OS_Windows 10,OS_Windows 10.1,OS_Windows 11,OS_Windows 11.1,processor_encoded,storage_type_encoded,ram_type_normalized,ram_type_encoded
651,87990,4.7,61,Intel Core i7,256,8,13.3,12,SSD,DDR4,...,0,0,1,0,0,0,4,0,DDR4,0
932,107990,4.1,63,Intel Core Ultra,1024,16,14.0,12,SSD,LPDDR5X,...,0,0,0,0,1,0,10,0,LPDDR5X,3
730,54990,4.05,62,Intel Core i5,512,16,16.0,12,SSD,DDR4,...,0,0,0,0,1,0,0,0,DDR4,0
88,109670,4.25,47,AMD Ryzen 9,1024,32,14.0,12,SSD,,...,0,0,0,0,1,0,9,0,,12


In [24]:
df_final['graphics_category'].unique()

array(['NVIDIA', 'Other/Unknown', 'Intel', 'Qualcomm and ARM'],
      dtype=object)

In [25]:
# Define the ranking order for graphics categories
graphics_rank_order = [
    'NVIDIA',
    'Intel',
    'Qualcomm and ARM',
    'Other/Unknown'
]

# Create the mapping dictionary
graphics_mapping = {category: idx for idx, category in enumerate(graphics_rank_order)}

# Apply the mapping to the 'graphics_category' column
df_final['graphics_category_encoded'] = df_final['graphics_category'].map(lambda x: graphics_mapping.get(x, -1))

In [26]:
df_final.sample(4)

Unnamed: 0,price,rating,specScore,processor,storage,ram,screen_size,Warranty,storage_type,ram_type,...,OS_Windows,OS_Windows 10,OS_Windows 10.1,OS_Windows 11,OS_Windows 11.1,processor_encoded,storage_type_encoded,ram_type_normalized,ram_type_encoded,graphics_category_encoded
101,62849,4.35,70,Intel Core i7,512,16,15.6,24,SSD,DDR4,...,0,0,0,1,0,4,0,DDR4,0,0
883,102999,4.75,68,Intel Core Ultra,512,16,14.0,12,SSD,LPDDR5,...,0,0,0,1,0,10,0,LPDDR5,2,1
78,84990,4.25,73,AMD Ryzen 7,1024,16,15.6,24,SSD,DDR4,...,0,0,0,1,0,3,0,DDR4,0,0
701,180990,4.45,53,Others,1024,16,13.0,12,SSD,LPDDR5X,...,0,0,0,1,0,8,0,LPDDR5X,3,2


In [27]:
df_final['resolution_category'].unique()

array(['Widescreen (16:10)', 'Standard HD (16:9)', 'Quad HD or higher',
       'Widescreen or Custom', 'Other'], dtype=object)

In [28]:
# Define the ranking order for resolution categories
resolution_rank_order = [
    'Widescreen (16:10)',
    'Standard HD (16:9)',
    'Quad HD or higher',
    'Widescreen or Custom',
    'Other'
]

# Create the mapping dictionary
resolution_mapping = {category: idx for idx, category in enumerate(resolution_rank_order)}

# Apply the mapping to the 'resolution_category' column
df_final['resolution_category_encoded'] = df_final['resolution_category'].map(lambda x: resolution_mapping.get(x, -1))

In [29]:
df_final.sample(5)

Unnamed: 0,price,rating,specScore,processor,storage,ram,screen_size,Warranty,storage_type,ram_type,...,OS_Windows 10,OS_Windows 10.1,OS_Windows 11,OS_Windows 11.1,processor_encoded,storage_type_encoded,ram_type_normalized,ram_type_encoded,graphics_category_encoded,resolution_category_encoded
918,50750,4.25,59,Intel Core i7,512,16,15.6,12,SSD,DDR4,...,0,0,1,0,4,0,DDR4,0,1,1
446,50690,4.45,57,Intel Core i5,512,16,15.6,12,SSD,DDR4,...,0,0,1,0,0,0,DDR4,0,1,1
919,157990,4.75,80,Intel Core Ultra,1024,32,16.0,24,SSD,LPDDR5,...,0,0,1,0,10,0,LPDDR5,2,0,2
841,198490,4.65,86,Intel Core i9,1024,32,16.0,12,SSD,DDR5,...,0,0,1,0,6,0,DDR5,1,0,2
367,152990,4.05,63,Intel Core Ultra,1024,16,15.6,12,SSD,LPDDR5X,...,0,0,1,0,10,0,LPDDR5X,3,1,2


In [30]:
df_final.columns

Index(['price', 'rating', 'specScore', 'processor', 'storage', 'ram',
       'screen_size', 'Warranty', 'storage_type', 'ram_type',
       'graphics_category', 'resolution_category', 'brand_Acer', 'brand_Apple',
       'brand_Asus', 'brand_Dell', 'brand_HP', 'brand_Infinix', 'brand_Lenovo',
       'brand_MSI', 'brand_Microsoft', 'brand_Other', 'brand_Samsung',
       'OS_Android 11 ', 'OS_Chrome ', 'OS_Chrome  ', 'OS_D ', 'OS_Linux ',
       'OS_Mac ', 'OS_Mac 10.15.3 ', 'OS_Mac Catalina ', 'OS_No ',
       'OS_Ubuntu ', 'OS_Windows ', 'OS_Windows 10 ', 'OS_Windows 10  ',
       'OS_Windows 11 ', 'OS_Windows 11  ', 'processor_encoded',
       'storage_type_encoded', 'ram_type_normalized', 'ram_type_encoded',
       'graphics_category_encoded', 'resolution_category_encoded'],
      dtype='object')

### drop orginal cols that have been encoded

In [31]:
columns_to_drop = [
    'processor', 'storage_type', 'ram_type', 'graphics_category', 'resolution_category'
] + [col for col in df_final.columns if col.startswith('OS_')]

df_final_cleaned = df_final.drop(columns=columns_to_drop)

In [32]:
df_final_cleaned.columns

Index(['price', 'rating', 'specScore', 'storage', 'ram', 'screen_size',
       'Warranty', 'brand_Acer', 'brand_Apple', 'brand_Asus', 'brand_Dell',
       'brand_HP', 'brand_Infinix', 'brand_Lenovo', 'brand_MSI',
       'brand_Microsoft', 'brand_Other', 'brand_Samsung', 'processor_encoded',
       'storage_type_encoded', 'ram_type_normalized', 'ram_type_encoded',
       'graphics_category_encoded', 'resolution_category_encoded'],
      dtype='object')

In [34]:
df_final_cleaned = df_final_cleaned.drop(columns=['ram_type_normalized'])

### Feature Scaling

In [35]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Select numerical features to scale
numerical_features = ['price', 'rating', 'specScore', 'storage', 'ram', 'screen_size', 'Warranty', 'processor_encoded', 'storage_type_encoded', 'ram_type_encoded', 'graphics_category_encoded', 'resolution_category_encoded']

# Scale the numerical features
df_final_cleaned[numerical_features] = scaler.fit_transform(df_final_cleaned[numerical_features])

In [36]:
df_final_cleaned

Unnamed: 0,price,rating,specScore,storage,ram,screen_size,Warranty,brand_Acer,brand_Apple,brand_Asus,...,brand_Lenovo,brand_MSI,brand_Microsoft,brand_Other,brand_Samsung,processor_encoded,storage_type_encoded,ram_type_encoded,graphics_category_encoded,resolution_category_encoded
0,-0.249136,0.953193,0.702534,-0.449944,0.094316,0.852221,-0.2673,0,0,1,...,0,0,0,0,0,-1.147358,-0.146382,-0.631853,-1.006425,-1.536546
1,-0.172194,-1.383977,0.881642,1.020794,0.094316,0.852221,-0.2673,0,0,1,...,0,0,0,0,0,-1.147358,-0.146382,-0.631853,-1.006425,-1.536546
2,-0.885139,-0.746567,0.075655,-0.449944,-0.877972,0.446511,-0.2673,0,0,0,...,1,0,0,0,0,-0.858960,-0.146382,-0.631853,1.980574,-0.291161
3,-0.433797,1.590603,-0.013899,-0.449944,0.094316,0.446511,-0.2673,0,0,0,...,0,0,0,0,0,-1.147358,-0.146382,-0.631853,-1.006425,-0.291161
4,-0.941614,-1.383977,-0.640779,-0.449944,-0.877972,-1.176331,-0.2673,1,0,0,...,0,0,0,0,0,-0.570563,-0.146382,-0.631853,-0.010759,-0.291161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,0.981935,0.528253,1.418967,1.020794,0.094316,0.852221,-0.2673,0,0,0,...,0,1,0,0,0,0.006233,-0.146382,-0.265551,-1.006425,0.954224
1014,-0.418409,-1.171507,-0.013899,1.020794,0.094316,0.446511,-0.2673,0,0,1,...,0,0,0,0,0,0.871425,-0.146382,-0.631853,-0.010759,-0.291161
1015,-0.802980,-0.321627,-0.730333,-1.185313,-0.877972,0.446511,-0.2673,0,0,0,...,1,0,0,0,0,-0.570563,-0.146382,-0.631853,-0.010759,-0.291161
1016,-0.835434,-0.534097,-0.909441,-0.449944,-0.877972,0.446511,-0.2673,0,0,0,...,1,0,0,0,0,-0.858960,-0.146382,0.100751,1.980574,-0.291161


## Split data

In [37]:
from sklearn.model_selection import train_test_split

# Define features and target variable
X = df_final_cleaned.drop('price', axis=1)  # Assuming 'price' is the target variable
y = df_final_cleaned['price']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model building

In [38]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_regression_models(X, y, degree=2):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize a dictionary to store results
    results = {}

    # 1. Linear Regression
    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)
    y_pred = lin_reg.predict(X_test)
    results['Linear Regression'] = {
        'MSE': mean_squared_error(y_test, y_pred),
        'R^2': r2_score(y_test, y_pred)
    }
    
    # 2. Gradient Descent (SGDRegressor)
    sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)
    sgd_reg.fit(X_train, y_train)
    y_pred = sgd_reg.predict(X_test)
    results['Gradient Descent'] = {
        'MSE': mean_squared_error(y_test, y_pred),
        'R^2': r2_score(y_test, y_pred)
    }

    # 3. Ridge Regression
    ridge_reg = Ridge(alpha=1.0)
    ridge_reg.fit(X_train, y_train)
    y_pred = ridge_reg.predict(X_test)
    results['Ridge Regression'] = {
        'MSE': mean_squared_error(y_test, y_pred),
        'R^2': r2_score(y_test, y_pred)
    }

    # 4. Lasso Regression
    lasso_reg = Lasso(alpha=0.1)
    lasso_reg.fit(X_train, y_train)
    y_pred = lasso_reg.predict(X_test)
    results['Lasso Regression'] = {
        'MSE': mean_squared_error(y_test, y_pred),
        'R^2': r2_score(y_test, y_pred)
    }

    # 5. Polynomial Regression
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X_train)
    poly_reg = LinearRegression()
    poly_reg.fit(X_poly, y_train)
    X_test_poly = poly.transform(X_test)
    y_pred = poly_reg.predict(X_test_poly)
    results['Polynomial Regression'] = {
        'MSE': mean_squared_error(y_test, y_pred),
        'R^2': r2_score(y_test, y_pred)
    }
    
    # Print Results
    for model, metrics in results.items():
        print(f"{model} - MSE: {metrics['MSE']:.4f}, R^2: {metrics['R^2']:.4f}")

    # Optionally: Visualize Polynomial Regression
    if degree == 2 and X.shape[1] == 1:  # Only for 1D features
        plt.figure(figsize=(10, 6))
        plt.scatter(X, y, color='blue', label='Data')
        plt.plot(X_test, y_pred, color='red', label='Polynomial Fit')
        plt.title('Polynomial Regression Fit')
        plt.xlabel('Feature')
        plt.ylabel('Target')
        plt.legend()
        plt.show()

In [40]:
X = df_final_cleaned.drop('price', axis=1)
y = df_final_cleaned['price']
evaluate_regression_models(X, y, degree=5)

Linear Regression - MSE: 0.1781, R^2: 0.8085
Gradient Descent - MSE: 0.2172, R^2: 0.7666
Ridge Regression - MSE: 0.1792, R^2: 0.8074
Lasso Regression - MSE: 0.3081, R^2: 0.6688
Polynomial Regression - MSE: 291.0227, R^2: -311.8270


### points to understand

1. understand linear & gd flow
2. understand ridge & lasso hyper parameter
3. understand polynomial regression flow
4. check options for feature selection