# Laptop Price Prediction — End-to-End

**Author:** Kartik

This notebook contains: EDA, Cleaning, Feature Engineering, Modeling (sklearn), evaluation, and saved artifacts.

Run the notebook top to bottom. If a package is missing, install it by running `%pip install <package>` in a cell.


In [None]:

# Imports (run once)
import warnings
warnings.filterwarnings('ignore')

import os, re, math, json
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from pprint import pprint

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import joblib
print('Libraries loaded.')


In [None]:

# Paths
# Every input and output lives under DATA_DIR. It defaults to '/mnt/data'
# and can be redirected by setting the LAPTOP_DATA_DIR environment variable
# before the kernel starts, so the notebook is not tied to a single machine.
DATA_DIR = os.environ.get('LAPTOP_DATA_DIR', '/mnt/data')

RAW_CSV = f'{DATA_DIR}/laptop_price - dataset.csv'       # input: raw dataset
CLEANED_CSV = f'{DATA_DIR}/laptop_price.cleaned.csv'     # output: cleaned table
RESULTS_CSV = f'{DATA_DIR}/model_results.csv'            # output: model comparison
PLOTS_DIR = f'{DATA_DIR}/plots'                          # output: saved figures
MODEL_OUT = f'{DATA_DIR}/best_model_pipeline.joblib'     # output: fitted pipeline

# Create the plots folder up front so later savefig calls cannot fail on a
# missing directory.
os.makedirs(PLOTS_DIR, exist_ok=True)
print('Paths set.')


In [None]:

# Read dataset
def safe_read_csv(path):
    """Read a CSV file, trying several text encodings in turn.

    The encodings utf-8, utf-8-sig and latin-1 are attempted in that order.
    Only decoding or parsing failures move on to the next candidate; any
    other error (for example a missing file) is raised immediately instead
    of being retried and hidden.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    for enc in ('utf-8', 'utf-8-sig', 'latin-1'):
        try:
            return pd.read_csv(path, encoding=enc)
        except (UnicodeError, pd.errors.ParserError):
            # Wrong encoding guess or a tokenising failure: try the next one.
            continue
    # Last resort: let pandas' Python parsing engine have a go.
    return pd.read_csv(path, engine='python')

# Load the raw table, then report its dimensions and preview the first 8 rows.
df = safe_read_csv(RAW_CSV)
print('Shape:', (len(df.index), len(df.columns)))
display(df.iloc[:8])


## EDA
Inspect columns, missing values, and summary statistics.

In [None]:

# Column inventory
print('Columns:', list(df.columns))

# The 20 columns with the most missing values, worst first
missing_counts = df.isna().sum()
display(missing_counts.sort_values(ascending=False).iloc[:20])

# Summary statistics for every column, one row per column
summary_stats = df.describe(include='all')
display(summary_stats.transpose())


## Cleaning & Feature Engineering

In [None]:

# Helper funcs
import numpy as np, re
def extract_resolution(x):
    """Pull the pixel dimensions out of a screen-resolution description.

    The first ``<3-4 digits> x <3-4 digits>`` pattern found in the text is
    used (the separator may be ``x`` or ``X``, with optional spaces).

    Returns
    -------
    tuple
        ``(width, height)`` as ints, or ``(nan, nan)`` when the value is
        missing or contains no such pattern.
    """
    missing = (np.nan, np.nan)
    if pd.isna(x):
        return missing
    found = re.search(r'(\d{3,4})\s*[xX]\s*(\d{3,4})', str(x))
    if found is None:
        return missing
    return tuple(int(group) for group in found.groups())

def parse_storage(mem):
    """Split a free-text storage description into per-type capacities in GB.

    The text is split on ``+`` into individual drives (e.g.
    ``'128GB SSD +  1TB HDD'``). For each drive the capacity is read from the
    first ``<number>TB`` match, otherwise the first ``<number>GB`` match, and
    added to the bucket of the first matching keyword (SSD, SSHD, Hybrid,
    Flash, HDD). A drive with no recognised keyword is counted as HDD.

    Decimal capacities such as ``'1.0TB'`` are supported; an integer-only
    pattern would read that as ``0TB`` and report 0 GB.

    Parameters
    ----------
    mem : str or missing
        Storage description; a missing value yields all zeros.

    Returns
    -------
    dict
        Keys ``SSD_GB``, ``HDD_GB``, ``Flash_GB``, ``Hybrid_GB``, ``SSHD_GB``
        mapped to the summed capacity in GB (1 TB = 1024 GB). Values are ints
        whenever the capacity is a whole number of GB.
    """
    totals = {'SSD_GB': 0, 'HDD_GB': 0, 'Flash_GB': 0, 'Hybrid_GB': 0, 'SSHD_GB': 0}
    if pd.isna(mem):
        return totals

    number = r'(\d+(?:\.\d+)?)'
    # Checked in order; the first keyword found in a part decides its bucket.
    keyword_buckets = (
        ('SSD', 'SSD_GB'),
        ('SSHD', 'SSHD_GB'),
        ('Hybrid', 'Hybrid_GB'),
        ('Flash', 'Flash_GB'),
        ('HDD', 'HDD_GB'),
    )

    for part in re.split(r'\s*\+\s*', str(mem)):
        # TB takes precedence over GB, as in "1TB".
        tb_match = re.search(number + r'\s*TB', part, flags=re.I)
        if tb_match:
            capacity = float(tb_match.group(1)) * 1024
        else:
            gb_match = re.search(number + r'\s*GB', part, flags=re.I)
            capacity = float(gb_match.group(1)) if gb_match else 0.0
        # Keep whole-GB capacities as ints so the output columns stay integral.
        size = int(capacity) if capacity.is_integer() else capacity

        bucket = 'HDD_GB'  # unlabeled storage is counted as HDD
        for keyword, key in keyword_buckets:
            if re.search(keyword, part, flags=re.I):
                bucket = key
                break
        totals[bucket] += size
    return totals

def compute_ppi(w,h,inch):
    """Pixels per inch: diagonal resolution in pixels divided by diagonal size.

    Parameters
    ----------
    w, h : number
        Horizontal and vertical resolution in pixels.
    inch : number
        Screen diagonal in inches.

    Returns
    -------
    float
        ``sqrt(w**2 + h**2) / inch``, or ``nan`` when any input is missing,
        the diagonal is zero, or the inputs are not numeric.
    """
    try:
        if np.any(pd.isna([w, h, inch])) or inch == 0:
            return np.nan
        return ((w**2 + h**2)**0.5) / inch
    except (TypeError, ValueError, ArithmeticError):
        # Non-numeric or otherwise unusable inputs are treated as missing.
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        return np.nan

# rename columns
# Step 1: tidy the raw headers - trim whitespace, flatten embedded newlines
# and drop the parentheses around the '(GB)' / '(kg)' unit suffixes.
df.columns = [c.strip().replace('\n',' ').replace('(GB)','GB').replace('(kg)','kg') for c in df.columns]

# Step 2: map the tidied headers onto the canonical names used by the rest of
# the notebook. The lookup key ignores parentheses, case and repeated spaces,
# so it matches both 'RAM (GB)' and the 'RAM GB' form produced by step 1.
# (Comparing the tidied header against 'ram (gb)' / 'weight (kg)' can never
# match, which would leave RAM_GB and Weight_kg undefined.)
canonical_names = {
    'ram gb': 'RAM_GB',
    'price euro': 'Price_Euro',
    'weight kg': 'Weight_kg',
}
rename_map = {}
for c in df.columns:
    key = ' '.join(re.sub(r'[()]', ' ', c).lower().split())
    if key in canonical_names:
        rename_map[c] = canonical_names[key]
    elif c.lower().strip().startswith('cpu_frequency'):
        rename_map[c] = 'CPU_Frequency'
df = df.rename(columns=rename_map)

# drop duplicates
df = df.drop_duplicates().reset_index(drop=True)

# resolution parsing
# Built as two explicit lists rather than zip(*...), which cannot be unpacked
# into two columns when the frame has no rows.
if 'ScreenResolution' in df.columns:
    resolution_pairs = [extract_resolution(v) for v in df['ScreenResolution']]
    df['ResWidth'] = [pair[0] for pair in resolution_pairs]
    df['ResHeight'] = [pair[1] for pair in resolution_pairs]
else:
    df['ResWidth'] = df['ResHeight'] = np.nan

# PPI
# Without a screen-size column PPI is undefined for every row; zipping against
# an empty placeholder would instead produce a list of the wrong length.
if 'Inches' in df.columns:
    df['PPI'] = [compute_ppi(w, h, inch)
                 for w, h, inch in zip(df['ResWidth'], df['ResHeight'], df['Inches'])]
else:
    df['PPI'] = np.nan

# Memory parse
if 'Memory' in df.columns:
    # One dict per row -> one DataFrame in a single construction step.
    mem_df = pd.DataFrame(df['Memory'].map(parse_storage).tolist(), index=df.index)
    # Remove storage columns left over from an earlier run of this cell so
    # re-running it does not create duplicate column names.
    df = pd.concat([df.drop(columns=mem_df.columns, errors='ignore'), mem_df], axis=1)
# numeric parse helper
def parse_first_number(x):
    """Return the first non-negative decimal number found in ``x`` as a float.

    The value is converted with ``str`` and searched for ``digits``,
    ``digits.digits`` or ``.digits``. Stray dots and version-like strings
    cannot raise: ``'approx. 2.5 kg'`` gives 2.5 and ``'1.2.3'`` gives 1.2.

    Parameters
    ----------
    x : any
        Raw cell value such as ``'8GB'``, ``'1.37kg'`` or a number.

    Returns
    -------
    float
        The parsed number, or ``nan`` if ``x`` is missing or has no digits.
    """
    if pd.isna(x):
        return np.nan
    m = re.search(r'\d+(?:\.\d+)?|\.\d+', str(x))
    return float(m.group(0)) if m else np.nan

# Coerce text-valued measurement columns (e.g. '8GB', '1.37kg') to floats.
for col in ['RAM_GB','Weight_kg','CPU_Frequency']:
    if col in df.columns:
        df[col] = df[col].apply(parse_first_number)

# clip outliers
num_cols = [c for c in ['Inches','CPU_Frequency','RAM_GB','Weight_kg','Price_Euro','PPI','ResWidth','ResHeight',
                        'SSD_GB','HDD_GB','Flash_GB','Hybrid_GB','SSHD_GB'] if c in df.columns]

# The target is left untouched: clipping prices would change the very values
# the models are later scored against.
# NOTE: the bounds come from the full table, i.e. before the train/test split
# performed in the modeling cell.
clip_cols = [c for c in num_cols if c != 'Price_Euro']
for c in clip_cols:
    lo, hi = df[c].quantile([0.01,0.99]).values
    # Skip degenerate bounds. When the 1st and 99th percentiles coincide
    # (a column that is almost always 0, or all-NaN), clipping would flatten
    # the whole column to a single value and erase the rare non-zero rows.
    if not lo < hi:
        continue
    df[c] = df[c].clip(lo, hi)

# save cleaned
df.to_csv(CLEANED_CSV, index=False)
print('Saved cleaned to', CLEANED_CSV)
display(df.head())


## Plots

In [None]:

# save simple histograms and heatmap
# Numeric columns that actually vary (constant columns carry no information
# for a histogram or a correlation matrix).
numeric_cols = [c for c in df.select_dtypes(include=[np.number]).columns if df[c].nunique()>1]

def save_hist(col):
    """Save a 30-bin histogram of ``df[col]`` as a PNG under ``PLOTS_DIR``.

    Parameters
    ----------
    col : str
        Name of the column to plot; missing values are dropped first.

    Returns
    -------
    str or None
        Path of the written image, or ``None`` if plotting or saving failed
        (the reason is printed).
    """
    # A dedicated figure per call, always closed in `finally`, so a failure
    # cannot leave a half-drawn figure for the next plot to draw on.
    fig, ax = plt.subplots()
    try:
        df[col].dropna().plot(kind='hist', bins=30, title=f'Distribution: {col}', ax=ax)
        path = os.path.join(PLOTS_DIR, f'hist_{col}.png')
        fig.tight_layout()
        fig.savefig(path, dpi=150)
        return path
    except Exception as e:
        # Best effort: report the problem and let the caller carry on.
        print(f'Histogram for {col!r} failed:', e)
        return None
    finally:
        plt.close(fig)

# Histograms: the price first, then up to six other numeric columns.
plot_paths = []
hist_cols = ['Price_Euro'] + [x for x in numeric_cols if x!='Price_Euro'][:6]
for c in hist_cols:
    p = save_hist(c)
    if p: plot_paths.append(p)

# Correlation heatmap of the numeric columns. The figure is created with the
# explicit fig/ax interface and closed in `finally`, so a failure half-way
# through does not leave an open figure behind.
fig = None
try:
    corr = df[numeric_cols].corr(numeric_only=True)
    fig, ax = plt.subplots(figsize=(8,6))
    im = ax.imshow(corr, aspect='auto')
    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    tick_positions = list(range(len(corr)))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(corr.columns, rotation=90)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(corr.columns)
    path = os.path.join(PLOTS_DIR,'corr_heatmap.png')
    fig.tight_layout()
    fig.savefig(path, dpi=150)
    plot_paths.append(path)
except Exception as e:
    print('Heatmap failed:', e)
finally:
    if fig is not None:
        plt.close(fig)

print('Saved plots:', plot_paths)


## Modeling

In [None]:

# Modeling pipeline and training
# Target: the first column whose name contains 'price' (case-insensitive).
price_cols = [c for c in df.columns if re.search(r'price', c, flags=re.I)]
assert len(price_cols)>=1, 'No price column found.'
TARGET = price_cols[0]

# Rows without a price can be neither trained on nor scored; leaving them in
# makes every estimator fail on NaN in y.
model_df = df.dropna(subset=[TARGET])
n_missing_target = len(df) - len(model_df)
if n_missing_target:
    print(f'Dropped {n_missing_target} rows with missing {TARGET}.')

X = model_df.drop(columns=[TARGET])
y = model_df[TARGET]

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature lists are derived from the training frame's dtypes: numeric columns
# get imputed and scaled, everything else is treated as categorical.
num_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

print('num:', num_features)
print('cat sample:', cat_features[:8])

# Numeric features: median imputation, then standardisation.
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])

# Categorical features: mode imputation, then dense one-hot encoding.
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output` and 1.4 removed the old name, so try the new spelling first
# and fall back for older installations.
try:
    onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot = OneHotEncoder(handle_unknown='ignore', sparse=False)
cat_pipeline = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', onehot)])

preproc = ColumnTransformer([('num', num_pipeline, num_features), ('cat', cat_pipeline, cat_features)], remainder='drop')

# Candidate estimators with their hyper-parameter grids. Grid keys carry the
# 'model__' prefix because each estimator is the 'model' step of a pipeline.
models = {
    'LinearRegression': (LinearRegression(), {}),
    'DecisionTree': (DecisionTreeRegressor(random_state=42), {'model__max_depth':[4,8,12,None]}),
    'RandomForest': (RandomForestRegressor(random_state=42, n_jobs=-1), {'model__n_estimators':[100], 'model__max_depth':[None,12]}),
    'GradientBoosting': (GradientBoostingRegressor(random_state=42), {'model__n_estimators':[100], 'model__max_depth':[2,3], 'model__learning_rate':[0.05,0.1]})
}

results = []
best_models = {}
for name, (est, grid) in models.items():
    print('\nTraining', name)
    pipe = Pipeline([('preprocess', preproc), ('model', est)])
    # Every model goes through the same 4-fold search. An empty grid is a
    # valid single-candidate search, so grid-less models (LinearRegression)
    # also get a cross-validated R2 that can be compared with the others.
    gs = GridSearchCV(pipe, param_grid=grid, cv=4, scoring='r2', n_jobs=-1, verbose=0)
    gs.fit(X_train, y_train)
    best = gs.best_estimator_

    # Hold-out metrics are reported only; they play no part in the selection.
    y_pred = best.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    results.append({
        'Model': name,
        'CV_R2': gs.best_score_,
        'Test_R2': r2_score(y_test, y_pred),
        'Test_RMSE': math.sqrt(mse),
        'Best_Params': gs.best_params_,
    })
    best_models[name] = best

# Rank by cross-validated R2. Picking the winner by its test-set score would
# use the hold-out set for model selection and make the reported test metrics
# optimistic.
results_df = pd.DataFrame(results).sort_values('CV_R2',ascending=False).reset_index(drop=True)
display(results_df)
results_df.to_csv(RESULTS_CSV, index=False)

best_name = results_df.iloc[0]['Model']
best_pipeline = best_models[best_name]
joblib.dump(best_pipeline, MODEL_OUT)
print('Saved model:', MODEL_OUT, ' Best:', best_name)


## Feature importances (if available)

In [None]:

# Feature names, in the order the model sees them.
pre = best_pipeline.named_steps['preprocess']
try:
    # Ask the fitted preprocessor for its output names so they stay aligned
    # with the model's inputs even if a transformer dropped a column. The
    # transformer prefix ('num__' / 'cat__') is stripped for readability.
    all_names = [str(n).split('__', 1)[-1] for n in pre.get_feature_names_out()]
except Exception as e:
    # Fallback: rebuild the list by hand (numeric names, then one-hot names).
    print('Preprocessor feature names unavailable, rebuilding manually:', e)
    try:
        ohe = pre.named_transformers_['cat'].named_steps['onehot']
        cat_names = list(ohe.get_feature_names_out(cat_features)) if len(cat_features)>0 else []
    except Exception as e_cat:
        print('One-hot feature names unavailable:', e_cat)
        cat_names = []
    all_names = num_features + cat_names

# Importance scores: tree-based models expose feature_importances_, linear
# models expose coef_ (absolute values are used as a magnitude proxy).
importance = None
try:
    model_inner = best_pipeline.named_steps['model']
    if hasattr(model_inner, 'feature_importances_'):
        importance = model_inner.feature_importances_
    elif hasattr(model_inner, 'coef_'):
        importance = np.abs(model_inner.coef_).ravel()
except Exception as e:
    print('Importance extraction failed:', e)

# Only plot when every score can be paired with a name.
if importance is not None and len(importance)==len(all_names):
    imp_df = pd.DataFrame({'Feature':all_names,'Importance':importance}).sort_values('Importance',ascending=False).head(25)
    display(imp_df)
    path = os.path.join(PLOTS_DIR, f'feature_importances_{best_name}.png')
    fig, ax = plt.subplots(figsize=(8,6))
    try:
        ax.barh(imp_df['Feature'], imp_df['Importance'])
        ax.invert_yaxis()  # largest importance at the top
        fig.tight_layout()
        fig.savefig(path, dpi=150)
    finally:
        plt.close(fig)
    print('Saved importances to', path)
else:
    print('No importances available or length mismatch.')


## Conclusion
This notebook saves the cleaned data, the best model pipeline, the model comparison results, and the plots. As next steps, you can tune the models further or export the notebook as a PDF.

In [None]:

# Recap of every artifact written by this notebook.
print('Done. Files created:')
artifacts = [
    ('Cleaned CSV', CLEANED_CSV),
    ('Results CSV', RESULTS_CSV),
    ('Model pipeline', MODEL_OUT),
    ('Plots dir', PLOTS_DIR),
]
for label, location in artifacts:
    print(f'- {label}:', location)
