In [2]:
# ---------- Imports ----------
import pandas as pd
import numpy as np
import gc
import mlflow
import mlflow.sklearn
import dagshub
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score


# ---------- DagsHub + MLflow Setup ----------
dagshub.init(repo_owner='iamprashantjain', repo_name='laptop_price_predictor_mlops', mlflow=True)
mlflow.set_experiment("RandomForest_with_Storage_HPT")
mlflow.sklearn.autolog(disable=True)  # Disable autolog to avoid duplication


# ---------- Data Load ----------
df = pd.read_csv(r"I:\CampusX_DS\campusx_dsmp2\9. MLOps revisited\laptop_price_predictor_mlops\laptop_data.csv")
df.drop(columns=['Unnamed: 0'], inplace=True)


# ---------- Preprocessing Functions ----------
def fetch_processor(text):
    if text in ['Intel Core i7', 'Intel Core i5', 'Intel Core i3']:
        return text
    elif text.startswith('Intel'):
        return 'Other Intel Processor'
    else:
        return 'AMD Processor'

def cat_os(inp):
    if inp in ['Windows 10', 'Windows 7', 'Windows 10 S']:
        return 'Windows'
    elif inp in ['macOS', 'Mac OS X']:
        return 'Mac'
    else:
        return 'Others/No OS/Linux'

def preprocess(df):
    df['Ram'] = df['Ram'].str.replace('GB', '', regex=False).astype('int32')
    df['Weight'] = df['Weight'].str.replace('kg', '', regex=False).astype('float32')

    df['Touchscreen'] = df['ScreenResolution'].apply(lambda x: 1 if 'Touchscreen' in x else 0)
    df['Ips'] = df['ScreenResolution'].apply(lambda x: 1 if 'IPS' in x else 0)

    new_res = df['ScreenResolution'].str.split('x', n=1, expand=True)
    df['X_res'] = new_res[0].str.replace(',', '').str.extract(r'(\d+)').astype(int)
    df['Y_res'] = new_res[1].astype(int)
    df['ppi'] = (((df['X_res'] ** 2 + df['Y_res'] ** 2) ** 0.5) / df['Inches']).astype(float)
    df.drop(columns=['ScreenResolution', 'Inches', 'X_res', 'Y_res'], inplace=True)

    df['Cpu Name'] = df['Cpu'].apply(lambda x: " ".join(x.split()[0:3]))
    df['Cpu brand'] = df['Cpu Name'].apply(fetch_processor)
    df.drop(columns=['Cpu', 'Cpu Name'], inplace=True)

    df['Memory'] = df['Memory'].astype(str).replace('\.0', '', regex=True)
    df['Memory'] = df['Memory'].str.replace('GB', '')
    df['Memory'] = df['Memory'].str.replace('TB', '000')

    mem_split = df['Memory'].str.split('+', n=1, expand=True)
    df['first'] = mem_split[0].str.strip()
    df['second'] = mem_split[1].fillna('0')

    for col in ['first', 'second']:
        df[f"{col}_HDD"] = df[col].apply(lambda x: 1 if 'HDD' in x else 0)
        df[f"{col}_SSD"] = df[col].apply(lambda x: 1 if 'SSD' in x else 0)
        df[col] = df[col].str.replace(r'\D', '', regex=True).astype(int)

    df['HDD'] = df['first'] * df['first_HDD'] + df['second'] * df['second_HDD']
    df['SSD'] = df['first'] * df['first_SSD'] + df['second'] * df['second_SSD']
    df.drop(columns=['Memory', 'first', 'second', 'first_HDD', 'first_SSD', 'second_HDD', 'second_SSD'], inplace=True)

    df['Gpu brand'] = df['Gpu'].apply(lambda x: x.split()[0])
    df = df[df['Gpu brand'] != 'ARM']
    df.drop(columns=['Gpu'], inplace=True)

    df['os'] = df['OpSys'].apply(cat_os)
    df.drop(columns=['OpSys'], inplace=True)

    df.reset_index(drop=True, inplace=True)
    return df


# ---------- Preprocess Data ----------
df = preprocess(df)
target = 'Price'
X = df.drop(columns=[target])
y = df[target]


# ---------- Feature Columns ----------
numeric_features = ['Ram', 'Weight', 'ppi', 'HDD', 'SSD']
categorical_features = ['Company', 'TypeName', 'Cpu brand', 'Gpu brand', 'os']


# ---------- Train-Test Split ----------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# ---------- Preprocessor ----------
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])


# ---------- Pipeline ----------
pipe = Pipeline([
    ('preprocess', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])


# ---------- Hyperparameter Grid ----------
param_grid = {
    'model__n_estimators': [50, 100, 150],
    'model__max_depth': [10, 15, 20],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2]
}


# ---------- Grid Search with MLflow Logging ----------
with mlflow.start_run(run_name="RF_GridSearch_with_Storage") as run:
    grid = GridSearchCV(pipe, param_grid, cv=3, scoring='r2', verbose=2, n_jobs=-1)
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Log best parameters and metrics
    mlflow.log_params(grid.best_params_)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("R2", r2)

    # Log model
    mlflow.sklearn.log_model(best_model, "best_model_rf_with_storage")

    print(f"\n✅ Best Parameters: {grid.best_params_}")
    print(f"📉 MAE: {mae:.2f}")
    print(f"📈 R² Score: {r2:.2f}")


2025/05/29 22:34:19 INFO mlflow.tracking.fluent: Experiment with name 'RandomForest_with_Storage_HPT' does not exist. Creating a new experiment.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Gpu'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['os'] = df['OpSys'].apply(cat_os)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['OpSys'], inplace=True)


Fitting 3 folds for each of 36 candidates, totalling 108 fits





✅ Best Parameters: {'model__max_depth': 15, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 100}
📉 MAE: 10455.49
📈 R² Score: 0.82


2025/05/29 22:38:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run RF_GridSearch_with_Storage at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/3/runs/e998071fc64a4b4d8365706bc6a6bd19.
2025/05/29 22:38:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/3.


In [3]:
print(grid.best_params_)

{'model__max_depth': 15, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 100}
