In [29]:
import mlflow
import mlflow.sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
import dagshub

In [30]:
dagshub.init(repo_owner='iamprashantjain', repo_name='laptop_price_predictor_mlops', mlflow=True)
mlflow.set_experiment("Best Feature_Engg and Model combination")
mlflow.sklearn.autolog()

In [31]:
df = pd.read_csv(r"I:\CampusX_DS\campusx_dsmp2\9. MLOps revisited\laptop_price_predictor_mlops\laptop_data.csv")
df.drop(columns=['Unnamed: 0'], inplace=True)

In [32]:
def fetch_processor(text):
    if text in ['Intel Core i7', 'Intel Core i5', 'Intel Core i3']:
        return text
    elif text.startswith('Intel'):
        return 'Other Intel Processor'
    else:
        return 'AMD Processor'


def cat_os(inp):
    if inp in ['Windows 10', 'Windows 7', 'Windows 10 S']:
        return 'Windows'
    elif inp in ['macOS', 'Mac OS X']:
        return 'Mac'
    else:
        return 'Others/No OS/Linux'

In [33]:
def preprocess(df):
    df['Ram'] = df['Ram'].str.replace('GB', '', regex=False).astype('int32')
    df['Weight'] = df['Weight'].str.replace('kg', '', regex=False).astype('float32')

    df['Touchscreen'] = df['ScreenResolution'].apply(lambda x: 1 if 'Touchscreen' in x else 0)
    df['Ips'] = df['ScreenResolution'].apply(lambda x: 1 if 'IPS' in x else 0)

    new_res = df['ScreenResolution'].str.split('x', n=1, expand=True)
    df['X_res'] = new_res[0].str.replace(',', '').str.extract(r'(\d+)').astype(int)
    df['Y_res'] = new_res[1].astype(int)

    df['ppi'] = (((df['X_res'] ** 2 + df['Y_res'] ** 2) ** 0.5) / df['Inches']).astype(float)

    df.drop(columns=['ScreenResolution', 'Inches', 'X_res', 'Y_res'], inplace=True)

    df['Cpu Name'] = df['Cpu'].apply(lambda x: " ".join(x.split()[0:3]))
    df['Cpu brand'] = df['Cpu Name'].apply(fetch_processor)
    df.drop(columns=['Cpu', 'Cpu Name'], inplace=True)

    df['Memory'] = df['Memory'].astype(str).replace('\.0', '', regex=True)
    df['Memory'] = df['Memory'].str.replace('GB', '')
    df['Memory'] = df['Memory'].str.replace('TB', '000')

    mem_split = df['Memory'].str.split('+', n=1, expand=True)
    df['first'] = mem_split[0].str.strip()
    df['second'] = mem_split[1].fillna('0')

    for col in ['first', 'second']:
        df[f"{col}_HDD"] = df[col].apply(lambda x: 1 if 'HDD' in x else 0)
        df[f"{col}_SSD"] = df[col].apply(lambda x: 1 if 'SSD' in x else 0)
        df[col] = df[col].str.replace(r'\D', '', regex=True).astype(int)

    df['HDD'] = df['first'] * df['first_HDD'] + df['second'] * df['second_HDD']
    df['SSD'] = df['first'] * df['first_SSD'] + df['second'] * df['second_SSD']

    df.drop(columns=['Memory', 'first', 'second','first_HDD', 'first_SSD','second_HDD', 'second_SSD'], errors='ignore', inplace=True)

    df['Gpu brand'] = df['Gpu'].apply(lambda x: x.split()[0])
    df = df[df['Gpu brand'] != 'ARM']
    df.drop(columns=['Gpu'], inplace=True)

    df['os'] = df['OpSys'].apply(cat_os)
    df.drop(columns=['OpSys'], inplace=True)

    df.reset_index(drop=True, inplace=True)
    
    return df

In [34]:
df = preprocess(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Gpu'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['os'] = df['OpSys'].apply(cat_os)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['OpSys'], inplace=True)


In [35]:
target = 'Price'
X = df.drop(columns=[target])
y = df[target]

In [36]:
numeric_features = ['Ram', 'Weight', 'ppi']
categorical_features = ['Company', 'TypeName', 'Cpu brand', 'Gpu brand', 'os']

In [None]:
import mlflow.sklearn
import gc
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor


# Feature engineering options
feature_eng_options = {
    "base": ColumnTransformer([
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]),
    
    "with_storage": ColumnTransformer([
        ('num', StandardScaler(), numeric_features + ['HDD', 'SSD']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])
}

# Model definitions and parameter grids
model_configs = {
    "RandomForest": RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42),
    "Ridge": Ridge(alpha=1.0),
    "XGBoost": XGBRegressor(n_estimators=50, max_depth=3, random_state=42, verbosity=0),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42)
}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Start parent MLflow run
with mlflow.start_run(run_name="all_experiments") as parent_run:
    for fe_name, fe_transformer in feature_eng_options.items():
        for model_name, model in model_configs.items():
            with mlflow.start_run(run_name=f"{model_name} with {fe_name}", nested=True) as child_run:

                # Build pipeline
                pipe = Pipeline([
                    ("preprocess", fe_transformer),
                    ("model", model)
                ])

                # Fit model
                pipe.fit(X_train, y_train)

                # Predict and evaluate
                y_pred = pipe.predict(X_test)
                mae = mean_absolute_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)

                # Log parameters
                mlflow.log_param("feature_engineering", fe_name)
                mlflow.log_param("model_name", model_name)
                mlflow.log_param("test_size", 0.2)

                # Log model-specific parameters
                if model_name == "RandomForest":
                    mlflow.log_param("n_estimators", model.n_estimators)
                    mlflow.log_param("max_depth", model.max_depth)

                elif model_name == "Ridge":
                    mlflow.log_param("alpha", model.alpha)

                elif model_name == "XGBoost":
                    mlflow.log_param("n_estimators", model.n_estimators)
                    mlflow.log_param("max_depth", model.max_depth)

                elif model_name == "GradientBoosting":
                    mlflow.log_param("n_estimators", model.n_estimators)
                    mlflow.log_param("learning_rate", model.learning_rate)
                    mlflow.log_param("max_depth", model.max_depth)

                # Log metrics
                mlflow.log_metric("MAE", mae)
                mlflow.log_metric("R2", r2)

                # Log model
                mlflow.sklearn.log_model(pipe, "model")

                # Console output
                print(f"\nModel: {model_name} | Feature Eng: {fe_name}")
                print(f"MAE: {mae:.2f} | R2: {r2:.2f}")

                del pipe
                gc.collect()




Model: RandomForest | Feature Eng: base
MAE: 11310.37 | R2: 0.79


2025/05/29 21:54:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest with base at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2/runs/3d4cc70d09fe4e21829f3b3ddd4cbb2b.
2025/05/29 21:54:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2.



Model: Ridge | Feature Eng: base
MAE: 14058.71 | R2: 0.72


2025/05/29 21:57:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run Ridge with base at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2/runs/65945211cfd94ce5b651eb0d44ba96a0.
2025/05/29 21:57:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2.



Model: XGBoost | Feature Eng: base
MAE: 11571.34 | R2: 0.79


2025/05/29 22:00:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBoost with base at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2/runs/5ee5502282ad4580ae24efa76700ff7a.
2025/05/29 22:00:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2.



Model: GradientBoosting | Feature Eng: base
MAE: 12442.77 | R2: 0.78


2025/05/29 22:03:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run GradientBoosting with base at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2/runs/ec167be1710742c694e937618f46c421.
2025/05/29 22:03:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2.



Model: RandomForest | Feature Eng: with_storage
MAE: 10871.07 | R2: 0.81


2025/05/29 22:06:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest with with_storage at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2/runs/1047ed1e969e4f4e883b7afd390718b9.
2025/05/29 22:06:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2.



Model: Ridge | Feature Eng: with_storage
MAE: 13319.44 | R2: 0.74


2025/05/29 22:09:39 INFO mlflow.tracking._tracking_service.client: 🏃 View run Ridge with with_storage at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2/runs/f080cca1fc1a41b5a51eceb0d873a6e2.
2025/05/29 22:09:39 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2.



Model: XGBoost | Feature Eng: with_storage
MAE: 11215.00 | R2: 0.81


2025/05/29 22:12:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBoost with with_storage at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2/runs/8d5775674800400783435d656cb7c341.
2025/05/29 22:12:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2.



Model: GradientBoosting | Feature Eng: with_storage
MAE: 12233.45 | R2: 0.79


2025/05/29 22:16:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run GradientBoosting with with_storage at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2/runs/10c676bb9b224870ab112904b17f46e0.
2025/05/29 22:16:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2.
2025/05/29 22:16:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run all_experiments at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2/runs/eefdbff2198946d089a80a0fbffed29b.
2025/05/29 22:16:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/laptop_price_predictor_mlops.mlflow/#/experiments/2.
