# 02 - Modelo de Regressão para Preço

## 📌 Objective

Build and evaluate machine learning models to predict smartphone prices (USD)
based on technical specifications, and analyze which features most influence pricing.


In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Fixed conversion rate applied to the INR price column to derive the USD target.
INR_TO_USD = 0.012

# Load the raw dataset and append the USD price as a new column.
df = pd.read_csv('../data/smartphones.csv').assign(
    price_usd=lambda frame: frame['price_inr'] * INR_TO_USD
)

# Features: int64/float64 columns only, with both price columns removed so the
# target (and its INR source, a scalar multiple of it) cannot leak into X.
X = (
    df.select_dtypes(include=['int64', 'float64'])
      .drop(columns=['price_inr', 'price_usd'])
)
y = df['price_usd']

# 80/20 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Random forest regressor: 200 trees, fixed seed, n_jobs=-1 for parallel
# training. `fit` returns the estimator itself, so it can be chained.
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1).fit(
    X_train, y_train
)

# Score on the held-out split; MAE is in the same unit as the target (USD).
pred_rf = rf.predict(X_test)
mae_rf = mean_absolute_error(y_test, pred_rf)

mae_rf


46.733911380325885

In [23]:
# Pair each fitted importance score with its feature name, ranked from most to
# least important.
feature_importance = pd.Series(
    data=rf.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)

# Show the ten highest-ranked features.
feature_importance.head(10)


clock_speed_ghz         0.576268
rating_score            0.152440
display_inches          0.118602
battery_mah             0.034341
storage_gb              0.029160
charging_watt           0.026908
res_width_px            0.026711
front_camera_main_mp    0.010633
res_height_px           0.009643
ram_gb                  0.005373
dtype: float64

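According to the random forest's impurity-based feature importances, CPU clock speed,
user rating, and display size are the three most influential features for predicting
smartphone price. These scores reflect each feature's contribution to this model's
predictions, not a causal effect on price.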


In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Linear baseline: missing values are filled with the per-column median
# (learned during fit) before the ordinary least-squares model is fitted.
lr_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('model', LinearRegression()),
    ]
).fit(X_train, y_train)

# Evaluate on the same held-out split used for the random forest.
pred_lr = lr_pipeline.predict(X_test)
mae_lr = mean_absolute_error(y_test, pred_lr)

mae_lr


105.53085259762565