# Árboles de regresión

In [1]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

In [3]:
# Load the advertising data; the relative path is resolved against the
# current working directory.
df = pd.read_csv("Advertising.csv")

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# --- Linear-regression benchmark ---------------------------------------------
# Candidate column names, listed in both capitalisations; the ones actually
# present in `df` are picked up below.
possible_targets = ['Sales', 'sales']
possible_features = ['TV', 'Radio', 'Newspaper', 'tv', 'radio', 'newspaper']

# First candidate target that exists in the frame (None when neither does).
found_target = next(
    (name for name in possible_targets if name in df.columns),
    None,
)
if found_target is None:
    raise ValueError("No se encontró columna objetivo automática (busqué 'Sales' o 'sales'). Define target_col manualmente.")
target_col = found_target

# Prefer the known feature names; otherwise fall back to every numeric column
# except the target.
features = [name for name in possible_features if name in df.columns]
if not features:
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    features = [name for name in num_cols if name != target_col]
    if not features:
        raise ValueError("No se encontraron features numéricas para usar en la regresión. Revisa el CSV o define `features` manualmente.")

print(f"Usando target: {target_col}")
print(f"Usando features: {features}")

# 80/20 hold-out split with a fixed seed so the numbers are reproducible.
X, y = df[features].values, df[target_col].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares and score it on the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Naive baseline: always predict the mean of the training target.
y_base = np.full(y_test.shape, y_train.mean(), dtype=float)
baseline_rmse = np.sqrt(mean_squared_error(y_test, y_base))

print("LinearRegression benchmark")
print(f"RMSE: {rmse:.4f}")
print(f"R^2: {r2:.4f}")
print(f"Baseline RMSE (predicción = media): {baseline_rmse:.4f}")

# One fitted coefficient per feature, in the same order as `features`.
coef_pairs = list(zip(features, lr.coef_))
print("Coeficientes:")
for feature_name, weight in coef_pairs:
    print(f"  {feature_name}: {weight:.4f}")



Usando target: sales
Usando features: ['TV', 'radio', 'newspaper']
LinearRegression benchmark
RMSE: 1.7816
R^2: 0.8994
Baseline RMSE (predicción = media): 5.6315
Coeficientes:
  TV: 0.0447
  radio: 0.1892
  newspaper: 0.0028


In [7]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, train_test_split


# --- Decision-tree regressor tuned with a grid search ------------------------
possible_targets = ['Sales', 'sales']
possible_features = ['TV', 'Radio', 'Newspaper', 'tv', 'radio', 'newspaper']

# First candidate target that exists in the frame (None when neither does).
found_target = next(
    (name for name in possible_targets if name in df.columns),
    None,
)
if found_target is None:
    raise ValueError("No se encontró columna objetivo automática (busqué 'Sales' o 'sales'). Define target_col manualmente.")
target_col = found_target

# Prefer the known feature names; otherwise fall back to every numeric column
# except the target.
features = [name for name in possible_features if name in df.columns]
if not features:
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    features = [name for name in num_cols if name != target_col]
    if not features:
        raise ValueError("No se encontraron features numéricas para usar en la regresión. Revisa el CSV o define `features` manualmente.")

# Same 80/20 split and seed as the linear-regression cell.
X, y = df[features].values, df[target_col].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5-fold cross-validated search over tree depth and minimum leaf size,
# selecting on R^2.
param_grid = {
    'max_depth': [2, 3, 4, 5, 6, 8, 10, None],
    'min_samples_leaf': [1, 2, 5],
}
reg = DecisionTreeRegressor(random_state=42)
grid = GridSearchCV(
    estimator=reg,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=0,
)
grid.fit(X_train, y_train)

# Score the best estimator found by the search on the held-out rows.
best = grid.best_estimator_
y_pred = best.predict(X_test)
r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'DecisionTreeRegressor - Test R^2: {r2_test:.4f}, RMSE: {rmse_test:.4f}')

# `r2` / `rmse` come from the linear-regression cell; if that cell has not
# been run in this kernel, print a hint instead of failing.
try:
    lr_summary = f'LinearRegression - Test R^2: {r2:.4f}, RMSE: {rmse:.4f}'
except NameError:
    lr_summary = 'LinearRegression metrics no disponibles en el kernel actual. Ejecuta la celda de regresión benchmark primero si quieres comparar.'
print(lr_summary)

if hasattr(best, 'feature_importances_'):
    importances = best.feature_importances_
    print('Importancia de features:')
    for feature_name, importance in zip(features, importances):
        print(f'  {feature_name}: {importance:.4f}')

# Keep a handle on the tuned tree for later cells.
best_tree_model = best


DecisionTreeRegressor - Test R^2: 0.9353, RMSE: 1.4288
LinearRegression - Test R^2: 0.8994, RMSE: 1.7816
Importancia de features:
  TV: 0.6071
  radio: 0.3788
  newspaper: 0.0141


In [9]:
# --- Side-by-side R^2 comparison ---------------------------------------------
# Reuses metrics already present in the kernel when they exist and otherwise
# recomputes what it can, so the cell always prints a best-effort summary.

# Imported here so the fallback paths below do not rely on imports made by
# earlier cells (previously a missing `r2_score` was swallowed and reported
# as "no disponible").
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

if 'X_test' not in globals() or 'y_test' not in globals():
    # No split in the kernel: rebuild it with the same column detection,
    # test_size and random_state used by the other cells.
    possible_targets = ['Sales', 'sales']
    found_target = next((c for c in possible_targets if c in df.columns), None)
    if found_target is None:
        raise ValueError("No se encontró la columna objetivo para comparar. Ejecuta las celdas previas o define `target_col` y split manualmente.")
    target_col = found_target
    possible_features = ['TV', 'Radio', 'Newspaper', 'tv', 'radio', 'newspaper']
    features = [c for c in possible_features if c in df.columns]
    if len(features) == 0:
        num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        features = [c for c in num_cols if c != target_col]
        if len(features) == 0:
            # Same guard as the other cells: fail early with a clear message
            # instead of splitting a matrix with zero columns.
            raise ValueError("No se encontraron features. Define `features` manualmente.")
    X = df[features].values
    y = df[target_col].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear regression: prefer the stored metric, else re-score the fitted model.
r2_lr = None
if 'r2' in globals():
    r2_lr = r2
elif 'lr' in globals():
    try:
        r2_lr = r2_score(y_test, lr.predict(X_test))
    except Exception as exc:
        # Stay best-effort, but report why the metric is unavailable.
        print(f"No se pudo calcular el R^2 de LinearRegression: {exc}")
        r2_lr = None

# Decision tree: prefer the stored metric, else re-score the tuned tree.
r2_tree = None
if 'r2_test' in globals():
    r2_tree = r2_test
elif 'best_tree_model' in globals():
    try:
        r2_tree = r2_score(y_test, best_tree_model.predict(X_test))
    except Exception as exc:
        # Stay best-effort, but report why the metric is unavailable.
        print(f"No se pudo calcular el R^2 de DecisionTreeRegressor: {exc}")
        r2_tree = None

print('Comparación de R^2:')
print(f"LinearRegression R^2: {r2_lr:.4f}" if r2_lr is not None else "LinearRegression R^2: no disponible")
print(f"DecisionTreeRegressor R^2: {r2_tree:.4f}" if r2_tree is not None else "DecisionTreeRegressor R^2: no disponible")


Comparación de R^2:
LinearRegression R^2: 0.8994
DecisionTreeRegressor R^2: 0.9353


Al comparar los dos modelos, el árbol de decisión obtiene un mejor resultado que la regresión lineal en el conjunto de prueba, con un $R^2$ de 0.935 frente a 0.899. Esto sugiere que el árbol capta mejor las relaciones no lineales entre la inversión en ‘TV’, ‘radio’ y ‘newspaper’ y las ventas.

In [18]:
from sklearn.model_selection import train_test_split, GridSearchCV

import matplotlib.pyplot as plt

# --- Train vs. test R^2 for both models --------------------------------------
possible_targets = ['Sales', 'sales']
possible_features = ['TV', 'Radio', 'Newspaper', 'tv', 'radio', 'newspaper']

# First candidate target that exists in the frame (None when neither does).
found_target = next(
    (name for name in possible_targets if name in df.columns),
    None,
)
if found_target is None:
    raise ValueError("No se encontró columna objetivo automática (busqué 'Sales'/'sales'). Define `target_col` manualmente.")
target_col = found_target

# Prefer the known feature names; otherwise fall back to every numeric column
# except the target.
features = [name for name in possible_features if name in df.columns]
if not features:
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    features = [name for name in num_cols if name != target_col]
    if not features:
        raise ValueError("No se encontraron features. Define `features` manualmente.")

# Same 80/20 split and seed as the previous cells.
X, y = df[features].values, df[target_col].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Linear regression, scored on both partitions.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr_train = lr.predict(X_train)
y_pred_lr_test = lr.predict(X_test)

r2_lr_train = r2_score(y_train, y_pred_lr_train)
r2_lr_test = r2_score(y_test, y_pred_lr_test)
rmse_lr_test = np.sqrt(mean_squared_error(y_test, y_pred_lr_test))

# Decision tree tuned by a 5-fold grid search selecting on R^2.
param_grid = {
    'max_depth': [2, 3, 4, 5, 6, None],
    'min_samples_leaf': [1, 2, 5],
}
dt = DecisionTreeRegressor(random_state=42)
grid = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)
grid.fit(X_train, y_train)
best_tree = grid.best_estimator_

y_pred_tree_train = best_tree.predict(X_train)
y_pred_tree_test = best_tree.predict(X_test)

r2_tree_train = r2_score(y_train, y_pred_tree_train)
r2_tree_test = r2_score(y_test, y_pred_tree_test)
rmse_tree_test = np.sqrt(mean_squared_error(y_test, y_pred_tree_test))

# Report train and test R^2 side by side so a train/test gap is visible.
print(f"LinearRegression  - R^2 train: {r2_lr_train:.4f}, R^2 test: {r2_lr_test:.4f}, RMSE test: {rmse_lr_test:.4f}")
print(f"DecisionTree      - R^2 train: {r2_tree_train:.4f}, R^2 test: {r2_tree_test:.4f}, RMSE test: {rmse_tree_test:.4f}")
print()
print("Decision Tree - CV R^2 (best_score_):", grid.best_score_)


LinearRegression  - R^2 train: 0.8957, R^2 test: 0.8994, RMSE test: 1.7816
DecisionTree      - R^2 train: 1.0000, R^2 test: 0.9311, RMSE test: 1.4748

Decision Tree - CV R^2 (best_score_): 0.9379917962036435


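El árbol generaliza mejor que la regresión lineal ($R^2$ de test de 0.9311 frente a 0.8994), lo que apunta a relaciones no lineales capturadas por el árbol. Aun así, su $R^2$ de entrenamiento de 1.0000 frente a 0.9311 en test indica cierto sobreajuste.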

In [20]:
from sklearn.preprocessing import PolynomialFeatures

# --- Both models on degree-2 polynomial features -----------------------------
possible_targets = ['Sales', 'sales']
possible_features = ['TV', 'Radio', 'Newspaper', 'tv', 'radio', 'newspaper']

# First candidate target that exists in the frame (None when neither does).
found_target = next((c for c in possible_targets if c in df.columns), None)
if found_target is None:
    raise ValueError("No se encontró columna objetivo automática (busqué 'Sales'/'sales'). Define `target_col` manualmente.")
target_col = found_target

# Prefer the known feature names; otherwise fall back to every numeric column
# except the target.
features = [c for c in possible_features if c in df.columns]
if len(features) == 0:
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    features = [c for c in num_cols if c != target_col]
    if len(features) == 0:
        # Same guard as the other cells: fail early with a clear message
        # instead of fitting on a matrix with zero columns.
        raise ValueError("No se encontraron features. Define `features` manualmente.")

X = df[features].values
y = df[target_col].values

# Same 80/20 split and seed as the previous cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Degree-2 expansion; the transformer is fitted on the training rows only and
# then applied unchanged to the test rows.
poly = PolynomialFeatures(degree=2, include_bias=False)
Xp_train = poly.fit_transform(X_train)
Xp_test = poly.transform(X_test)

# Linear regression on the expanded features.
lr_poly = LinearRegression()
lr_poly.fit(Xp_train, y_train)
y_pred_lr_poly_train = lr_poly.predict(Xp_train)
y_pred_lr_poly_test = lr_poly.predict(Xp_test)

r2_lr_poly_train = r2_score(y_train, y_pred_lr_poly_train)
r2_lr_poly_test = r2_score(y_test, y_pred_lr_poly_test)
rmse_lr_poly_test = np.sqrt(mean_squared_error(y_test, y_pred_lr_poly_test))

# Decision tree on the expanded features, tuned by a 5-fold grid search
# selecting on R^2.
param_grid = {'max_depth': [2, 3, 4, 5, 6, None], 'min_samples_leaf': [1, 2, 5]}
dt = DecisionTreeRegressor(random_state=42)
grid_poly = GridSearchCV(dt, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_poly.fit(Xp_train, y_train)
best_tree_poly = grid_poly.best_estimator_

y_pred_tree_poly_train = best_tree_poly.predict(Xp_train)
y_pred_tree_poly_test = best_tree_poly.predict(Xp_test)

r2_tree_poly_train = r2_score(y_train, y_pred_tree_poly_train)
r2_tree_poly_test = r2_score(y_test, y_pred_tree_poly_test)
rmse_tree_poly_test = np.sqrt(mean_squared_error(y_test, y_pred_tree_poly_test))

print("Polynomial Features (degree=2) results:")
print(f"LinearRegression (poly) - R^2 train: {r2_lr_poly_train:.4f}, R^2 test: {r2_lr_poly_test:.4f}, RMSE test: {rmse_lr_poly_test:.4f}")
print(f"DecisionTree (poly)     - R^2 train: {r2_tree_poly_train:.4f}, R^2 test: {r2_tree_poly_test:.4f}, RMSE test: {rmse_tree_poly_test:.4f}")
print("")
print("Decision Tree (poly) - CV R^2 (best_score_):", grid_poly.best_score_)



Polynomial Features (degree=2) results:
LinearRegression (poly) - R^2 train: 0.9861, R^2 test: 0.9869, RMSE test: 0.6426
DecisionTree (poly)     - R^2 train: 0.9877, R^2 test: 0.9771, RMSE test: 0.8493

Decision Tree (poly) - CV R^2 (best_score_): 0.9570073382800925
