<a href="https://colab.research.google.com/github/krvax/API/blob/master/Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mutual Information

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Apply matplotlib's "seaborn-v0_8-whitegrid" style sheet to every figure drawn below.
plt.style.use("seaborn-v0_8-whitegrid")
# NOTE(review): the relative path is resolved against the kernel's working
# directory (on Colab that is presumably /content) — confirm before running elsewhere.
df = pd.read_csv("../content/autos.csv")
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Inspect the dtype pandas inferred for each column of the raw frame.
df.dtypes

In [None]:
# For the MI estimator to work well, text features (such as the make) are
# converted to numbers with a technique called "label encoding": every
# category is assigned its own integer (e.g. "alfa-romeo" = 0, "audi" = 1, ...).
# After this step every discrete feature is represented by integers.

# Work on a copy so `df` keeps its original columns; `pop` removes the target
# from the feature matrix and returns it.
X = df.copy()
y = X.pop("price")

# Label encoding for categoricals
for colname in X.select_dtypes("object"):
    # factorize() returns (codes, uniques); only the integer codes are kept.
    # Indexing instead of unpacking into `_` avoids overwriting IPython's
    # last-output variable `_` in the notebook namespace.
    X[colname] = X[colname].factorize()[0]

In [None]:
# Check the column dtypes of the feature matrix after label encoding.
X.dtypes

In [None]:
# Boolean mask (indexed by column name): True for integer-typed columns, which
# are treated as discrete features. `is_integer_dtype` matches every integer
# width, whereas comparing against the builtin `int` only matches the
# platform's default integer and can miss int64 columns where that default
# is 32-bit.
discrete_features = X.dtypes.map(pd.api.types.is_integer_dtype)

In [None]:
# Show the per-column boolean mask that marks which features count as discrete.
print(discrete_features)

In [None]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y, discrete_features, random_state=0):
    """Rank the features of ``X`` by their mutual information with ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names become the index of the result.
    y : array-like
        Target values, one per row of ``X``.
    discrete_features : array-like of bool
        Forwarded to ``mutual_info_regression`` to mark the discrete columns.
    random_state : int, RandomState instance or None, default 0
        Seed forwarded to ``mutual_info_regression``. The estimator is
        randomized, so a fixed seed keeps the scores (and their ranking)
        stable across runs. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by feature name, sorted from the
        highest to the lowest score.
    """
    mi_scores = mutual_info_regression(
        X,
        y,
        discrete_features=discrete_features,
        random_state=random_state,
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

# Score every feature against the target, best first.
mi_scores = make_mi_scores(X, y, discrete_features)
# Preview every third entry of the ranking (positional slice).
mi_scores.iloc[::3]

In [None]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of MI scores on the current pyplot figure.

    ``scores`` is a pandas Series indexed by feature name. Bars are drawn in
    ascending order, so the highest score ends up at the top of the chart.
    """
    ordered = scores.sort_values(ascending=True)
    # One y-position per bar, labelled with the matching feature name.
    positions = np.arange(len(ordered))
    labels = ordered.index.tolist()
    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")


# Open a new 8x5-inch figure at 100 dpi; plot_mi_scores draws into the
# current pyplot figure, so it must be created first.
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores)

In [None]:
# Scatter of price against curb_weight; the trailing `;` hides the returned object's repr.
sns.relplot(x="curb_weight", y="price", data=df);

In [None]:
# Price against horsepower with a regression fit drawn per fuel_type (via `hue`);
# the trailing `;` hides the returned object's repr.
sns.lmplot(x="horsepower", y="price", hue="fuel_type", data=df);

# Creating Features

In [None]:
# New feature: element-wise ratio of the `stroke` column to the `bore` column.
df["stroke_ratio"] = df["stroke"] / df["bore"]

# Show the inputs next to the derived column.
df[["stroke", "bore", "stroke_ratio"]].head()

In [None]:
# New feature: cylinder-volume formula pi * (bore / 2)^2 * stroke,
# multiplied by the number of cylinders.
df["displacement"] = (
    np.pi
    * (0.5 * df["bore"]) ** 2
    * df["stroke"]
    * df["num_of_cylinders"]
)

In [None]:
# Interaction feature: join the two categorical columns into a single label,
# "<make>_<body_style>".
df["make_and_style"] = df["make"] + "_" + df["body_style"]
# Show the source columns next to the combined one.
df[["make", "body_style", "make_and_style"]].head()

# Target Encoding

In [None]:
# Target encoding: replace each make with the mean price of all rows sharing
# that make, broadcast back to every row by `transform`.
# NOTE(review): the mean is taken over the full frame, so each row's own price
# contributes to its encoding — if this feature feeds a model, compute it on a
# held-out split to avoid target leakage.
df["make_encoded"] = df.groupby("make")["price"].transform("mean")

# Compare the raw category, the target, and the encoded value.
df[["make", "price", "make_encoded"]].head(10)