## Feature Engineering Demos

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm

# Load the insurance data from a CSV path relative to the working directory.
insurance_df = pd.read_csv("../Data/insurance.csv")

# Preview the first rows to confirm the file parsed as expected.
insurance_df.head()

In [0]:
def value_counter(dataframe):
    """Print the value counts of every object-dtype column in ``dataframe``.

    Args:
        dataframe: A pandas DataFrame. Only the columns selected by
            ``select_dtypes(["object"])`` are reported; all others are skipped.

    Returns:
        None. One ``value_counts()`` Series is written to stdout per column.
    """
    for col in dataframe.select_dtypes(["object"]).columns:
        print(dataframe[col].value_counts())
             
# Print the category frequencies of each object-dtype column in the dataset.
value_counter(insurance_df)

In [0]:
def cat_plotter(data, target):
    """Show one seaborn bar plot of ``target`` per object-dtype column.

    Args:
        data: DataFrame holding both the categorical columns and ``target``.
        target: Name of the column plotted on the y-axis.

    Returns:
        None. Each figure is displayed with ``plt.show()`` as it is drawn.
    """
    import matplotlib.pyplot as plt

    categorical_columns = data.select_dtypes(["object"]).columns
    for feature in categorical_columns:
        sns.barplot(data=data, x=feature, y=target)
        # Tilt the tick labels so longer category names do not overlap.
        plt.xticks(rotation=45)
        plt.show()
        
# Plot `charges` against each object-dtype column of the dataset.
cat_plotter(insurance_df, "charges")

In [0]:
# Collapse the four regions into two coarser groups (west / east).
region_dict = dict(
    northwest="west",
    southwest="west",
    northeast="east",
    southeast="east",
)

# Regions missing from the mapping become NaN in the new column.
insurance_df["region_group"] = insurance_df["region"].map(region_dict)

# Check how many rows landed in each group.
insurance_df["region_group"].value_counts()

In [0]:
# Binary flag: 1 where `children` is greater than zero, otherwise 0.
insurance_df["has_child"] = insurance_df["children"].gt(0).astype(int)

# Check the balance of the new flag.
insurance_df["has_child"].value_counts()

In [0]:
# Remove the raw `region` and `sex` columns from the modelling table.
insurance_df = insurance_df.drop(columns=["region", "sex"])

In [0]:
from sklearn.model_selection import train_test_split


# Build the modelling table: one-hot encode the remaining categorical columns
# and add a squared-age term.
# dtype=int keeps the dummy columns numeric. pandas >= 2.0 emits boolean
# dummies by default, and a frame mixing bool with float columns converts to
# an object array, which statsmodels' OLS rejects.
insurance_df = (
    pd.get_dummies(insurance_df, drop_first=True, dtype=int)
    .assign(
        # Computed from the frame flowing through the chain rather than the
        # outer variable, so the step does not rely on index alignment.
        age_sq=lambda df: df["age"] ** 2,
        # Optional interaction term; uncomment to experiment with it:
        # smoker_bmi_int=lambda df: df["smoker_yes"] * df["bmi"],
    )
)

# Design matrix with an explicit intercept column; `charges` is the target.
X = sm.add_constant(insurance_df.drop(columns=["charges"]))
y = insurance_df["charges"]

# Hold out 20% of the rows as a test set. X and y are rebound here, so from
# this point on they refer to the training split only.
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=2023)

X.head()

In [0]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_absolute_error as mae


# 5-fold cross-validation with shuffling; the fixed seed makes the fold
# assignment reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=2023)

# Validation scores collected per fold
cv_lm_r2s = []
cv_lm_mae = []

for train_ind, val_ind in kf.split(X, y):
    # kf.split yields positional indices, hence .iloc rather than .loc
    X_train, y_train = X.iloc[train_ind], y.iloc[train_ind]
    X_val, y_val = X.iloc[val_ind], y.iloc[val_ind]
    # Fit on this fold's training rows only
    model = sm.OLS(y_train, X_train).fit()
    # Predict once per fold and reuse the result for both metrics
    val_pred = model.predict(X_val)
    cv_lm_r2s.append(r2(y_val, val_pred))
    cv_lm_mae.append(mae(y_val, val_pred))

print("All Validation R2s: ", [round(x, 3) for x in cv_lm_r2s])
print(f"Cross Val R2s: {round(np.mean(cv_lm_r2s), 3)} +- {round(np.std(cv_lm_r2s), 3)}")

print("All Validation MAEs: ", [round(x, 3) for x in cv_lm_mae])
print(f"Cross Val MAEs: {round(np.mean(cv_lm_mae), 3)} +- {round(np.std(cv_lm_mae), 3)}")

In [0]:
# Refit OLS on all of X and y, with no validation fold held out.
model = sm.OLS(y, X).fit()

# Test-set scoring is left disabled; uncomment to evaluate on X_test / y_test.
# print(f"Test R2: {r2(y_test, model.predict(X_test))}")
# print(f"Test MAE: {mae(y_test, model.predict(X_test))}")

In [0]:
# Display the results summary of the fitted model.
model.summary()

In [0]:
# Toy order-level table: one row per (customer, order date, product) line.
# Order_Date is given as ISO strings and cast to datetimes afterwards. The
# dtype needs an explicit unit: pandas >= 2.0 raises TypeError for the
# unit-less "datetime64".
orders = pd.DataFrame({
    "Customer_ID": [10001, 10001, 10002, 10002, 10003, 10004],
    "Order_Date": ["2023-01-25", "2023-06-13", "2023-02-02", "2023-02-02", "2023-05-15", "2023-02-17"],
    "Product_ID": ["GPQ-123", "QP3-111", "GPQ-123", "ACD-489", "GPQ-123", "QP3-123"],
    "Item_Price": [59.99, 79.99, 59.99, 19.99, 59.99, 79.99],
    "Quantity": [3, 5, 1, 2, 1, 4],
}).astype({"Order_Date": "datetime64[ns]"})

orders.head()

In [0]:
# Derive per-row features: the line total, and the whole number of days
# between each row's Order_Date and the moment this cell runs.
orders = orders.assign(
    total_value=lambda df: df["Item_Price"].mul(df["Quantity"]),
    days_since_last_purchase=lambda df: (
        pd.Timestamp.today() - df["Order_Date"]
    ).dt.days,
)

orders.head()

In [0]:
# Roll the order rows up to one row per customer: total quantity and the
# mean of total_value.
orders.groupby("Customer_ID").agg(
    total_items_sold=pd.NamedAgg(column="Quantity", aggfunc="sum"),
    average_value=pd.NamedAgg(column="total_value", aggfunc="mean"),
)

In [0]:
# NOTE: the helper below is intentionally left commented out (disabled) and is
# kept for reference only. As written it would add a column holding how many
# group standard deviations each `feature` value lies from its `category` mean.
# def add_deviation_feature(X, feature, category):
    
#     # temp groupby object
#     category_gb = X.groupby(category)[feature]
    
#     # create columns of category means and standard deviations
#     category_mean = category_gb.transform(lambda x: x.mean())
#     category_std = category_gb.transform(lambda x: x.std())
    
#     # compute stds from category mean for each feature value,
#     # add to X as new feature
#     deviation_feature = (X[feature] - category_mean) / category_std 
#     X[feature + '_Dev_' + category] = deviation_feature


# add_deviation_feature(insurance_df, 'bmi', 'smoker')

# Show the last rows of the current insurance_df.
insurance_df.tail()