# An example of a use of groupby apply to apply a model to a set of individual groups

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

In [2]:
X, y, coef = make_regression(n_samples=100, 
                             n_features=10, 
                             n_informative=5, 
                             n_targets=1, 
                             bias=0.0, 
                             effective_rank=None, 
                             tail_strength=0.5, 
                             noise=100, 
                             shuffle=True, 
                             coef=True, 
                             random_state=42)

In [3]:
pd.DataFrame(coef, index=[f"feat_{x}" for x in range(0, coef.shape[0])]).T

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9
0,16.748258,0.0,0.0,63.643025,0.0,70.647573,0.0,10.456784,3.158614,0.0


In [4]:
df = (pd.DataFrame(X, columns=[f"feat_{x}" for x in range(0, X.shape[1])])
.merge(pd.DataFrame(y, columns=["target"]),
      left_index=True,
      right_index=True))

In [6]:
df.shape

(100, 11)

In [8]:
df["group"] = np.repeat(range(0, 10), 10)

In [12]:
df.head()

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,target,group
0,-0.92693,-1.430141,1.632411,-3.241267,-1.247783,-1.024388,0.130741,-0.059525,-0.252568,-0.440044,-186.494628,0
1,0.202923,0.334457,0.285865,1.547505,-0.387702,1.795878,2.010205,-1.515744,-0.612789,0.658544,191.976107,0
2,-0.241236,0.456753,0.342725,-1.251539,1.117296,1.443765,0.447709,0.352055,-0.082151,0.569767,315.503594,0
3,0.289775,-1.008086,-2.038125,0.871125,-0.408075,-0.326024,-0.351513,2.075401,1.201214,-1.870792,100.185659,0
4,-0.007973,-0.190339,-1.037246,0.077368,0.53891,-0.861284,-1.3828,1.479944,1.523124,-0.875618,-40.81308,0


In [32]:
def my_super_cool_function(df, feat_list, target):
    df = pd.DataFrame(MinMaxScaler().fit_transform(df), index=df.index, columns = df.columns)
    
    X = df[feat_list]
    y = df[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
    
    model = LinearRegression()
    _ = model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    return X_test.assign(y_pred = y_pred, target=y_test)

In [28]:
features_list = (df
                 .filter(regex="feat")
                 .columns
                 .tolist()
                )

In [29]:
my_super_cool_function(df[df["group"]==0], feat_list = features_list, target = "target")

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,y_pred,target,group
1,0.544988,0.935186,0.633147,1.0,0.363659,1.0,1.0,0.127226,0.130375,0.829266,0.659026,0.753928,0.0
3,0.5769,0.223677,0.0,0.858757,0.355044,0.302377,0.303945,1.0,0.771786,0.0,0.391783,0.571078,0.0


In [50]:
df_results = (df
 .groupby("group")
 .apply(my_super_cool_function, features_list, "target")
)

In [69]:
# (df
#  .groupby("group")
# .describe())

In [55]:
def magnify():
    return [dict(selector="th",
                 props=[("font-size", "8pt")]),
            dict(selector="td",
                 props=[('padding', "0em 0em")]),
            dict(selector="th:hover",
                 props=[("font-size", "12pt")]),
            dict(selector="tr:hover td:hover",
                 props=[('max-width', '200px'),
                        ('font-size', '12pt')])
]

In [68]:
np.random.seed(25)
cmap = cmap=sns.diverging_palette(5, 250, as_cmap=True)

(df_results[["y_pred", "target"]]
 .assign(difference = lambda x: x.diff(axis=1)["target"])
 .abs()
 .style.background_gradient(cmap, axis=1)
    .set_caption("Hover to magnify")
    .set_table_styles(magnify()))

Unnamed: 0_level_0,Unnamed: 1_level_0,y_pred,target,difference
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5,0.167801,0.208114,0.040312
0,1,0.76428,0.753928,0.010352
1,19,0.666128,0.0,0.666128
1,12,0.700345,0.076113,0.624233
2,22,0.696564,0.0,0.696564
2,23,1.656731,0.247903,1.408828
3,36,0.062992,0.117258,0.18025
3,37,0.31892,0.393723,0.074803
4,43,0.559197,0.406571,0.152626
4,40,0.883836,0.019515,0.864321


In [42]:
# _ = plt.figure(figsize=(30, 10))
# _ = sns.heatmap(df
#                 .drop("target", axis=1)
#  .groupby("group")
# .describe())