In [2]:
# 参考:
# - https://github.com/ShaoQiBNU/GAUC 

import bisect
import random

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

# Build a toy dataset.
# A dedicated, seeded generator makes the sample (and every metric computed from it)
# reproducible after a kernel restart, without touching the global `random` state.
SEED = 42
N_SAMPLES = 30
rng = random.Random(SEED)

a = [rng.randint(0, 9) for _ in range(N_SAMPLES)]    # user ids, integers in [0, 9]
b = [rng.randint(0, 1) for _ in range(N_SAMPLES)]    # binary label, 0 or 1
c = [rng.random() for _ in range(N_SAMPLES)]         # predicted score in [0, 1)
d = [rng.random() * 10 for _ in range(N_SAMPLES)]    # continuous label in [0, 10)

# `label` can stand for click / no-click; `label2` can stand for a continuous target such as ts.
data = {"user_id": a, "label": b, "prob": c, "label2": d}
df = pd.DataFrame(data)
df

Unnamed: 0,user_id,label,prob,label2
0,9,0,0.376358,8.895656
1,5,1,0.83935,1.724401
2,1,0,0.784684,4.530036
3,8,1,0.581053,7.693924
4,4,1,0.445031,8.9561
5,0,0,0.286642,5.436928
6,2,0,0.319409,5.640808
7,5,1,0.67236,7.440818
8,1,1,0.347288,0.268131
9,4,1,0.319993,0.928787


In [3]:
# Grouping strategy: in search, samples are usually grouped by query page view (grouping by
# distinct query or distinct user leaves the groups too sparse); in recommendation, samples
# are usually grouped by distinct user.

def gauc(df, is_spearmanr = True):
    """Group AUC: the sample-count-weighted mean of per-group AUC values.

    Rows are grouped by the ``user_id`` column. Inside each group the label is
    read from the column at position 1 and the predicted score from the column
    at position 2, so the caller must pass the columns in that order.

    Args:
        df: DataFrame with a ``user_id`` column, the label at column position 1
            and the predicted score at column position 2.
        is_spearmanr: Despite its name, this flag does not compute a Spearman
            correlation. When True, each group's AUC comes from
            ``roc_auc_score`` (binary labels, e.g. click). When False, it comes
            from ``auc_custom`` (non-binary labels, e.g. ts).

    Returns:
        The weighted average of the per-group AUCs, each weighted by the number
        of rows in its group. Returns NaN when no group is usable (empty frame,
        or every group has a single row or a single distinct label).
    """
    weighted_auc_sum = 0.0
    weight_sum = 0
    for _, group in df.groupby('user_id'):
        labels = group.iloc[:, 1]
        scores = group.iloc[:, 2]
        # Use the row count: DataFrame.size is rows * columns, not the number of samples.
        n_samples = len(group)

        # A group with one sample, or with only one distinct label, has no
        # comparable pair, so its AUC is undefined and the group is dropped.
        if n_samples <= 1 or labels.nunique(dropna=False) <= 1:
            continue

        if is_spearmanr:
            # Ordinary binary-classification AUC, e.g. for click.
            group_auc = roc_auc_score(labels, scores)
        else:
            # Pairwise AUC for non-binary labels, e.g. for ts.
            _, group_auc = auc_custom(labels.tolist(), scores.tolist())

        # Weight = number of samples in the group. Other weights are possible,
        # e.g. the group's summed ts for a ts-weighted GAUC.
        weighted_auc_sum += n_samples * group_auc
        weight_sum += n_samples

    if weight_sum == 0:
        # Every group was skipped: the metric is undefined rather than a division error.
        return float("nan")
    return weighted_auc_sum / weight_sum

# Custom xAUC = number of correctly ordered pairs / total number of pairs.
def auc_custom(labels, probs):
    """Pairwise ranking score for labels that are not binary.

    The labels are reordered by descending predicted score, then
    ``reversePairs`` counts the pairs in which the earlier (higher-scored)
    label is strictly greater than the later one. That count is divided by the
    number of all unordered pairs, ``n * (n - 1) / 2``.

    Pairs whose labels are equal never count towards the numerator but are
    still part of the denominator. Samples with equal scores keep their input
    order, because the sort is stable and keyed on the score only.

    Args:
        labels: Sequence of ground-truth values.
        probs: Predicted scores, one per label.

    Returns:
        ``(pos_num, auc)``: the count of correctly ordered pairs and that count
        divided by the total number of pairs. With fewer than two samples there
        are no pairs, and ``(0, nan)`` is returned.

    Raises:
        ValueError: If ``labels`` and ``probs`` differ in length.
    """
    probs = list(probs)
    n = len(labels)
    if n != len(probs):
        # zip() would silently truncate while the denominator still used len(labels).
        raise ValueError(
            "labels and probs must have the same length, got %d and %d" % (n, len(probs))
        )
    if n < 2:
        # No pair exists, so the ratio is undefined (avoids dividing by zero).
        return 0, float("nan")

    scored = sorted(zip(probs, labels), key=lambda pair: pair[0], reverse=True)
    labels_by_score_desc = [label for _, label in scored]

    # Strict inversions in this sequence are exactly the pairs where the
    # higher-scored sample also has the larger label.
    pos_num = reversePairs(labels_by_score_desc)

    auc = pos_num / (n * (n - 1) / 2)

    return pos_num, auc

# Count inversions by binary-searching a sorted prefix of the elements seen so far.
# Each search is O(log n); the list insertion shifts elements, so the worst case is O(n^2).
def reversePairs(data):
    """Count the strict inversions in ``data``.

    An inversion is a pair of positions ``i < j`` with ``data[i] > data[j]``.
    Equal elements do not form an inversion.

    Args:
        data: Iterable of mutually comparable values. It is not modified.

    Returns:
        The number of inversions as an int (0 for empty or single-element input).
    """
    ans = 0
    prefix = []  # elements seen so far, kept in ascending order
    for value in data:
        # First position whose element is strictly greater than `value`;
        # everything from there to the end of `prefix` forms an inversion with it.
        pos = bisect.bisect_right(prefix, value)
        ans += len(prefix) - pos
        prefix.insert(pos, value)

    return ans

In [4]:
# GAUC for the binary `label` column; with the default is_spearmanr=True each group is scored by roc_auc_score.
gauc(df[["user_id", "label", "prob"]])

0.7058823529411765

In [5]:
# GAUC for the continuous `label2` column; is_spearmanr=False routes each group through auc_custom.
gauc(df[["user_id", "label2", "prob"]], is_spearmanr = False)

0.5044444444444445