## 函数集

In [None]:
# 数据统计
def df_miss_rate(df):
    """Count the missing values of every column and their share of all rows.

    One ``count()`` is issued for the whole frame and one more per column
    on ``df.select(col).dropna()``.

    :param df: DataFrame exposing ``count()``, ``columns`` and
               ``select(col).dropna()`` (presumably a Spark DataFrame)
    :return: pandas DataFrame with the columns ``col``, ``miss_cnt`` and
             ``miss_ratio``, sorted by ``miss_ratio`` in ascending order
    """
    total = df.count()
    rows = []
    for col in df.columns:
        present = df.select(col).dropna().count()
        # An empty frame has nothing missing; avoid dividing by zero.
        ratio = 1 - 1.0 * present / total if total else 0.0
        rows.append((col, total - present, ratio))
    rows.sort(key=lambda row: row[-1])
    return pd.DataFrame(rows, columns=['col', 'miss_cnt', 'miss_ratio'])

def df_col_value_distinct_cnt(df):
    """Count the distinct values held by each column.

    :param df: DataFrame exposing ``columns`` and
               ``select(col).distinct().count()`` (presumably a Spark DataFrame)
    :return: pandas DataFrame with the columns ``col`` and ``value_cnt``,
             sorted by ``value_cnt`` in ascending order
    """
    distinct_counts = {
        name: df.select(name).distinct().count() for name in df.columns
    }
    ordered = sorted(distinct_counts.items(), key=lambda pair: pair[1])
    return pd.DataFrame(ordered, columns=['col', 'value_cnt'])

def df_col_value_distribute(df, col, n=5):
    """Return the ``n`` most frequent values of ``col`` with their row counts.

    :param df: DataFrame to aggregate (presumably a Spark DataFrame)
    :param col: column (or columns) to group by
    :param n: number of rows to keep, taken from the highest counts
    :return: the result of ``toPandas()`` on the grouped counts, ordered by
             ``count`` descending
    """
    value_counts = df.select(col).groupby(col).count()
    top_values = value_counts.sort('count', ascending=False).limit(n)
    return top_values.toPandas()

### 衍生函数

In [2]:
from derive_package.ip import IPSeeker
from derive_package.ua_parser import user_agent_parser

def derive_ip_prefix(x, **kwargs):
    """Derive network prefixes of a dotted IPv4 address.

    For every prefix length the address is AND-ed with the matching netmask,
    e.g. ``192.168.1.77`` with length 24 gives ``192.168.1.0``.

    :param x: IPv4 address as a dotted string
    :param kwargs: ``prefix``: list of prefix lengths (0-32), default
                   ``[20, 22, 24]``
    :return: dict mapping ``prefix_<length>`` to the masked address.
             Every value is ``''`` when ``x`` is shorter than 5 characters and
             ``None`` when ``x`` is not a string, is not a valid IPv4 address
             or a prefix length is out of range.
    """
    len_list = kwargs.get('prefix', [20, 22, 24])
    keys = ['prefix_%d' % length for length in len_list]
    try:
        if len(x) < 5:
            return dict.fromkeys(keys, '')
        octets = [int(part) for part in x.split('.')]
        # Reject addresses that would otherwise yield a truncated or
        # meaningless prefix instead of failing.
        if len(octets) != 4 or any(o < 0 or o > 255 for o in octets):
            raise ValueError('not an IPv4 address: %r' % (x,))
        ret = dict()
        for key, length in zip(keys, len_list):
            if not 0 <= length <= 32:
                raise ValueError('invalid prefix length: %r' % (length,))
            mask = (1 << 32) - (1 << (32 - length))
            mask_bytes = [mask >> 24, (mask >> 16) & 255, (mask >> 8) & 255, mask & 255]
            ret[key] = '.'.join(str(o & m) for o, m in zip(octets, mask_bytes))
        return ret
    except (TypeError, ValueError, AttributeError):
        # Non-string or malformed input: every prefix is reported as missing.
        return dict.fromkeys(keys, None)

def derive_ip(x, **kwargs):
    """Derive the city and carrier of an IP address.

    :param x: IP address handed to ``IPSeeker.DeriveIp(path).derive``
    :param kwargs: ``path``: argument passed to ``IPSeeker.DeriveIp``
                   (presumably the location of the IP database)
    :return: dict with the keys ``city`` and ``carrier``; both are ``None``
             when ``x`` is empty or the lookup fails
    """
    if not x:
        return {'city': None, 'carrier': None}
    deriveip = IPSeeker.DeriveIp(kwargs.get('path'))
    try:
        city, carrier = deriveip.derive(x)
    except Exception:
        # Best-effort lookup: a failed derivation yields empty fields, while
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return {'city': None, 'carrier': None}
    return {'city': city, 'carrier': carrier}
        
    
def derive_ua(x, **kwargs):
    """Derive device, OS and browser families from a user-agent string.

    :param x: user-agent string passed to ``user_agent_parser.Parse``
    :param kwargs: accepted for a uniform derive-function signature; unused
    :return: dict with the keys ``device_family``, ``os_family`` and
             ``ua_family``; all values are ``None`` when ``x`` is empty
    """
    families = {"device_family": None,
                "os_family": None,
                "ua_family": None}
    if x:
        parsed = user_agent_parser.Parse(x)
        families["device_family"] = parsed['device']['family']
        families["os_family"] = parsed['os']['family']
        families["ua_family"] = parsed['user_agent']['family']
    return families
    
def df_derive(df, col, derive_func, **kwargs):
    """
    Derive several new columns from one column of a Spark DataFrame.

    ``derive_func(None, **kwargs)`` is called once to learn the derived keys.
    A UDF then joins the derived values (each formatted with ``"%s"``, so
    ``None`` becomes the text ``None``) with the ``\\u0001`` separator, and
    the joined string is split back into one column per key.

    :param df: Spark DataFrame
    :param col: name of the source column
    :param derive_func: callable ``f(value, **kwargs)`` returning a dict
    :param kwargs: forwarded to ``derive_func``
    :return: DataFrame with one extra column ``<col>_<key>`` per derived key;
             the helper columns ``derive`` and ``splitcol`` are dropped
    """
    derive_cols = sorted(derive_func(None, **kwargs).keys())
    separator = u'\u0001'

    def join_derived(value):
        derived = derive_func(value, **kwargs)
        return separator.join(["%s" % derived[name] for name in derive_cols])

    derive_udf = functions.udf(join_derived, StringType())
    df_new = df.withColumn('derive', derive_udf(col))
    df_new = df_new.withColumn("splitcol", functions.split(functions.col("derive"), separator))
    for position, name in enumerate(derive_cols):
        df_new = df_new.withColumn(col + '_' + name, functions.col("splitcol").getItem(position))
    return df_new.drop('splitcol').drop('derive')

### 特征函数

In [None]:
import numpy as np
from collections import Counter
from scipy.stats import pearsonr, spearmanr, kendalltau, describe
import math
import bisect
import pandas as pd
import time

In [16]:
# 统计函数

common_default_value = -1e8  # default value for numeric features

def category_stats(event_list, schema, **args):
    """
    Statistical features of a categorical / discrete field.

    :param event_list: list of events, each indexable by column position
    :param schema: list of column names giving the position of every column
    :param args: ``col``: name of the column to describe; all keyword
                 arguments are forwarded to ``cat_stat``
    :return: the dict produced by ``cat_stat``, prefixed with the column name
    """
    feature_col = args['col']
    if not event_list:
        return cat_stat([], feature_col, **args)
    feature_col_idx = schema.index(feature_col)
    # Only missing values (None and '') are skipped; falsy but real values
    # such as 0 or False are kept.
    values = [x[feature_col_idx] for x in event_list
              if x[feature_col_idx] is not None and x[feature_col_idx] != '']

    return cat_stat(values, feature_col, **args)

def number_stats(event_list, schema, **args):
    """
    Statistical features of a continuous numeric field.

    :param event_list: list of events, each indexable by column position
    :param schema: list of column names giving the position of every column
    :param args: ``col``: name of the column to describe
    :return: the dict produced by ``num_stat``, prefixed with the column name
    """
    feature_col = args['col']
    if not event_list:
        return num_stat([], feature_col)
    feature_col_idx = schema.index(feature_col)
    # Only missing values (None and '') are skipped; a numeric 0 is a real
    # observation and must take part in the statistics.
    values = [float(x[feature_col_idx]) for x in event_list
              if x[feature_col_idx] is not None and x[feature_col_idx] != '']

    return num_stat(values, feature_col)

def num_stat(ls, feature_name=None):
    """
    Statistical features of a numeric vector.

    Computed statistics: Avg, Std (sample standard deviation), Max, Min, Sum,
    Quar10/25/50/75/90 (percentiles), Ptp (max - min), Cv (coefficient of
    variation), Skew, Kur (kurtosis), MeanAbsDev (mean absolute deviation)
    and Iqr (Quar75 - Quar25).

    Every statistic is ``common_default_value`` for empty input. Std, Cv,
    Skew and Kur also fall back to it when they are undefined (NaN/inf),
    e.g. the standard deviation of a single observation.

    :param ls: iterable of numbers to describe (list, tuple, dict view,
               numpy array, ...) or None
    :param feature_name: string, optional prefix; keys become ``<prefix>_<stat>``
    :return: dict of statistic name to value rounded to 5 decimals,
             e.g. {'feature1_Avg': 1.0, 'feature1_Std': 0.23}
    """
    percentiles = [10, 25, 50, 75, 90]
    stat_names = (['Avg', 'Std', 'Max', 'Min', 'Sum']
                  + ['Quar%s' % p for p in percentiles]
                  + ['Ptp', 'Cv', 'Skew', 'Kur', 'MeanAbsDev', 'Iqr'])

    def defined(value):
        # Undefined statistics are reported with the shared default value.
        return value if np.isfinite(value) else common_default_value

    # Materialise the input so that truthiness and np.array behave the same
    # for lists, dict views and numpy arrays.
    values = [] if ls is None else list(ls)

    if not values:
        stats = dict.fromkeys(stat_names, common_default_value)
    else:
        arr = np.array(values)
        desc = describe(arr)
        cnt = desc.nobs
        std = np.sqrt(desc.variance)  # NaN when there is a single observation
        stats = {
            'Avg': desc.mean,
            'Std': defined(std),
            'Max': desc.minmax[1],
            'Min': desc.minmax[0],
            'Sum': desc.mean * cnt,
            'Ptp': desc.minmax[1] - desc.minmax[0],
            'Skew': defined(desc.skewness),
            'Kur': defined(desc.kurtosis),
            'MeanAbsDev': np.abs(arr - desc.mean).sum() / cnt,
        }
        # The coefficient of variation only makes sense with a real Std;
        # never derive it from the default placeholder.
        stats['Cv'] = defined(1.0 * std / (np.abs(desc.mean) + 1e-8))
        for p, value in zip(percentiles, np.percentile(arr, percentiles)):
            stats['Quar%s' % p] = value
        stats['Iqr'] = stats['Quar75'] - stats['Quar25']

    if feature_name:
        return dict(('%s_%s' % (feature_name, name), float('%.5f' % stats[name]))
                    for name in stat_names)

    return dict((name, float('%.5f' % stats[name])) for name in stat_names)


def cat_stat(ls, feature_name=None, **kwargs):
    """
    Statistical features of a categorical vector.

    The values are reduced to a frequency table. The result holds the number
    of distinct values (``catstat_Cnt``), the entropy and Gini impurity of
    the frequencies, and the ``num_stat`` statistics of the frequencies under
    the ``catstat_`` prefix. For empty input Cnt/Entropy/Gini are -1 and the
    numeric statistics carry their defaults.

    :param ls: list, values to describe
    :param feature_name: string, optional prefix; keys become ``<prefix>_<stat>``
    :param kwargs: accepted for compatibility (``no_set`` / ``no_mode``);
                   currently ignored
    :return: dict of feature name to feature value
    """
    ret = {}
    ret['catstat_Cnt'] = -1
    ret['catstat_Entropy'] = -1
    ret['catstat_Gini'] = -1
    histo_values = []

    if ls:
        _, value_cnt, _, histo = counter(ls)
        # A real list is required: on Python 3 ``dict.values()`` is a view
        # that numpy cannot turn into a numeric array.
        histo_values = list(histo.values())
        ret['catstat_Cnt'] = value_cnt
        ret['catstat_Entropy'] = get_entropy(histo_values)
        ret['catstat_Gini'] = get_gini(histo_values)

    ret.update(num_stat(histo_values, 'catstat'))

    if feature_name:
        return dict(('%s_%s' % (feature_name, key), value) for key, value in ret.items())

    return ret

def counter(arr):
    """
    Compute the distinct values, their number, the mode(s) and the histogram
    of a sequence.

    :param arr: sequence of hashable values
    :return: tuple ``(value_set, value_cnt, most_common_values, histo)``:
             the set of distinct values, how many there are (-1 for empty
             input), the set of values sharing the highest frequency, and a
             dict mapping each value to its frequency
    """
    if not arr:
        return set(), -1, set(), dict()

    frequencies = Counter(arr)
    highest = max(frequencies.values())
    most_common_values = set(
        value for value, cnt in frequencies.items() if cnt == highest
    )
    # Return a plain dict, as callers only rely on the mapping interface.
    return set(frequencies), len(frequencies), most_common_values, dict(frequencies)


def get_entropy(nums):
    """
    Compute the information entropy (natural logarithm) of a list of counts.

    Each count is turned into a probability ``count / sum(nums)``;
    probabilities not above 1e-5 are skipped.

    Tips: a sequence mixing negative and positive numbers can have a
    non-positive total, for which ``math.log`` would fail with a math domain
    error. In that case, and for empty input, ``common_default_value`` is
    returned.

    :param nums: collection of counts
    :return: entropy rounded to 5 decimals, or ``common_default_value``
    """
    if not nums:
        return common_default_value

    total = sum(nums)
    if total <= 0.0:
        return common_default_value

    weighted_log_sum = 0.0
    for count in nums:
        share = 1.0 * count / total
        if share > 1e-5:
            weighted_log_sum += share * math.log(share)

    # Flip the sign only for a non-zero sum so that 0.0 never becomes -0.0.
    entropy = weighted_log_sum if weighted_log_sum == 0.0 else -weighted_log_sum
    return float('%.5f' % entropy)


def get_gini(counts):
    """
    Compute the Gini impurity ``1 - sum(p_i ** 2)`` of a list of counts.

    :param counts: collection of counts
    :return: Gini impurity rounded to 5 decimals, or ``common_default_value``
             when ``counts`` is empty or does not sum to a positive number
    """
    if not counts:
        return common_default_value
    counts_sum = sum(counts)
    # Same guard as get_entropy: without a positive total the proportions
    # are undefined (and a zero total would divide by zero).
    if counts_sum <= 0:
        return common_default_value
    squared_shares = 0.0
    for count in counts:
        squared_shares += (1.0 * count / counts_sum) ** 2

    return float('%.5f' % (1 - squared_shares))


In [None]:
# 时间特征
def get_time_delta_list(time_list, attr_name='second'):
    """
    Compute the differences between consecutive entries of a time series.

    :param time_list: sequence of timestamps (presumably in seconds)
    :param attr_name: unit of the result: 'second' returns the raw
                      differences, 'minute', 'hour' and 'day' divide them
                      accordingly; any other value also returns the raw
                      differences
    :return: list with ``len(time_list) - 1`` differences (empty for fewer
             than two timestamps)
    """
    deltas = [time_list[pos] - time_list[pos - 1]
              for pos in range(1, len(time_list))]

    # Divisors applied one after another, so results match a chained division.
    divisors_by_unit = {
        'minute': (60,),
        'hour': (60, 60),
        'day': (60, 60, 24),
    }
    divisors = divisors_by_unit.get(attr_name)
    if divisors is None:
        return deltas

    scaled = []
    for delta in deltas:
        value = 1.0 * delta
        for divisor in divisors:
            value = value / divisor
        scaled.append(value)
    return scaled

def timedelta_stat(event_list, schema, **args):
    """
    Compute the gaps between consecutive events, then the numeric statistics
    of those gaps.

    :param event_list: list of events, each indexable by column position
    :param schema: list of column names giving the position of every column
    :param args: ``col``: name of the timestamp column (a float timestamp as
                 produced by mktime, per the original note);
                 ``unit``: unit of the gaps, default 'second'
    :return: the ``num_stat`` dict prefixed with ``timedelta_stat_unit_<unit>``
    """
    col_timestamp = args.get('col')
    ind_timestamp = schema.index(col_timestamp)
    unit = args.get('unit', 'second')

    timestamps = [event[ind_timestamp] for event in event_list]
    gaps = get_time_delta_list(timestamps, unit)
    return num_stat(gaps, 'timedelta_stat_unit_%s' % unit)

In [None]:
# 类别特征
def cat_set_feature(events, schema, **args):
    """
    Use the distinct values of a categorical column directly as a feature.

    :param events: list of events, each indexable by column position
    :param schema: list of column names giving the position of every column
    :param args: ``col``: name of the column
    :return: dict with the single key ``set_feature_<col>`` holding the list
             of distinct values (in no particular order; empty for no events)
    """
    feature_col = args['col']
    feature_key = "set_feature_%s" % feature_col
    if not events:
        return {feature_key: []}
    ind_col = schema.index(feature_col)
    distinct_values = set(event[ind_col] for event in events)
    return {feature_key: list(distinct_values)}

def cat_mode_feature(events, schema, **args):
    """
    Mode (most frequent value) of a categorical column.

    Note: events are assumed to be sorted chronologically. When several
    values share the highest frequency, the one occurring last is returned.

    :param events: list of events, each indexable by column position
    :param schema: list of column names giving the position of every column
    :param args: ``col``: name of the column
    :return: dict with the single key ``mode_feature_<col>``; the value is
             ``None`` for no events and ``""`` when no mode can be matched
    """
    feature_col = args['col']
    feature_key = "mode_feature_%s" % feature_col
    if not events:
        return {feature_key: None}

    ind_col = schema.index(feature_col)
    column_values = [event[ind_col] for event in events]
    modes = pd.Series(column_values).mode().tolist()
    if len(modes) == 1:
        return {feature_key: modes[0]}

    # Tie (or no mode at all): walk backwards to pick the latest candidate.
    candidates = set(modes)
    for event in reversed(events):
        if event[ind_col] in candidates:
            return {feature_key: event[ind_col]}
    return {feature_key: ""}

In [None]:
# 其他函数
def add_prefix_on_dict(d, prefix, delim='_'):
    """
    Return a copy of ``d`` whose keys are prefixed with ``prefix`` and ``delim``.

    :param d: dict to copy
    :param prefix: text placed in front of every key
    :param delim: separator between prefix and key, default '_'
    :return: new dict with keys ``<prefix><delim><key>`` and the same values
    """
    prefixed = {}
    for key, value in d.items():
        prefixed["%s%s%s" % (prefix, delim, key)] = value
    return prefixed

In [None]:
# 评估函数
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.utils.fixes import signature

class ClassificationEvaluate():
    """
    Evaluation plots for a classification model.

    Both methods draw with ``plt`` on a fresh figure and end with
    ``plt.show()``; they return nothing.
    """

    def precision_recall(self, y_trues, scores):
        """
        Plot the precision-recall curve and show its area in the legend.

        :param y_trues: true labels, same length as ``scores``
        :param scores: predicted scores
        """
        assert len(y_trues)==len(scores)
        precision, recall, _ = precision_recall_curve(y_trues, scores)
        pr_auc = auc(recall, precision)
        # Pass `step` to fill_between only when its signature accepts it.
        step_kwargs = ({'step': 'post'}
               if 'step' in signature(plt.fill_between).parameters
               else {})
        # Start a new figure, as roc() does, so the curve is never drawn on
        # top of a figure that is still current.
        plt.figure()
        plt.step(recall, precision, color='b', alpha=0.2, where='post', label='AUC = %0.5f' % pr_auc)
        plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.legend(loc="lower right")
        plt.title('Precision-Recall Curve')
        plt.show()

    def roc(self, y_trues, scores):
        """
        Plot the ROC curve, the diagonal reference line and the AUC legend.

        :param y_trues: true labels, same length as ``scores``
        :param scores: predicted scores
        """
        assert len(y_trues)==len(scores)
        fpr, tpr, _ = roc_curve(y_trues, scores)
        roc_auc = auc(fpr, tpr)
        plt.figure()
        lw = 2
        plt.plot(fpr, tpr, color='darkorange',
                 lw=lw, label='AUC = %0.5f' % roc_auc)
        # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic Curve')
        plt.legend(loc="lower right")
        plt.show()