In [1]:
"""
特征工程Demo -- 把一个主体（人或卡片或其他）的若干条事件groupby到一起，计算出若干个特征

数据说明：这是一份人工生成的通话记录数据，记录了一些用户在一段时间内的通话行为

字段说明：
user_id--用户唯一标识
event_type--事件类型
start_time--通话开始时间
calling_duration--通话持续时长，单位是秒
phone--对方电话号码
phone_location--对方地址

"""

'\n\xe7\x89\xb9\xe5\xbe\x81\xe5\xb7\xa5\xe7\xa8\x8bDemo -- \xe6\x8a\x8a\xe4\xb8\x80\xe4\xb8\xaa\xe4\xb8\xbb\xe4\xbd\x93\xef\xbc\x88\xe4\xba\xba\xe6\x88\x96\xe5\x8d\xa1\xe7\x89\x87\xe6\x88\x96\xe5\x85\xb6\xe4\xbb\x96\xef\xbc\x89\xe7\x9a\x84\xe8\x8b\xa5\xe5\xb9\xb2\xe6\x9d\xa1\xe4\xba\x8b\xe4\xbb\xb6groupby\xe5\x88\xb0\xe4\xb8\x80\xe8\xb5\xb7\xef\xbc\x8c\xe8\xae\xa1\xe7\xae\x97\xe5\x87\xba\xe8\x8b\xa5\xe5\xb9\xb2\xe4\xb8\xaa\xe7\x89\xb9\xe5\xbe\x81\n\n\xe6\x95\xb0\xe6\x8d\xae\xe8\xaf\xb4\xe6\x98\x8e\xef\xbc\x9a\xe8\xbf\x99\xe6\x98\xaf\xe4\xb8\x80\xe4\xbb\xbd\xe4\xba\xba\xe5\xb7\xa5\xe7\x94\x9f\xe6\x88\x90\xe7\x9a\x84\xe9\x80\x9a\xe8\xaf\x9d\xe8\xae\xb0\xe5\xbd\x95\xe6\x95\xb0\xe6\x8d\xae\xef\xbc\x8c\xe8\xae\xb0\xe5\xbd\x95\xe4\xba\x86\xe4\xb8\x80\xe4\xba\x9b\xe7\x94\xa8\xe6\x88\xb7\xe5\x9c\xa8\xe4\xb8\x80\xe6\xae\xb5\xe6\x97\xb6\xe9\x97\xb4\xe5\x86\x85\xe7\x9a\x84\xe9\x80\x9a\xe8\xaf\x9d\xe8\xa1\x8c\xe4\xb8\xba\n\n\xe5\xad\x97\xe6\xae\xb5\xe8\xaf\xb4\xe6\x98\x8e\xef\xbc\x9a\nuser_id--\xe

In [2]:
# 拓宽notebook
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
import json
import time
import numpy as np
import math
from scipy.stats import describe

import pandas as pd
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns', None)

In [4]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, StorageLevel
from pyspark.sql.types import *
from pyspark.sql import functions

conf = SparkConf().setMaster("yarn-client").setAppName("feature multi")  # 集群模式
# conf = SparkConf().setMaster("local[*]").setAppName("feature multi") # local模式
conf.set("spark.executor.instances", 10)
conf.set("spark.executor.memory", "5g")
conf.set("spark.executor.cores","1")
conf.set("spark.driver.memory", "5g")

spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [5]:
# 读数据
df = spark.read.parquet('/data/fresh_train/df_feature_multi_events')

In [6]:
# 数据总量
df.count()

16520084

In [7]:
# user_id数量
df.select("user_id").drop_duplicates().count()

14760

In [8]:
# 预览几条数据
df.limit(5).toPandas()

Unnamed: 0,user_id,event_type,start_time,calling_duration,phone,phone_location
0,32138179,phone_conversation,2017-12-20 10:05:50,34,15170527152,江西-宜春市
1,28925593,phone_conversation,2017-11-20 09:59:45,28,13529806351,云南-红河哈尼族彝族自治州
2,31977400,phone_conversation,2018-03-09 09:55:34,94,15278494900,广西-梧州市
3,2331449,phone_conversation,2018-01-23 14:39:30,22,15060042522,福建-福州市
4,2472569,phone_conversation,2018-04-15 17:45:27,18,13731274261,河北-保定市


In [9]:
# 新增一列：年月，并统计这一列的分布
df.withColumn("time_date", functions.col("start_time").substr(0, 7)
              ).select("time_date").rdd.map(lambda x: x[0]).countByValue()

defaultdict(int,
            {u'': 236,
             u'2017-01': 329,
             u'2017-02': 138,
             u'2017-03': 85,
             u'2017-04': 123,
             u'2017-05': 401,
             u'2017-06': 125,
             u'2017-07': 134,
             u'2017-08': 780,
             u'2017-09': 10040,
             u'2017-10': 508272,
             u'2017-11': 2950767,
             u'2017-12': 3111841,
             u'2018-01': 2951923,
             u'2018-02': 2842726,
             u'2018-03': 2903025,
             u'2018-04': 1239139})

### 任务1：把user_id为31977400的用户的所有事件collect回来，保存成events和schema
events是一个list的list，内层的每个list表示一条事件; schema是字段列表，即 df.columns

In [10]:
events = df.filter(functions.col("user_id")=="31977400").rdd.map(list).collect()
schema = df.columns

In [11]:
len(events), schema

(155,
 ['user_id',
  'event_type',
  'start_time',
  'calling_duration',
  'phone',
  'phone_location'])

In [12]:
events[:3]

[[u'31977400',
  u'phone_conversation',
  u'2018-03-09 09:55:34',
  94,
  u'15278494900',
  u'\u5e7f\u897f-\u68a7\u5dde\u5e02'],
 [u'31977400',
  u'phone_conversation',
  u'2018-03-03 07:14:40',
  19,
  u'18277497807',
  u'\u5e7f\u897f-\u68a7\u5dde\u5e02'],
 [u'31977400',
  u'phone_conversation',
  u'2018-03-11 20:19:23',
  13,
  u'15278494900',
  u'\u5e7f\u897f-\u68a7\u5dde\u5e02']]

### 任务2：写一个函数，计算一个数值序列的各种统计值，包括但不限于 avg std max min 分位数
输入：数值list ；输出：特征dict，每个特征是一个key

In [30]:
def num_stat(ls):
    """ 数值序列的特征函数 """
    ret = dict()
    ret['Avg'] = -1
    ret['Std'] = -1
    ret['Max'] = -1
    ret['Min'] = -1
    ret['Sum'] = -1
    perc = [25, 50, 75]
    for i, p in enumerate(perc):
        ret['Quar%s' % p] = -1
    ret['Iqr'] = -1
    
    if ls:
        arr = np.array(ls)
        desc = describe(arr)
        cnt = desc.nobs

        ret['Avg'] = desc.mean
        if not math.isnan(desc.variance):
            ret['Std'] = np.sqrt(desc.variance)
        else:
            ret["Std"] = -1
        ret['Max'] = desc.minmax[1]
        ret['Min'] = desc.minmax[0]
        ret['Sum'] = desc.mean * cnt
        perc = [25, 50, 75]
        perc_values = np.percentile(arr, perc)
        for i, p in enumerate(perc):
            ret['Quar%s' % p] = perc_values[i]
        ret['Iqr'] = ret['Quar%s' % 75] - ret['Quar%s' % 25]
        
    return dict([(x, float(y)) for x,y in ret.items()])

In [33]:
num_stat([1,2,3,4])

{'Avg': 2.5,
 'Iqr': 1.5,
 'Max': 4.0,
 'Min': 1.0,
 'Quar25': 1.75,
 'Quar50': 2.5,
 'Quar75': 3.25,
 'Std': 1.2909944487358056,
 'Sum': 10.0}

In [44]:
num_stat([])

{'Avg': -1.0,
 'Iqr': -1.0,
 'Max': -1.0,
 'Min': -1.0,
 'Quar25': -1.0,
 'Quar50': -1.0,
 'Quar75': -1.0,
 'Std': -1.0,
 'Sum': -1.0}

### 任务3：写一个函数，计算一个类别序列的信息熵、众数、取值个数以及histogram的数值统计值（利用任务2）

In [41]:
def cat_stat(ls):
    """ 类别序列的特征函数 """
    ret = dict()
    _ , value_cnt, most_common_values, histo = counter(ls)
    histo_values = histo.values()
    if most_common_values:
        ret['Mode'] = most_common_values.pop()
    else:
        ret['Mode'] = ''
    ret['Cnt'] = value_cnt
    ret['Entropy'] = get_entropy(histo_values)
    ret.update(num_stat(histo_values))
    
    return dict([("catstat_%s" % x, y) for x,y in ret.items()])


def counter(arr):
    """ 统计序列的直方图 """
    value_set = set()
    most_common_values = set()
    value_cnt = -1
    histo_values = []
    histo = dict()
    if not arr:
        return value_set, value_cnt, most_common_values, histo

    cnt_values_map = dict()  # 次数到值set的字典 为了一次遍历就得到众数的set
    cnt_values_map[0] = set(arr)  # 初始化
    most_commnt_cnt = 0  # 众数出现的次数
    for a in arr:
        value_set.add(a)
        if not a in histo:
            histo[a] = 1
        else:
            histo[a] = histo[a] + 1
        if not histo[a] in cnt_values_map:
            cnt_values_map[histo[a]] = set()
            cnt_values_map[histo[a]].add(a)
        else:
            cnt_values_map[histo[a]].add(a)
        if histo[a] > most_commnt_cnt:
            most_commnt_cnt = histo[a]
    most_common_values = cnt_values_map[most_commnt_cnt]
    value_cnt = len(value_set)
    return value_set, value_cnt, most_common_values, histo


def get_entropy(nums):
    """ 计算信息熵 """
    if not nums:
        return -1
    entro = 0.0
    total = sum(nums)
    if total <= 0.0:
        return -1
    for num in nums:
        p = 1.0 * num / total
        if p > 1e-5:
            entro += p * math.log(p)
    if entro != 0.0:
        entro = -entro
    return float('%.5f' % entro)

In [42]:
cat_stat(['a', 'b', 'a', 'c'])

{'catstat_Avg': 1.3333333333333333,
 'catstat_Cnt': 3,
 'catstat_Entropy': 1.03972,
 'catstat_Iqr': 0.5,
 'catstat_Max': 2.0,
 'catstat_Min': 1.0,
 'catstat_Mode': 'a',
 'catstat_Quar25': 1.0,
 'catstat_Quar50': 1.0,
 'catstat_Quar75': 1.5,
 'catstat_Std': 0.5773502691896257,
 'catstat_Sum': 4.0}

In [43]:
cat_stat([])

{'catstat_Avg': -1.0,
 'catstat_Cnt': -1,
 'catstat_Entropy': -1,
 'catstat_Iqr': -1.0,
 'catstat_Max': -1.0,
 'catstat_Min': -1.0,
 'catstat_Mode': '',
 'catstat_Quar25': -1.0,
 'catstat_Quar50': -1.0,
 'catstat_Quar75': -1.0,
 'catstat_Std': -1.0,
 'catstat_Sum': -1.0}

### 任务4：对 user_id为31977400 的用户，计算若干特征，用一个函数实现，其中包含若干子函数，每个子函数实现一类特征
#### 特征1：calling_duration字段的num_stat
#### 特征2：phone_location字段的cat_stat
#### 特征3：start_time字段按从小到大排序，计算相邻事件的时间差(单位是秒)，然后对时间差计算num_stat
#### 特征4：start_time在凌晨0-5点的事件的数量和占比
#### 特征5：把相同phone的通话时长calling_duration相加，得到每个phone的总通话时长，然后对这个序列计算num_stat

In [45]:
def feature1(events, schema, col):
    ind = schema.index(col)
    ret = num_stat([x[ind] for x in events])
    return dict([("%s_%s" % (col, x), y) for x,y in ret.items()])

def feature2(events, schema, col):
    ind = schema.index(col)
    ret = cat_stat([x[ind] for x in events])
    return dict([("%s_%s" % (col, x), y) for x,y in ret.items()])

def feature3(events, schema, col):
    ind = schema.index(col)
    time_ls = sorted([time.mktime(pd.to_datetime(x[ind]).timetuple()) for x in events])
    diff_time_ls = []
    for i,t in enumerate(time_ls):
        if i:
            diff_time_ls.append(time_ls[i] - time_ls[i-1])
    ret = num_stat(diff_time_ls)
    return dict([("%s_%s" % (col, x), y) for x,y in ret.items()])

def feature4(events, schema, col):
    ind = schema.index(col)
    hour_ls = [pd.to_datetime(x[ind]).hour for x in events]
    ret = dict()
    ret["hour0_5_eventcnt"] = len([1 for x in hour_ls if 0<=x<=5])
    ret["hour0_5_ratio"] = 1.0*ret["hour0_5_eventcnt"]/len(events)
    return ret
    
def feature5(events, schema, col_agg, col_stat):
    ind_agg = schema.index(col_agg)
    ind_stat = schema.index(col_stat)
    agg_dict = dict()
    for e in events:
        if e[ind_agg] not in agg_dict:
            agg_dict[e[ind_agg]] = 0
        agg_dict[e[ind_agg]] += e[ind_stat]
    ret = num_stat(agg_dict.values())
    return dict([("%s_%s_%s" % (col_agg, col_stat, x), y) for x,y in ret.items()])
    
def features(events, schema):
    ret = dict()
    ret.update(feature1(events, schema, "calling_duration"))
    ret.update(feature2(events, schema, "phone_location"))
    ret.update(feature3(events, schema, "start_time"))
    ret.update(feature4(events, schema, "start_time"))
    ret.update(feature5(events, schema, "phone", "calling_duration"))
    return ret


In [46]:
feature1(events, schema, "calling_duration")

{'calling_duration_Avg': 139.44516129032257,
 'calling_duration_Iqr': 118.5,
 'calling_duration_Max': 2270.0,
 'calling_duration_Min': 2.0,
 'calling_duration_Quar25': 29.0,
 'calling_duration_Quar50': 70.0,
 'calling_duration_Quar75': 147.5,
 'calling_duration_Std': 252.06925820937994,
 'calling_duration_Sum': 21614.0}

In [47]:
feature2(events, schema, "phone_location")

{'phone_location_catstat_Avg': 12.916666666666666,
 'phone_location_catstat_Cnt': 12,
 'phone_location_catstat_Entropy': 1.20883,
 'phone_location_catstat_Iqr': 5.0,
 'phone_location_catstat_Max': 100.0,
 'phone_location_catstat_Min': 1.0,
 'phone_location_catstat_Mode': u'\u5e7f\u897f-\u68a7\u5dde\u5e02',
 'phone_location_catstat_Quar25': 1.0,
 'phone_location_catstat_Quar50': 2.0,
 'phone_location_catstat_Quar75': 6.0,
 'phone_location_catstat_Std': 28.474736277279305,
 'phone_location_catstat_Sum': 155.0}

In [48]:
feature3(events, schema, "start_time")

{'start_time_Avg': 62882.94155844156,
 'start_time_Iqr': 44745.75,
 'start_time_Max': 4150847.0,
 'start_time_Min': 35.0,
 'start_time_Quar25': 1026.0,
 'start_time_Quar50': 7036.5,
 'start_time_Quar75': 45771.75,
 'start_time_Std': 338446.91394544084,
 'start_time_Sum': 9683973.0}

In [49]:
feature4(events, schema, "start_time")

{'hour0_5_eventcnt': 0, 'hour0_5_ratio': 0.0}

In [50]:
feature5(events, schema, "phone", "calling_duration")

{'phone_calling_duration_Avg': 771.9285714285714,
 'phone_calling_duration_Iqr': 442.75,
 'phone_calling_duration_Max': 5988.0,
 'phone_calling_duration_Min': 9.0,
 'phone_calling_duration_Quar25': 69.75,
 'phone_calling_duration_Quar50': 169.5,
 'phone_calling_duration_Quar75': 512.5,
 'phone_calling_duration_Std': 1470.062304766087,
 'phone_calling_duration_Sum': 21614.0}

In [51]:
features(events, schema)

{'calling_duration_Avg': 139.44516129032257,
 'calling_duration_Iqr': 118.5,
 'calling_duration_Max': 2270.0,
 'calling_duration_Min': 2.0,
 'calling_duration_Quar25': 29.0,
 'calling_duration_Quar50': 70.0,
 'calling_duration_Quar75': 147.5,
 'calling_duration_Std': 252.06925820937994,
 'calling_duration_Sum': 21614.0,
 'hour0_5_eventcnt': 0,
 'hour0_5_ratio': 0.0,
 'phone_calling_duration_Avg': 771.9285714285714,
 'phone_calling_duration_Iqr': 442.75,
 'phone_calling_duration_Max': 5988.0,
 'phone_calling_duration_Min': 9.0,
 'phone_calling_duration_Quar25': 69.75,
 'phone_calling_duration_Quar50': 169.5,
 'phone_calling_duration_Quar75': 512.5,
 'phone_calling_duration_Std': 1470.062304766087,
 'phone_calling_duration_Sum': 21614.0,
 'phone_location_catstat_Avg': 12.916666666666666,
 'phone_location_catstat_Cnt': 12,
 'phone_location_catstat_Entropy': 1.20883,
 'phone_location_catstat_Iqr': 5.0,
 'phone_location_catstat_Max': 100.0,
 'phone_location_catstat_Min': 1.0,
 'phone_locati

### 任务5：对数据集中的每一个用户都计算上面的所有特征，生成特征数据，每一条特征数据是一个tuple (user_id, features_dict)

In [52]:
rdd = df.rdd.map(list).persist()
schema = df.columns

In [53]:
ind_userid = schema.index("user_id")
rdd_feature = rdd.map(lambda x: (x[ind_userid], x)
       ).groupByKey().map(lambda x: (x[0], list(x[1]))
                         ).map(lambda x: (x[0], features(x[1], schema))
                              )

# 注意：这一步计算会比较耗时，可以进入yarn的管理页面，点击你的任务的ApplicationMaster，查看spark任务的执行进度
# 如何进入yarn页面，请询问你的导师或模型组其他同学

In [54]:
rdd_feature.take(2)

[(u'14049961',
  {'calling_duration_Avg': 87.07551020408164,
   'calling_duration_Iqr': 60.0,
   'calling_duration_Max': 1402.0,
   'calling_duration_Min': 1.0,
   'calling_duration_Quar25': 25.0,
   'calling_duration_Quar50': 43.0,
   'calling_duration_Quar75': 85.0,
   'calling_duration_Std': 131.96205546784938,
   'calling_duration_Sum': 128001.00000000001,
   'hour0_5_eventcnt': 10,
   'hour0_5_ratio': 0.006802721088435374,
   'phone_calling_duration_Avg': 707.1878453038674,
   'phone_calling_duration_Iqr': 304.0,
   'phone_calling_duration_Max': 20917.0,
   'phone_calling_duration_Min': 3.0,
   'phone_calling_duration_Quar25': 48.0,
   'phone_calling_duration_Quar50': 108.0,
   'phone_calling_duration_Quar75': 352.0,
   'phone_calling_duration_Std': 2124.400853591509,
   'phone_calling_duration_Sum': 128001.00000000001,
   'phone_location_catstat_Avg': 38.68421052631579,
   'phone_location_catstat_Cnt': 38,
   'phone_location_catstat_Entropy': 1.43676,
   'phone_location_catstat_I

### 任务6：把上面的特征rdd转换成spark dataframe, 保存到你自己的hdfs路径里

In [55]:
def to_json(x):
    x[1].update({"user_id": x[0]})
    return x[1]

df_feature = spark.createDataFrame(rdd_feature.map(to_json))

In [56]:
df_feature.limit(10).toPandas()

Unnamed: 0,calling_duration_Avg,calling_duration_Iqr,calling_duration_Max,calling_duration_Min,calling_duration_Quar25,calling_duration_Quar50,calling_duration_Quar75,calling_duration_Std,calling_duration_Sum,hour0_5_eventcnt,hour0_5_ratio,phone_calling_duration_Avg,phone_calling_duration_Iqr,phone_calling_duration_Max,phone_calling_duration_Min,phone_calling_duration_Quar25,phone_calling_duration_Quar50,phone_calling_duration_Quar75,phone_calling_duration_Std,phone_calling_duration_Sum,phone_location_catstat_Avg,phone_location_catstat_Cnt,phone_location_catstat_Entropy,phone_location_catstat_Iqr,phone_location_catstat_Max,phone_location_catstat_Min,phone_location_catstat_Mode,phone_location_catstat_Quar25,phone_location_catstat_Quar50,phone_location_catstat_Quar75,phone_location_catstat_Std,phone_location_catstat_Sum,start_time_Avg,start_time_Iqr,start_time_Max,start_time_Min,start_time_Quar25,start_time_Quar50,start_time_Quar75,start_time_Std,start_time_Sum,user_id
0,52.972067,43.5,409.0,1.0,15.0,29.0,58.5,67.814555,9482.0,2,0.011173,163.482759,153.5,2194.0,5.0,11.0,31.5,164.5,339.528943,9482.0,19.888889,9,1.28093,8.0,79.0,1.0,山东-菏泽市,1.0,7.0,9.0,31.470003,179.0,75462.365169,85709.75,908993.0,21.0,5161.5,32351.0,90871.25,115428.974369,13432301.0,31906889
1,88.105578,49.0,3047.0,0.0,15.0,29.5,64.0,234.121818,88458.0,10,0.00996,1360.892308,305.0,28920.0,0.0,18.0,55.0,323.0,4170.465251,88458.0,77.230769,13,1.09365,4.0,460.0,1.0,广东-汕尾市,1.0,3.0,5.0,162.017054,1004.0,14987.258225,13861.5,204422.0,22.0,920.0,3914.0,14781.5,24589.745648,15032220.0,32120390
2,91.478958,71.0,1339.0,1.0,21.0,43.0,92.0,154.135382,45648.0,12,0.024048,447.529412,191.5,9235.0,4.0,25.5,69.5,217.0,1374.27268,45648.0,17.821429,28,1.74177,2.5,177.0,1.0,广西-梧州市,1.0,2.0,3.5,45.430495,499.0,26035.475904,39566.5,224897.0,19.0,1242.0,8172.0,40808.5,35916.887516,12965667.0,17290906
3,120.868421,109.25,798.0,2.0,29.25,68.0,138.5,139.360582,13779.0,5,0.04386,287.0625,298.25,2087.0,9.0,37.25,122.0,335.5,404.634608,13779.0,8.142857,14,1.73478,8.0,53.0,1.0,云南-曲靖市,1.0,1.5,9.0,14.217309,114.0,134912.964602,119365.0,2160116.0,49.0,1423.0,28285.0,120788.0,316055.363873,15245165.0,21179422
4,54.708502,35.5,665.0,3.0,15.0,25.0,50.5,85.172732,13513.0,1,0.004049,81.89697,32.0,3298.0,3.0,15.0,25.0,47.0,324.794825,13513.0,11.761905,21,1.39787,2.0,140.0,1.0,上海-上海,1.0,1.0,3.0,32.534451,247.0,58472.142276,79202.75,515259.0,20.0,5807.0,45495.0,85009.75,66885.48711,14384147.0,20527433
5,31.425532,26.5,170.0,7.0,12.5,22.0,39.0,30.114813,1477.0,2,0.042553,52.75,31.25,268.0,7.0,11.0,21.0,42.25,75.472156,1477.0,2.764706,17,1.93959,0.0,24.0,1.0,河南-焦作市,1.0,1.0,1.0,5.607243,47.0,313333.304348,338504.25,1401165.0,247.0,88362.0,258285.5,426866.25,311385.89872,14413332.0,25265079
6,76.838523,65.0,1510.0,1.0,25.0,47.0,90.0,99.392178,106114.0,34,0.02462,624.2,233.5,19199.0,2.0,26.25,75.0,259.75,2016.10033,106114.0,47.62069,29,0.92605,2.0,1083.0,1.0,江苏-徐州市,1.0,2.0,3.0,200.854143,1381.0,11171.846377,8759.0,165027.0,15.0,579.0,2508.5,9338.0,21154.831464,15417148.0,22071921
7,79.40814,60.25,1661.0,1.0,22.0,42.0,82.25,139.100812,68291.0,32,0.037209,331.509709,142.0,19629.0,5.0,29.0,66.5,171.0,1507.035885,68291.0,50.588235,17,0.7282,6.0,706.0,1.0,内蒙古-包头市,1.0,2.0,7.0,170.380845,860.0,16793.610012,18839.5,540520.0,10.0,639.5,4346.0,19479.0,30831.636424,14425711.0,24321334
8,77.828638,51.0,1647.0,1.0,22.0,36.0,73.0,168.301805,33155.0,3,0.007042,325.04902,102.5,6641.0,6.0,26.25,50.5,128.75,1005.698992,33155.0,32.769231,13,1.15114,10.0,257.0,1.0,广东-湛江市,1.0,3.0,11.0,74.07671,426.0,33570.821176,44535.0,344325.0,16.0,1148.0,7727.0,45683.0,54016.172834,14267599.0,23715594
9,102.983108,62.25,3031.0,1.0,18.0,34.0,80.25,261.737484,60966.0,5,0.008446,429.338028,149.5,21419.0,1.0,23.0,55.0,172.5,1978.473392,60966.0,21.142857,28,1.5584,5.25,311.0,1.0,广东-广州市,1.0,2.0,6.25,63.575394,592.0,23127.813875,34400.5,289676.0,0.0,665.0,5469.0,35065.5,36391.399348,13668538.0,10216947


In [57]:
df_feature.write.mode("Overwrite").parquet("/data/fresh_train/df_feature_multi_events_answer")  # 填写你自己的路径

Py4JJavaError: An error occurred while calling o264.parquet.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:215)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:173)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:173)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:173)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:145)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
	at org.apache.spark.sql.execution.datasources.DataSource.writeInFileFormat(DataSource.scala:438)
	at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:474)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:48)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:610)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:233)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:217)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:509)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 4 in stage 24.0 failed 4 times, most recent failure: Lost task 4.3 in stage 24.0 (TID 297, dev-06-dev-ofc.ahi.internal, executor 8): org.apache.spark.SparkException: Task failed while writing rows
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:272)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:191)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:190)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark2.2.0/python/pyspark/worker.py", line 177, in main
    process()
  File "/opt/spark2.2.0/python/pyspark/worker.py", line 172, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/spark2.2.0/python/pyspark/serializers.py", line 268, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "<ipython-input-53-fd6b5912efe1>", line 4, in <lambda>
  File "<ipython-input-45-8eae9f825a10>", line 44, in features
  File "<ipython-input-45-8eae9f825a10>", line 13, in feature3
  File "pandas/_libs/tslibs/nattype.pyx", line 59, in pandas._libs.tslibs.nattype._make_error_func.f
ValueError: NaTType does not support timetuple

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:156)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:152)
	at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:40)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$SingleDirectoryWriteTask.execute(FileFormatWriter.scala:315)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:258)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:256)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1375)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:261)
	... 8 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:188)
	... 45 more
Caused by: org.apache.spark.SparkException: Task failed while writing rows
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:272)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:191)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:190)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark2.2.0/python/pyspark/worker.py", line 177, in main
    process()
  File "/opt/spark2.2.0/python/pyspark/worker.py", line 172, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/spark2.2.0/python/pyspark/serializers.py", line 268, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "<ipython-input-53-fd6b5912efe1>", line 4, in <lambda>
  File "<ipython-input-45-8eae9f825a10>", line 44, in features
  File "<ipython-input-45-8eae9f825a10>", line 13, in feature3
  File "pandas/_libs/tslibs/nattype.pyx", line 59, in pandas._libs.tslibs.nattype._make_error_func.f
ValueError: NaTType does not support timetuple

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:156)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:152)
	at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:40)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$SingleDirectoryWriteTask.execute(FileFormatWriter.scala:315)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:258)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:256)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1375)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:261)
	... 8 more


In [29]:
df_feature.write.parquet("...")  # 填写你自己的路径

Py4JJavaError: An error occurred while calling o158.parquet.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:215)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:173)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:173)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:173)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:145)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
	at org.apache.spark.sql.execution.datasources.DataSource.writeInFileFormat(DataSource.scala:438)
	at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:474)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:48)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:610)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:233)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:217)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:509)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 16.0 failed 4 times, most recent failure: Lost task 1.3 in stage 16.0 (TID 273, dev-09-dev-ofc.ahi.internal, executor 6): org.apache.spark.SparkException: Task failed while writing rows
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:272)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:191)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:190)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark2.2.0/python/pyspark/worker.py", line 177, in main
    process()
  File "/opt/spark2.2.0/python/pyspark/worker.py", line 172, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/spark2.2.0/python/pyspark/serializers.py", line 268, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "<ipython-input-25-fd6b5912efe1>", line 4, in <lambda>
  File "<ipython-input-17-8eae9f825a10>", line 44, in features
  File "<ipython-input-17-8eae9f825a10>", line 18, in feature3
  File "<ipython-input-13-f5777cfa5573>", line 4, in num_stat
  File "/opt/pythonenvs-dev/ahi_data_analytics/lib/python2.7/site-packages/scipy/stats/stats.py", line 1255, in describe
    raise ValueError("The input must not be empty.")
ValueError: The input must not be empty.

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:156)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:152)
	at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:40)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$SingleDirectoryWriteTask.execute(FileFormatWriter.scala:315)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:258)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:256)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1375)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:261)
	... 8 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:188)
	... 45 more
Caused by: org.apache.spark.SparkException: Task failed while writing rows
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:272)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:191)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:190)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark2.2.0/python/pyspark/worker.py", line 177, in main
    process()
  File "/opt/spark2.2.0/python/pyspark/worker.py", line 172, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/spark2.2.0/python/pyspark/serializers.py", line 268, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "<ipython-input-25-fd6b5912efe1>", line 4, in <lambda>
  File "<ipython-input-17-8eae9f825a10>", line 44, in features
  File "<ipython-input-17-8eae9f825a10>", line 18, in feature3
  File "<ipython-input-13-f5777cfa5573>", line 4, in num_stat
  File "/opt/pythonenvs-dev/ahi_data_analytics/lib/python2.7/site-packages/scipy/stats/stats.py", line 1255, in describe
    raise ValueError("The input must not be empty.")
ValueError: The input must not be empty.

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:156)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:152)
	at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:40)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$SingleDirectoryWriteTask.execute(FileFormatWriter.scala:315)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:258)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:256)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1375)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:261)
	... 8 more


In [None]:
# 把你计算出的特征矩阵与参考答案做对比
df_feature_answer = spark.read.parquet("/data/fresh_train/df_feature_multi_events_answer")  # 读取参考答案
df_feature = spark.read.parquet("...")  # 读取你的答案

# 对比两个spark df  自己想办法解决吧~
