In [2]:
import pandas as pd
import numpy as np
import time

In [3]:
# Directory that holds the sales-forecast CSV exports (machine-specific path).
data_path = '/Users/ruining/Downloads/salesForecast'
# Order table; loaded once at module level and stored in the global `order`.
order = pd.read_csv(filepath_or_buffer=data_path + '/t_order.csv')

In [4]:
def resample_date(start_date, end_date, gap, shop_id=None, orders=None):
    """Resample net daily sales into consecutive buckets of ``gap`` days.

    Orders dated from ``start_date`` to ``end_date`` (both inclusive) are
    summed per day as ``sale_amt - rtn_amt``; days without any order count
    as 0.  The days are then grouped ``gap`` at a time, starting at
    ``start_date``, and every group is collapsed into one summed point.  The
    last group is shorter when the window is not a multiple of ``gap``.

    Args:
        start_date: First day of the window, as a ``'%Y-%m-%d'`` string.
        end_date: Last day of the window (inclusive), same format.
        gap: Bucket length in days; must be positive.
        shop_id: When not None, only rows whose ``shop_id`` equals this value
            are used.
        orders: DataFrame with ``ord_dt``, ``sale_amt``, ``rtn_amt`` and
            ``shop_id`` columns.  Defaults to the module-level ``order``
            frame, which is what the function always used before this
            parameter existed.

    Returns:
        DataFrame indexed by ``group_index`` (0, 1, 2, ...) with a ``sales``
        column (net sales of the bucket) and a ``period_index`` column (the
        first day of the bucket).

    Raises:
        ValueError: If a date does not match ``'%Y-%m-%d'``, if ``end_date``
            is before ``start_date``, or if ``gap`` is not positive.
    """
    if orders is None:
        orders = order  # module-level frame loaded earlier in the notebook

    # Calendar arithmetic instead of local-time epoch seconds: a DST change
    # inside the window must not shorten the day count.
    start_ts = pd.to_datetime(start_date, format='%Y-%m-%d')
    end_ts = pd.to_datetime(end_date, format='%Y-%m-%d')
    period = (end_ts - start_ts).days + 1
    if period <= 0:
        raise ValueError('end_date %r is before start_date %r' % (end_date, start_date))
    if gap <= 0:
        raise ValueError('gap must be a positive number of days, got %r' % (gap,))

    # NOTE(review): assumes `ord_dt` holds zero-padded 'YYYY-MM-DD' strings;
    # the window filter compares them as text and the daily alignment below
    # matches them against strftime labels -- confirm against t_order.csv.
    start_key = start_ts.strftime('%Y-%m-%d')
    end_key = end_ts.strftime('%Y-%m-%d')
    in_window = (orders['ord_dt'] >= start_key) & (orders['ord_dt'] <= end_key)
    if shop_id is not None:
        in_window = in_window & (orders['shop_id'] == shop_id)

    # Net sales per calendar day: sales minus returned amount.
    per_day = orders.loc[in_window, ['ord_dt', 'sale_amt', 'rtn_amt']].groupby('ord_dt').sum()
    net_sales = per_day['sale_amt'] - per_day['rtn_amt']

    # Align on every day of the window so the frame has exactly `period` rows
    # in calendar order; days without orders (and NaN amounts) become 0.
    day_labels = pd.date_range(start_ts, periods=period, freq='D').strftime('%Y-%m-%d')
    daily_sales = net_sales.reindex(day_labels, fill_value=0).fillna(0)

    # Day k of the window belongs to bucket k // gap.  Computed directly so
    # that windows shorter than a single gap are handled as one bucket.
    merge_df = pd.DataFrame({'sales': daily_sales.values})
    merge_df['group_index'] = np.arange(period) // gap
    merge_df = merge_df.groupby(['group_index']).sum()

    # One timestamp per bucket: the first day each bucket covers.
    merge_df['period_index'] = pd.date_range(start_ts, end_ts, freq='%sD' % gap)
    return merge_df

# Helper that builds the group index: e.g. for a 20-day period, the dates are bucketed in order, 20 days per group.
def gen_group_index(total, group_len):
    """Return consecutive group ids for ``total`` items, ``group_len`` per group.

    Item ``k`` is assigned to group ``k // group_len``, so the last group is
    shorter when ``total`` is not a multiple of ``group_len``.  For example,
    ``total=10, group_len=2`` gives ``[0, 0, 1, 1, 2, 2, 3, 3, 4, 4]``.

    Args:
        total: Number of items to label.
        group_len: Number of items per group; must be positive.

    Returns:
        A list of ``total`` ints.  It is empty when ``total`` is 0 and all
        zeros when ``total`` is smaller than ``group_len``.

    Raises:
        ValueError: If ``group_len`` is not positive.
    """
    if group_len <= 0:
        raise ValueError('group_len must be positive, got %r' % (group_len,))
    # Floor division labels every position in one step; it needs no loop
    # variable, so short or empty inputs work, and it does not depend on
    # Python 2 integer division.
    return (np.arange(total) // group_len).tolist()

In [8]:
# Net sales of shop 1300 in 4-day buckets. The call form of print gives the
# same output on Python 2 (single argument) and is valid on Python 3.
print(resample_date('2016-08-10', '2017-04-01', 4, shop_id=1300))

                sales period_index
group_index                       
0             2284.56   2016-08-10
1             2354.77   2016-08-14
2             3832.26   2016-08-18
3             3927.59   2016-08-22
4             3314.39   2016-08-26
5             6450.02   2016-08-30
6             5994.08   2016-09-03
7             3349.21   2016-09-07
8             4003.68   2016-09-11
9             1977.33   2016-09-15
10            5613.17   2016-09-19
11            1722.28   2016-09-23
12            3145.07   2016-09-27
13            2859.46   2016-10-01
14            5580.92   2016-10-05
15            4006.57   2016-10-09
16            5036.58   2016-10-13
17            3257.56   2016-10-17
18            4135.40   2016-10-21
19            6174.30   2016-10-25
20            5286.10   2016-10-29
21            9004.48   2016-11-02
22            6041.04   2016-11-06
23           11741.67   2016-11-10
24            8353.95   2016-11-14
25            9130.09   2016-11-18
26            7846.0