In [1]:
import pandas as pd
import numpy as np
import uuid

In [2]:
SUM_INTERVALS = [(-7,0), (-14,0), (-30,0)]

## Generate some random data

In [3]:
uids = np.array([str(uuid.uuid4()) for _ in range(1000)])
times = pd.date_range('2016-01-01','2016-02-01', freq='d')
data = dict(
    id=np.random.choice(uids, 100000),
    timestamp=np.random.choice(times, 100000),
    feature_a=np.ones(100000),
    feature_b=np.zeros(100000),
)

df = pd.DataFrame(data)\
        .set_index('timestamp')\
        .sort_index()

In [4]:
!mkdir data

In [5]:
for ts in times:
    path = "data/shard-{:%Y-%m-%d}".format(ts)
    df.loc[ts]\
        .reset_index()\
        .sample(frac=1)\
        .to_csv(path, index=False, compression='gzip')

In [6]:
del data, uids, times, df

In [7]:
!ls data

shard-2016-01-01 shard-2016-01-09 shard-2016-01-17 shard-2016-01-25
shard-2016-01-02 shard-2016-01-10 shard-2016-01-18 shard-2016-01-26
shard-2016-01-03 shard-2016-01-11 shard-2016-01-19 shard-2016-01-27
shard-2016-01-04 shard-2016-01-12 shard-2016-01-20 shard-2016-01-28
shard-2016-01-05 shard-2016-01-13 shard-2016-01-21 shard-2016-01-29
shard-2016-01-06 shard-2016-01-14 shard-2016-01-22 shard-2016-01-30
shard-2016-01-07 shard-2016-01-15 shard-2016-01-23 shard-2016-01-31
shard-2016-01-08 shard-2016-01-16 shard-2016-01-24 shard-2016-02-01


## Challenge:
The `SUM_INTERVAL` variable contains relative time intervals in days. So the first means 7 days back until today (asof time writing this 2016-02-01).

The generated files contain 1000 distinct users which made visits over the timespan of one month. Each visit has a value for `feature a`, as awell as `feature_b` assigned to it. **For each user calculate the sum of it's respective features for each time interval. The final output should be a dataframe or numpy matrix containing one row per user, an id column an the feature columns (N_users, 1 + N_intervals*N_features)**

*You are encouraged to use the pandas library for this task but it is not required.*

In [8]:
def bin_sum_features(csv_glob="data/shard-*.csv.gz", today=pd.Timestamp('2016-02-01')):
    pass

## Result shape example
below you see how the results could look like with pandas

In [9]:
pd.DataFrame([['1aa9204b-5956-41a3-96b6-58cbf6bc147e',1,2,3,4,5,6]], 
             columns=['id', 'feature_a_7', 'feature_a_14', 'feature_a_30', 
                      'feature_b_7', 'feature_b_14', 'feature_b_30'])

Unnamed: 0,id,feature_a_7,feature_a_14,feature_a_30,feature_b_7,feature_b_14,feature_b_30
0,1aa9204b-5956-41a3-96b6-58cbf6bc147e,1,2,3,4,5,6
