-
Notifications
You must be signed in to change notification settings - Fork 2
/
rev2.py
31 lines (24 loc) · 808 Bytes
/
rev2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import glob
import pandas as pd
from dask import delayed
def counts_by_origin():
    """Build lazy per-month and per-year counts of rows by ``Origin``.

    Each path matching ``data/*.csv`` (in sorted order) is wrapped in a
    ``dask.delayed`` call to ``pandas.read_csv`` that combines the first
    three columns into a single parsed ``Date`` column.  The per-file frames
    are concatenated with a delayed ``pandas.concat``, resampled on ``Date``
    and the values of the ``Origin`` column are counted within each period.

    No CSV file is read here: only the task graph is built.  The caller is
    expected to evaluate the results (for example with ``dask.compute``).

    NOTE(review): the dict form of ``parse_dates`` and the
    ``infer_datetime_format`` flag are deprecated in recent pandas releases;
    they are kept to preserve the existing parsing behaviour -- confirm
    against the pandas version this script targets.

    Returns:
        tuple: ``(by_month, by_year)`` -- two ``Delayed`` objects.  Once
        computed, each is the unstacked ``value_counts`` of ``Origin``
        (one row per period, one column per distinct ``Origin`` value),
        binned by month start and by year start respectively.
    """
    # One delayed read per file; sorting keeps the concat order deterministic.
    frames = [
        delayed(pd.read_csv)(
            path,
            parse_dates={'Date': [0, 1, 2]},
            infer_datetime_format=True,
        )
        for path in sorted(glob.glob('data/*.csv'))
    ]

    # Concatenate all the per-file frames into a single (still lazy) frame.
    df = delayed(pd.concat)(frames)

    def origin_counts(rule):
        # Count Origin values inside each resampling period, then pivot the
        # Origin level into columns.
        return df.resample(rule, on='Date').Origin.value_counts().unstack()

    # 'MS' = month start, 'YS' = year start.  'YS' replaces the older 'AS'
    # alias for the same year-start offset, which newer pandas deprecates.
    return origin_counts('MS'), origin_counts('YS')