In [2]:
import datetime as dt
import pandas as pd

### LAG/LEAD: calculate deltas from totals

Spark SQL equivalent:

```
select *,
    lag(total, 1, 0) over (partition by url, service order by ts) as prev
from df
```

Spark code:
```
df = sqlContext.read.csv('social_totals.csv', header=True)
df.createOrReplaceTempView('df')
df = sqlContext.sql('select *, lag(total, 1, 0) over (partition by url, service order by ts) as prev from df')
df = df.withColumn('delta', df['total'] - df['prev'])
```

In [14]:
# Load the totals data set and preview its first three rows.
df = pd.read_csv('social_totals.csv')
df.head(n=3)

Unnamed: 0,url,ts,service,total
0,url1,2018-08-15 00:00:00,tw,1
1,url1,2018-08-15 00:05:00,tw,4
2,url1,2018-08-15 00:11:00,tw,5


In [19]:
# using groupby
df = pd.read_csv('social_totals.csv')

# LAG(total, 1, 0): previous total inside each (url, service) series, walking
# the rows in ts order; the first row of every series falls back to 0.
totals_by_series = df.sort_values(['ts']).groupby(['url', 'service'])['total']
prev_total = totals_by_series.shift(1).fillna(0).astype(int)

# assign() aligns on the index, so the frame keeps its original row order.
df = df.assign(prev=prev_total)
df = df.assign(delta=df['total'] - df['prev'])
df.head(3)

Unnamed: 0,url,ts,service,total,prev,delta
0,url1,2018-08-15 00:00:00,tw,1,0,1
1,url1,2018-08-15 00:05:00,tw,4,1,3
2,url1,2018-08-15 00:11:00,tw,5,4,1


In [55]:
# using diff
# Unlike the shift/fillna(0) variant, the first row of every (url, service)
# series comes out as NaN here because it has no previous row to subtract.
df = pd.read_csv('social_totals.csv')
totals_by_series = df.sort_values(['ts']).groupby(['url', 'service'])['total']
totals_by_series.diff(1)

0       NaN
18      NaN
9       NaN
24      NaN
19      6.0
25      2.0
10     15.0
1       3.0
26      7.0
20     13.0
2       1.0
11     11.0
27      6.0
21     80.0
3       3.0
12     14.0
28      9.0
22    455.0
4       4.0
13     14.0
23    645.0
29     13.0
5      13.0
14      8.0
15     43.0
6      16.0
16    120.0
7       4.0
8      14.0
17    130.0
Name: total, dtype: float64

### SUM OVER: calculate totals from deltas

Spark SQL:

```
select *, 
    sum(cast(delta as long)) over (partition by url, service order by ts) as total 
from social_deltas
```

Spark code:

```
df = sqlContext.read.csv('social_deltas.csv', header=True)
df.createOrReplaceTempView('df')
df = sqlContext.sql('select *, sum(cast(delta as long)) over (partition by url, service order by ts) as total from df')
```

In [7]:
# Load the deltas data set and preview its first three rows.
df = pd.read_csv('social_deltas.csv')
df.head(n=3)

Unnamed: 0,url,ts,service,delta
0,url1,2018-08-15 00:00:00,tw,1
1,url1,2018-08-15 00:05:00,tw,3
2,url1,2018-08-15 00:11:00,tw,1


In [25]:
df = pd.read_csv('social_deltas.csv')

# SUM(delta) OVER (PARTITION BY url, service ORDER BY ts): accumulate the
# deltas of each (url, service) series in ts order.
running_total = df.sort_values(['ts']).groupby(['url', 'service'])['delta'].cumsum()

# assign() aligns on the index, so the frame keeps its original row order.
df = df.assign(total=running_total)
df.head(3)

Unnamed: 0,url,ts,service,delta,total
0,url1,2018-08-15 00:00:00,tw,1,1
1,url1,2018-08-15 00:05:00,tw,3,4
2,url1,2018-08-15 00:11:00,tw,1,5


### RANK, DENSE_RANK, PERCENT_RANK: rank things in group

Spark code:

```
df = sqlContext.read.csv('social_totals_agg.csv', header=True, inferSchema=True)
df.createOrReplaceTempView('df')
df = sqlContext.sql("""
select * from (
   select *,
          rank() over (partition by service order by total desc) as rnk
   from df)
where rnk <= 3
""")
```

This selects the top 3 URLs for each service by number of social shares.

In [26]:
# ROW_NUMBER() OVER (PARTITION BY url, service ORDER BY ts): number the rows of
# each (url, service) series chronologically.
#
# Rank one explicit column. Ranking the whole grouped frame returns a DataFrame
# (one rank column per remaining column), which current pandas refuses to
# assign to the single column `rn`.
# `ts` is parsed as datetime so the ordering is chronological, not
# lexicographic on the raw strings.
df = pd.read_csv('social_totals.csv', parse_dates=['ts'])
row_number = df.groupby(['url', 'service'])['ts'].rank(method='first')

# rank() yields floats; row numbers are whole, so store them as ints.
# NOTE(review): astype(int) assumes no missing `ts` values -- confirm.
df = df.assign(rn=row_number.astype(int))
df.head(3)

Unnamed: 0,url,ts,service,total,rn
0,url1,2018-08-15 00:00:00,tw,1,1
1,url1,2018-08-15 00:05:00,tw,4,2
2,url1,2018-08-15 00:11:00,tw,5,3


### ROWS/RANGE: resize the window

Spark code: TODO

In [44]:
df = pd.read_csv('social_totals.csv')

# Rolling mean of `total` over rows ordered by ts. The window is a single row
# and is not partitioned by url/service, so each value is just that row's own
# total.
rows_by_ts = df.sort_values('ts')
rows_by_ts.rolling(1, on='ts')['total'].mean()

0        1.0
18       1.0
9        5.0
24       1.0
19       7.0
25       3.0
10      20.0
1        4.0
26      10.0
20      20.0
2        5.0
11      31.0
27      16.0
21     100.0
3        8.0
12      45.0
28      25.0
22     555.0
4       12.0
13      59.0
23    1200.0
29      38.0
5       25.0
14      67.0
15     110.0
6       41.0
16     230.0
7       45.0
8       59.0
17     360.0
Name: total, dtype: float64