## Rolling window
* https://towardsdatascience.com/dont-miss-out-on-rolling-window-functions-in-pandas-850b817131db

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Pull the historical price table for TSLA from Yahoo Finance (the first table on the page).
# NOTE: this cell needs network access.
df = pd.read_html("https://finance.yahoo.com/quote/TSLA/history?period1=1546300800&period2=1550275200&interval=1d&filter=history&frequency=1d")[0]

# Keep the first 11 rows and put them in chronological order. `Date` holds strings such as
# "Feb 01, 2019", so sorting the raw text would order rows by month name rather than by time.
# The values are parsed for the comparison only; the column itself stays as text.
df = df.head(11).sort_values(by='Date', key=pd.to_datetime)

# Cast the price and volume columns to float so arithmetic and rolling statistics work on them.
price_and_volume_columns = ["Open", "High", "Low", "Close*", "Adj Close**", "Volume"]
df = df.astype({column: 'float' for column in price_and_volume_columns})

# Gain for the row: closing price minus opening price.
df['Gain'] = df['Close*'] - df['Open']

In [3]:
# Mean of each close and the one before it (window of 2 rows).
# The first row has no predecessor, so its value is NaN.
close_pairs = df['Close*'].rolling(window=2)
df['Rolling Close Average'] = close_pairs.mean()

In [4]:
# Show the first ten rows.
df.iloc[:10]

Unnamed: 0,Date,Open,High,Low,Close*,Adj Close**,Volume,Gain,Rolling Close Average
10,"Feb 01, 2019",61.08,63.22,60.7,62.44,62.44,36417000.0,1.36,
9,"Feb 04, 2019",62.6,63.06,60.38,62.58,62.58,36760500.0,-0.02,62.51
8,"Feb 05, 2019",62.5,64.49,62.45,64.27,64.27,33714000.0,1.77,63.425
7,"Feb 06, 2019",63.92,64.85,63.12,63.44,63.44,25192500.0,-0.48,63.855
6,"Feb 07, 2019",62.66,62.94,60.6,61.5,61.5,32603000.0,-1.16,62.47
5,"Feb 08, 2019",61.37,61.49,59.7,61.16,61.16,29221000.0,-0.21,61.33
4,"Feb 11, 2019",62.32,63.72,62.1,62.57,62.57,35648500.0,0.25,61.865
3,"Feb 12, 2019",63.24,63.64,61.92,62.36,62.36,27588000.0,-0.88,62.465
2,"Feb 13, 2019",62.47,62.55,61.11,61.63,61.63,25708000.0,-0.84,61.995
1,"Feb 14, 2019",60.68,61.35,60.2,60.75,60.75,26004000.0,0.07,61.19


In [5]:
# Disabled example (kept for reference, not executed): overall vs. rolling
# standard deviation of the opening price, using a 2-row window.
# df['Open Standard Deviation'] = df['Open'].std()
# df['Rolling Open Standard Deviation'] = df['Open'].rolling(2).std()
# df.head(10)

In [6]:
# Total volume over each run of three consecutive rows.
# The first two rows do not have a full window, so they stay NaN.
volume_window = df['Volume'].rolling(window=3)
df['Rolling Volume Sum'] = volume_window.sum()
df.head(n=10)

Unnamed: 0,Date,Open,High,Low,Close*,Adj Close**,Volume,Gain,Rolling Close Average,Rolling Volume Sum
10,"Feb 01, 2019",61.08,63.22,60.7,62.44,62.44,36417000.0,1.36,,
9,"Feb 04, 2019",62.6,63.06,60.38,62.58,62.58,36760500.0,-0.02,62.51,
8,"Feb 05, 2019",62.5,64.49,62.45,64.27,64.27,33714000.0,1.77,63.425,106891500.0
7,"Feb 06, 2019",63.92,64.85,63.12,63.44,63.44,25192500.0,-0.48,63.855,95667000.0
6,"Feb 07, 2019",62.66,62.94,60.6,61.5,61.5,32603000.0,-1.16,62.47,91509500.0
5,"Feb 08, 2019",61.37,61.49,59.7,61.16,61.16,29221000.0,-0.21,61.33,87016500.0
4,"Feb 11, 2019",62.32,63.72,62.1,62.57,62.57,35648500.0,0.25,61.865,97472500.0
3,"Feb 12, 2019",63.24,63.64,61.92,62.36,62.36,27588000.0,-0.88,62.465,92457500.0
2,"Feb 13, 2019",62.47,62.55,61.11,61.63,61.63,25708000.0,-0.84,61.995,88944500.0
1,"Feb 14, 2019",60.68,61.35,60.2,60.75,60.75,26004000.0,0.07,61.19,79300000.0


## Cumsum
* https://stackoverflow.com/questions/56058901/pandas-cumulative-sum-of-all-previous-dates-by-group

In [21]:
# Toy sales log, one row per entry; the first two rows share both date and product.
sales_rows = [
    ('2018-04-01', 'a', 10),
    ('2018-04-01', 'a', 30),
    ('2018-04-05', 'a', 40),
    ('2018-04-05', 'b', 50),
    ('2018-05-01', 'b', 60),
]
df = pd.DataFrame(sales_rows, columns=['Date', 'Product', 'Volumes'])

df

Unnamed: 0,Date,Product,Volumes
0,2018-04-01,a,10
1,2018-04-01,a,30
2,2018-04-05,a,40
3,2018-04-05,b,50
4,2018-05-01,b,60


#### Group by Product only

In [22]:
# Running total of Volumes within each product, following the current row order.
volumes_by_product = df.groupby('Product')['Volumes']
df['Result_1'] = volumes_by_product.cumsum()
df

Unnamed: 0,Date,Product,Volumes,Result_1
0,2018-04-01,a,10,10
1,2018-04-01,a,30,40
2,2018-04-05,a,40,80
3,2018-04-05,b,50,50
4,2018-05-01,b,60,110


#### Group by Product and Date

In [23]:
# Total volume for every (Product, Date) pair; rows sharing both keys collapse into one.
daily_totals = df.groupby(['Product', 'Date'])['Volumes'].sum()
daily_totals

Product  Date      
a        2018-04-01    40
         2018-04-05    40
b        2018-04-05    50
         2018-05-01    60
Name: Volumes, dtype: int64

In [24]:
# Collapse duplicates per (Product, Date) first, then accumulate those totals within each product.
daily_totals = df.groupby(['Product', 'Date'])['Volumes'].sum()
daily_totals.groupby(level='Product').cumsum()

Product  Date      
a        2018-04-01     40
         2018-04-05     80
b        2018-04-05     50
         2018-05-01    110
Name: Volumes, dtype: int64

In [25]:
# Cumulative per-product totals as a flat table: the (Product, Date) index
# levels become ordinary columns again.
daily_totals = df.groupby(['Product', 'Date'])['Volumes'].sum()
running_totals = daily_totals.groupby(level='Product').cumsum()
running_totals.reset_index()

Unnamed: 0,Product,Date,Volumes
0,a,2018-04-01,40
1,a,2018-04-05,80
2,b,2018-04-05,50
3,b,2018-05-01,110


## Rank, cumcount
* https://stackoverflow.com/questions/17775935/sql-like-window-functions-in-pandas-row-numbering-in-python-pandas-dataframe


 * Partition the table of data by one or more fields
 * For each partition, add a row number to each of its rows that ranks the row within that partition
 
#### `cumcount()` starts from `0`, so add 1 for a 1-based rank

In [31]:
# Sample frame for the row-numbering example: one key column and two value columns.
sample_columns = {
    'key1': list('aaaba'),
    'data1': [1, 2, 2, 3, 3],
    'data2': [1, 10, 2, 3, 30],
}
df = pd.DataFrame(data=sample_columns)
df

Unnamed: 0,key1,data1,data2
0,a,1,1
1,a,2,10
2,a,2,2
3,b,3,3
4,a,3,30


In [32]:
# Row number within each `key1` group after ordering by data1 ascending, then data2 descending.
# cumcount() numbers rows from 0, hence the +1. The assignment realigns the result
# on the original index, so df keeps its row order.
ordered = df.sort_values(by=['data1', 'data2'], ascending=[True, False])
position_in_group = ordered.groupby('key1').cumcount()
df['rank'] = position_in_group + 1

df

Unnamed: 0,key1,data1,data2,rank
0,a,1,1,1
1,a,2,10,2
2,a,2,2,3
3,b,3,3,1
4,a,3,30,4


In [34]:
# When ranking by a single column is enough.
df = pd.DataFrame({'C1' : ['a','a','a','b','b'],
           'C2' : [1,2,3,4,5]})
# Rank C2 within each C1 group using the built-in groupby rank() rather than
# wrapping Series.rank in a per-group lambda. The default method ("average")
# is unchanged, so the resulting float ranks are the same.
df['Rank'] = df.groupby(by=['C1'])['C2'].rank()
df

Unnamed: 0,C1,C2,Rank
0,a,1,1
1,a,2,2
2,a,3,3
3,b,4,1
4,b,5,2


In [36]:
# Integer rank of C2 within each C1 group; method="first" assigns tied values
# their ranks in order of appearance.
c2_by_group = df.groupby("C1")["C2"]
df["RANK2"] = c2_by_group.rank(method="first", ascending=True).astype(int)
df

Unnamed: 0,C1,C2,Rank,RANK2
0,a,1,1,1
1,a,2,2,2
2,a,3,3,3
3,b,4,1,1
4,b,5,2,2
