Demonstration of Time Series Data Engineering with Pandas

Demo File is a time series dataset of positive COVID tests in the state of Maryland "MD_Positive_Test_Cases.csv"

Create Panda Frames

In [1]:
# create date time features of a dataset
from pandas import read_csv
from pandas import DataFrame
series = read_csv('MD_Positive_Test_Cases.csv', header=0, index_col=0, parse_dates=True, squeeze=True)
dataframe = DataFrame()
dataframe['month'] = [series.index[i].month for i in range(len(series))]
dataframe['day'] = [series.index[i].day for i in range(len(series))]
dataframe['cases'] = [series[i] for i in range(len(series))]
print(dataframe.head(5))

   month  day  cases
0      7   27    399
1      7   26   1179
2      7   25   1004
3      7   24   1224
4      7   23   1203


Demonstrate Lag Feature

In [2]:
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
series = read_csv('MD_Positive_Test_Cases.csv', header=0, index_col=0)
cases = DataFrame(series.values)
dataframe = concat([cases.shift(1), cases], axis=1)
dataframe.columns = ['t-1', 't+1']
print(dataframe.head(5))

      t-1   t+1
0     NaN   399
1   399.0  1179
2  1179.0  1004
3  1004.0  1224
4  1224.0  1203


Expanding Lag Window

In [3]:
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
series = read_csv('MD_Positive_Test_Cases.csv', header=0, index_col=0)
cases = DataFrame(series.values)
dataframe = concat([cases.shift(3), cases.shift(2), cases.shift(1), cases], axis=1)
dataframe.columns = ['t-3', 't-2', 't-1', 't+1']
print(dataframe.head(5))

      t-3     t-2     t-1   t+1
0     NaN     NaN     NaN   399
1     NaN     NaN   399.0  1179
2     NaN   399.0  1179.0  1004
3   399.0  1179.0  1004.0  1224
4  1179.0  1004.0  1224.0  1203


Rolling Window Statistics

In [4]:
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
series = read_csv('MD_Positive_Test_Cases.csv', header=0, index_col=0)
cases = DataFrame(series.values)
shifted = cases.shift(1)
window = shifted.rolling(window=2)
means = window.mean()
dataframe = concat([means, cases], axis=1)
dataframe.columns = ['mean(t-2,t-1)', 't+1']
print(dataframe.head(5))

   mean(t-2,t-1)   t+1
0            NaN   399
1            NaN  1179
2          789.0  1004
3         1091.5  1224
4         1114.0  1203


Expanded Window Statistics

In [5]:
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
series = read_csv('MD_Positive_Test_Cases.csv', header=0, index_col=0)
cases = DataFrame(series.values)
window = cases.expanding()
dataframe = concat([window.min(), window.mean(), window.max(), cases.shift(-1)], axis=1)
dataframe.columns = ['min', 'mean', 'max', 't+1']
print(dataframe.head(5))

     min         mean     max     t+1
0  399.0   399.000000   399.0  1179.0
1  399.0   789.000000  1179.0  1004.0
2  399.0   860.666667  1179.0  1224.0
3  399.0   951.500000  1224.0  1203.0
4  399.0  1001.800000  1224.0  1075.0
