## Working with Dates and Time Series Data

In [2]:
import pandas as pd
import numpy as np

資料來源 :https://www.kaggle.com/prasoonkottarathil/ethereum-historical-dataset

In [3]:
df = pd.read_csv('../pandas_dataset/eth_1hr/ETH_1H.csv')

In [4]:
df.head()

Unnamed: 0,Unix Timestamp,Date,Symbol,Open,High,Low,Close,Volume
0,1586995200000,2020-04-16 00:00:00,ETHUSD,152.94,152.94,150.39,150.39,650.188125
1,1586991600000,2020-04-15 23:00:00,ETHUSD,155.81,155.81,151.39,152.94,4277.567299
2,1586988000000,2020-04-15 22:00:00,ETHUSD,157.18,157.3,155.32,155.81,106.337279
3,1586984400000,2020-04-15 21:00:00,ETHUSD,158.04,158.31,157.16,157.18,55.244131
4,1586980800000,2020-04-15 20:00:00,ETHUSD,157.1,158.1,156.87,158.04,144.262622


In [5]:
df = df.drop(columns = ['Unix Timestamp']) # 先把用不到的列切掉

In [6]:
df.head()

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume
0,2020-04-16 00:00:00,ETHUSD,152.94,152.94,150.39,150.39,650.188125
1,2020-04-15 23:00:00,ETHUSD,155.81,155.81,151.39,152.94,4277.567299
2,2020-04-15 22:00:00,ETHUSD,157.18,157.3,155.32,155.81,106.337279
3,2020-04-15 21:00:00,ETHUSD,158.04,158.31,157.16,157.18,55.244131
4,2020-04-15 20:00:00,ETHUSD,157.1,158.1,156.87,158.04,144.262622


In [7]:
df.loc[0, 'Date'] # loc 基本搜尋

'2020-04-16 00:00:00'

In [8]:
type(df.loc[0, 'Date']) # 發現日期時間資料為字串(str)

str

In [9]:
df.loc[0, 'Date'].day_name() # 這裡會發現因為資料為字串 並不是 Timestamp 導致無法將年月日轉換為星期幾

AttributeError: 'str' object has no attribute 'day_name'

下面程式的日期時間 format 依照資料的填法參考 : https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior 接近底頁的部分有一個表格

In [10]:
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d %H:%M:%S') # to_datetime 是將符合時間戳記格式(Timestamp)的字串(str)資料轉換至 Timestamp 格式 (Timestamp 重要轉換方法)

In [11]:
df.loc[0, 'Date']

Timestamp('2020-04-16 00:00:00')

In [12]:
type(df.loc[0, 'Date'])

pandas._libs.tslibs.timestamps.Timestamp

In [13]:
df.loc[0, 'Date'].day_name() # 因為資料已經轉換成 Timestamp 所以可以將年月日轉換成星期

'Thursday'

In [14]:
d_parser = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') # 簡單的副程式
df_parse = pd.read_csv('../pandas_dataset/eth_1hr/ETH_1H.csv', parse_dates=['Date'], date_parser = d_parser) # 在讀取資料時選定要分析的列 並套用寫好的副程式 可以達到與上面相同的效果

  d_parser = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') # 簡單的副程式


In [15]:
df_parse = df_parse.drop(columns=['Unix Timestamp'])

In [16]:
df_parse.loc[0, 'Date'] # 結果同上面

Timestamp('2020-04-16 00:00:00')

In [17]:
df_parse.loc[0, 'Date'].day_name()

'Thursday'

In [18]:
df_parse['Date'].dt.day_name() # 整列的日期輸出

0         Thursday
1        Wednesday
2        Wednesday
3        Wednesday
4        Wednesday
           ...    
34492       Monday
34493       Monday
34494       Monday
34495       Monday
34496       Monday
Name: Date, Length: 34497, dtype: object

In [19]:
df_parse['DayOfWeek'] = df_parse['Date'].dt.day_name() # 創造新的列索引儲存資料

In [20]:
df_parse

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume,DayOfWeek
0,2020-04-16 00:00:00,ETHUSD,152.94,152.94,150.39,150.39,650.188125,Thursday
1,2020-04-15 23:00:00,ETHUSD,155.81,155.81,151.39,152.94,4277.567299,Wednesday
2,2020-04-15 22:00:00,ETHUSD,157.18,157.30,155.32,155.81,106.337279,Wednesday
3,2020-04-15 21:00:00,ETHUSD,158.04,158.31,157.16,157.18,55.244131,Wednesday
4,2020-04-15 20:00:00,ETHUSD,157.10,158.10,156.87,158.04,144.262622,Wednesday
...,...,...,...,...,...,...,...,...
34492,2016-05-09 17:00:00,ETHUSD,9.83,9.83,9.48,9.49,329.553213,Monday
34493,2016-05-09 16:00:00,ETHUSD,9.99,9.99,9.79,9.83,62.379450,Monday
34494,2016-05-09 15:00:00,ETHUSD,10.00,10.00,9.99,9.99,10.973567,Monday
34495,2016-05-09 14:00:00,ETHUSD,9.55,10.00,9.55,10.00,235.774075,Monday


In [21]:
df_parse['Date'].min() # 最小的日期時間

Timestamp('2016-05-09 13:00:00')

In [22]:
df_parse['Date'].max() # 最大的日期時間

Timestamp('2020-04-16 00:00:00')

In [23]:
df_parse['Date'].max() - df_parse['Date'].min() # Timestamp 可以透過簡單的運算得知時間長度

Timedelta('1437 days 11:00:00')

In [24]:
filt = (df['Date'] >= '2020') # Timestamp 還可以指定部分時間做布靈運算 '2020' 為字串 但是在運算時會自動轉換成 Timestamp
df_parse.loc[filt] # 套用過濾器後只會顯示時間年在 2020 之後的時間 (原資料未更改)

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume,DayOfWeek
0,2020-04-16 00:00:00,ETHUSD,152.94,152.94,150.39,150.39,650.188125,Thursday
1,2020-04-15 23:00:00,ETHUSD,155.81,155.81,151.39,152.94,4277.567299,Wednesday
2,2020-04-15 22:00:00,ETHUSD,157.18,157.30,155.32,155.81,106.337279,Wednesday
3,2020-04-15 21:00:00,ETHUSD,158.04,158.31,157.16,157.18,55.244131,Wednesday
4,2020-04-15 20:00:00,ETHUSD,157.10,158.10,156.87,158.04,144.262622,Wednesday
...,...,...,...,...,...,...,...,...
2540,2020-01-01 04:00:00,ETHUSD,129.83,129.96,129.13,129.54,2807.129465,Wednesday
2541,2020-01-01 03:00:00,ETHUSD,130.25,130.30,129.42,129.83,883.017780,Wednesday
2542,2020-01-01 02:00:00,ETHUSD,129.95,130.58,129.95,130.25,527.921168,Wednesday
2543,2020-01-01 01:00:00,ETHUSD,128.11,130.01,128.11,129.95,220.239530,Wednesday


In [25]:
filt = (df['Date'] >= '2019') & (df['Date'] < '2020') # 取時間區段在 2019 年內的時間 原理同上
df_parse.loc[filt] # (原資料未更改)

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume,DayOfWeek
2545,2019-12-31 23:00:00,ETHUSD,128.68,128.85,127.82,128.84,857.650259,Tuesday
2546,2019-12-31 22:00:00,ETHUSD,128.40,128.93,127.77,128.68,3050.507350,Tuesday
2547,2019-12-31 21:00:00,ETHUSD,127.87,128.41,127.81,128.40,447.680372,Tuesday
2548,2019-12-31 20:00:00,ETHUSD,127.86,128.30,127.86,127.87,151.711128,Tuesday
2549,2019-12-31 19:00:00,ETHUSD,128.78,128.78,127.86,127.86,2450.933248,Tuesday
...,...,...,...,...,...,...,...,...
11297,2019-01-01 04:00:00,ETHUSD,130.83,133.75,130.83,132.09,1035.840465,Tuesday
11298,2019-01-01 03:00:00,ETHUSD,129.79,131.00,129.79,130.83,1307.299291,Tuesday
11299,2019-01-01 02:00:00,ETHUSD,130.98,130.98,129.25,129.79,837.808380,Tuesday
11300,2019-01-01 01:00:00,ETHUSD,131.10,131.10,128.72,130.98,965.092541,Tuesday


In [26]:
filt = (df['Date'] >= pd.to_datetime('2019-01-01')) & (df['Date'] < pd.to_datetime('2020-01-01')) # 取時間區段在 2019 年內的時間另一種寫法 是將字串轉換成 Timestamp 再進入 Dataframe 做布靈運算
df_parse.loc[filt] # (原資料未更改)

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Volume,DayOfWeek
2545,2019-12-31 23:00:00,ETHUSD,128.68,128.85,127.82,128.84,857.650259,Tuesday
2546,2019-12-31 22:00:00,ETHUSD,128.40,128.93,127.77,128.68,3050.507350,Tuesday
2547,2019-12-31 21:00:00,ETHUSD,127.87,128.41,127.81,128.40,447.680372,Tuesday
2548,2019-12-31 20:00:00,ETHUSD,127.86,128.30,127.86,127.87,151.711128,Tuesday
2549,2019-12-31 19:00:00,ETHUSD,128.78,128.78,127.86,127.86,2450.933248,Tuesday
...,...,...,...,...,...,...,...,...
11297,2019-01-01 04:00:00,ETHUSD,130.83,133.75,130.83,132.09,1035.840465,Tuesday
11298,2019-01-01 03:00:00,ETHUSD,129.79,131.00,129.79,130.83,1307.299291,Tuesday
11299,2019-01-01 02:00:00,ETHUSD,130.98,130.98,129.25,129.79,837.808380,Tuesday
11300,2019-01-01 01:00:00,ETHUSD,131.10,131.10,128.72,130.98,965.092541,Tuesday


In [27]:
df_parse.set_index('Date', inplace=True) # 將列 'Date' 設定為行 (原資料已更改)
df_parse

Unnamed: 0_level_0,Symbol,Open,High,Low,Close,Volume,DayOfWeek
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-16 00:00:00,ETHUSD,152.94,152.94,150.39,150.39,650.188125,Thursday
2020-04-15 23:00:00,ETHUSD,155.81,155.81,151.39,152.94,4277.567299,Wednesday
2020-04-15 22:00:00,ETHUSD,157.18,157.30,155.32,155.81,106.337279,Wednesday
2020-04-15 21:00:00,ETHUSD,158.04,158.31,157.16,157.18,55.244131,Wednesday
2020-04-15 20:00:00,ETHUSD,157.10,158.10,156.87,158.04,144.262622,Wednesday
...,...,...,...,...,...,...,...
2016-05-09 17:00:00,ETHUSD,9.83,9.83,9.48,9.49,329.553213,Monday
2016-05-09 16:00:00,ETHUSD,9.99,9.99,9.79,9.83,62.379450,Monday
2016-05-09 15:00:00,ETHUSD,10.00,10.00,9.99,9.99,10.973567,Monday
2016-05-09 14:00:00,ETHUSD,9.55,10.00,9.55,10.00,235.774075,Monday


In [28]:
df_parse['2020'] # 影片中 19:35 終舊版 pandas 可以這樣搜尋 2020 年內的資料 但是新版不行

  df_parse['2020'] # 影片中 19:35 終舊版 pandas 可以這樣搜尋 2020 年內的資料 但是新版不行


Unnamed: 0_level_0,Symbol,Open,High,Low,Close,Volume,DayOfWeek
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-16 00:00:00,ETHUSD,152.94,152.94,150.39,150.39,650.188125,Thursday
2020-04-15 23:00:00,ETHUSD,155.81,155.81,151.39,152.94,4277.567299,Wednesday
2020-04-15 22:00:00,ETHUSD,157.18,157.30,155.32,155.81,106.337279,Wednesday
2020-04-15 21:00:00,ETHUSD,158.04,158.31,157.16,157.18,55.244131,Wednesday
2020-04-15 20:00:00,ETHUSD,157.10,158.10,156.87,158.04,144.262622,Wednesday
...,...,...,...,...,...,...,...
2020-01-01 04:00:00,ETHUSD,129.83,129.96,129.13,129.54,2807.129465,Wednesday
2020-01-01 03:00:00,ETHUSD,130.25,130.30,129.42,129.83,883.017780,Wednesday
2020-01-01 02:00:00,ETHUSD,129.95,130.58,129.95,130.25,527.921168,Wednesday
2020-01-01 01:00:00,ETHUSD,128.11,130.01,128.11,129.95,220.239530,Wednesday


In [29]:
df_parse.sort_index()['2020'] # 新版必須先透過 pandas 的升冪排列後 才能搜尋 (原資料未更改)

  df_parse.sort_index()['2020'] # 新版必須先透過 pandas 的升冪排列後 才能搜尋 (原資料未更改)


Unnamed: 0_level_0,Symbol,Open,High,Low,Close,Volume,DayOfWeek
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-01 00:00:00,ETHUSD,128.84,128.84,128.07,128.11,526.980282,Wednesday
2020-01-01 01:00:00,ETHUSD,128.11,130.01,128.11,129.95,220.239530,Wednesday
2020-01-01 02:00:00,ETHUSD,129.95,130.58,129.95,130.25,527.921168,Wednesday
2020-01-01 03:00:00,ETHUSD,130.25,130.30,129.42,129.83,883.017780,Wednesday
2020-01-01 04:00:00,ETHUSD,129.83,129.96,129.13,129.54,2807.129465,Wednesday
...,...,...,...,...,...,...,...
2020-04-15 20:00:00,ETHUSD,157.10,158.10,156.87,158.04,144.262622,Wednesday
2020-04-15 21:00:00,ETHUSD,158.04,158.31,157.16,157.18,55.244131,Wednesday
2020-04-15 22:00:00,ETHUSD,157.18,157.30,155.32,155.81,106.337279,Wednesday
2020-04-15 23:00:00,ETHUSD,155.81,155.81,151.39,152.94,4277.567299,Wednesday


In [30]:
df_parse.sort_index() # pandas 的升冪排列結果 (原資料未更改)

Unnamed: 0_level_0,Symbol,Open,High,Low,Close,Volume,DayOfWeek
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-05-09 13:00:00,ETHUSD,0.00,12.00,0.00,9.55,432.562115,Monday
2016-05-09 14:00:00,ETHUSD,9.55,10.00,9.55,10.00,235.774075,Monday
2016-05-09 15:00:00,ETHUSD,10.00,10.00,9.99,9.99,10.973567,Monday
2016-05-09 16:00:00,ETHUSD,9.99,9.99,9.79,9.83,62.379450,Monday
2016-05-09 17:00:00,ETHUSD,9.83,9.83,9.48,9.49,329.553213,Monday
...,...,...,...,...,...,...,...
2020-04-15 20:00:00,ETHUSD,157.10,158.10,156.87,158.04,144.262622,Wednesday
2020-04-15 21:00:00,ETHUSD,158.04,158.31,157.16,157.18,55.244131,Wednesday
2020-04-15 22:00:00,ETHUSD,157.18,157.30,155.32,155.81,106.337279,Wednesday
2020-04-15 23:00:00,ETHUSD,155.81,155.81,151.39,152.94,4277.567299,Wednesday


In [31]:
df_parse.sort_index()['2020-01':'2020-02'] # 時間區段搜尋概念亦同 (原資料未更改)

Unnamed: 0_level_0,Symbol,Open,High,Low,Close,Volume,DayOfWeek
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-01 00:00:00,ETHUSD,128.84,128.84,128.07,128.11,526.980282,Wednesday
2020-01-01 01:00:00,ETHUSD,128.11,130.01,128.11,129.95,220.239530,Wednesday
2020-01-01 02:00:00,ETHUSD,129.95,130.58,129.95,130.25,527.921168,Wednesday
2020-01-01 03:00:00,ETHUSD,130.25,130.30,129.42,129.83,883.017780,Wednesday
2020-01-01 04:00:00,ETHUSD,129.83,129.96,129.13,129.54,2807.129465,Wednesday
...,...,...,...,...,...,...,...
2020-02-29 19:00:00,ETHUSD,225.05,225.92,224.06,225.56,174.783652,Saturday
2020-02-29 20:00:00,ETHUSD,225.56,225.56,223.55,224.81,158.232905,Saturday
2020-02-29 21:00:00,ETHUSD,224.81,225.08,223.05,223.59,191.602533,Saturday
2020-02-29 22:00:00,ETHUSD,223.59,223.76,222.32,222.93,118.446008,Saturday


In [32]:
df_parse.sort_index()['2020-01':'2020-02']['Close'] # 指定列索引搜尋元素 (原資料未更改)

Date
2020-01-01 00:00:00    128.11
2020-01-01 01:00:00    129.95
2020-01-01 02:00:00    130.25
2020-01-01 03:00:00    129.83
2020-01-01 04:00:00    129.54
                        ...  
2020-02-29 19:00:00    225.56
2020-02-29 20:00:00    224.81
2020-02-29 21:00:00    223.59
2020-02-29 22:00:00    222.93
2020-02-29 23:00:00    217.35
Name: Close, Length: 1440, dtype: float64

In [33]:
df_parse.sort_index()['2020-01':'2020-02']['Close'].mean() # 統計運算 (原資料未更改)

195.22266666666667

In [34]:
df_parse.sort_index()['2020-01-01']['High'].max() # 概念同上 得知在 '2020-01-01' 這天紀錄所有 'High' 中的最大值 (原資料未更改)

  df_parse.sort_index()['2020-01-01']['High'].max() # 概念同上 得知在 '2020-01-01' 這天紀錄所有 'High' 中的最大值 (原資料未更改)


132.57

下面程式 resample 資料格式可以參考 : https://pandas.pydata.org/docs/user_guide/timeseries.html 1/3頁的部分

In [35]:
highs = df_parse['High'].resample('D').max() # 因為資料本身是每一個小時為一個單位 如果想要知道每天 'High' 的最大值則需要對資料進行重新取樣(resample)
highs

Date
2016-05-09     12.00
2016-05-10      9.96
2016-05-11     10.47
2016-05-12     12.00
2016-05-13     11.59
               ...  
2020-04-12    165.37
2020-04-13    159.51
2020-04-14    162.15
2020-04-15    161.52
2020-04-16    152.94
Freq: D, Name: High, Length: 1439, dtype: float64

In [36]:
highs['2020-01-01'] # 可以將 highs 每天的最大值輸出後再搜尋 可以達到跟上面一樣的結果

132.57

In [37]:
%matplotlib inline # 將 matplotlib 導入到 inline shell 中 比較不建議再這邊這樣寫 因為 import 的導入寫法比較清楚

UsageError: unrecognized arguments: # 將 matplotlib 導入到 inline shell 中 比較不建議再這邊這樣寫 因為 import 的導入寫法比較清楚


In [38]:
highs.plot() # 繪圖

<AxesSubplot:xlabel='Date'>

In [39]:
df_parse.resample('W').mean() # 重新取樣為周(W) 計算每一個列索引的 mean (七天計算一次 mean) (原資料未更改)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-05-15,10.140387,10.310516,10.052387,10.205290,70.896402
2016-05-22,13.042262,13.144048,12.965179,13.066964,242.104139
2016-05-29,12.481012,12.555536,12.385357,12.471012,345.546483
2016-06-05,13.586369,13.651667,13.542202,13.594583,281.854432
2016-06-12,14.287500,14.326190,14.260179,14.297798,309.536737
...,...,...,...,...,...
2020-03-22,124.640952,126.387917,122.802262,124.636012,2618.930260
2020-03-29,133.274762,134.287857,132.255714,133.285893,1374.652289
2020-04-05,137.517083,138.314583,136.791310,137.627440,898.945625
2020-04-12,163.601548,164.731071,162.422560,163.693929,1218.547250


In [40]:
df_parse.resample('W').agg({'Close': 'mean', 'High': 'max', 'Low': 'min', 'Volume': 'sum'}) # 在以前有學過 agg 可以用 dict 格式指定給不同列索引統計方式 (原資料未更改)

Unnamed: 0_level_0,Close,High,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-05-15,10.205290,12.00,0.00,10988.942273
2016-05-22,13.066964,14.77,10.06,40673.495362
2016-05-29,12.471012,14.43,10.41,58051.809091
2016-06-05,13.594583,16.05,12.41,47351.544496
2016-06-12,14.297798,15.75,13.83,52002.171838
...,...,...,...,...
2020-03-22,124.636012,153.17,100.70,439980.283707
2020-03-29,133.285893,144.14,119.17,230941.584515
2020-04-05,137.627440,150.32,124.09,151022.864981
2020-04-12,163.693929,176.74,142.87,204715.937994
