# 시계열 관련 NumPy 및 Pandas 기능

- dataframe with a datetime index  

- simple time series plots  

## Key Data Types for Time Series Data
### Key NumPy data types:
1. Array - shape, max/min, argmax/argmin, sum, cumsum, mean, var, std, prod, cumprod, etc.

2. datetime64

3. timedelta64

### Key Pandas data types:
1. Series  
2. DataFrame  
3. Index

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
from datetime import timedelta

from IPython.display import display
import os

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_excel("datasets/Sample - Superstore.xls")
df.columns

Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State',
       'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category',
       'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit'],
      dtype='object')

In [5]:
df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2013-152156,2013-11-09,2013-11-12,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2013-152156,2013-11-09,2013-11-12,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2013-138688,2013-06-13,2013-06-17,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2012-108966,2012-10-11,2012-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2012-108966,2012-10-11,2012-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


### Simplify Time Series Data

- 주문 날짜 및 범주별 총 판매액으로 data 를 단순화  

- 인덱스를 재설정하지 않으면 Pandas가 그룹 변수를 인덱스로 설정합니다. 

In [29]:
group_variables =  ['Order Date', 'Category']
outcome_variable =  'Category', 'Sales'
base = df.groupby(group_variables)[outcome_variable].sum().reset_index()
base.head()

Unnamed: 0,Order Date,Category,Sales
0,2011-01-04,Office Supplies,16.448
1,2011-01-05,Office Supplies,288.06
2,2011-01-06,Office Supplies,19.536
3,2011-01-07,Furniture,2573.82
4,2011-01-07,Office Supplies,685.34


In [30]:
print("Columns:", base.columns)
print("Index:", base.index)

Columns: Index(['Order Date', 'Category', 'Sales'], dtype='object')
Index: RangeIndex(start=0, stop=2864, step=1)


In [31]:
base.dtypes

Order Date    datetime64[ns]
Category              object
Sales                float64
dtype: object

### datetime64 format in Numpy

NumPy 날짜 배열은 ns(나노초) 단위의 datetime64 객체입니다. 

In [32]:
order_date = base['Order Date'].values
order_date

array(['2011-01-04T00:00:00.000000000', '2011-01-05T00:00:00.000000000',
       '2011-01-06T00:00:00.000000000', ...,
       '2014-12-31T00:00:00.000000000', '2014-12-31T00:00:00.000000000',
       '2014-12-31T00:00:00.000000000'], dtype='datetime64[ns]')

In [33]:
order_date_daily = np.array(order_date, dtype='datetime64[D]')
order_date_daily

array(['2011-01-04', '2011-01-05', '2011-01-06', ..., '2014-12-31',
       '2014-12-31', '2014-12-31'], dtype='datetime64[D]')

In [34]:
order_date_monthly = np.array(order_date, dtype='datetime64[M]')
order_date_monthly

array(['2011-01', '2011-01', '2011-01', ..., '2014-12', '2014-12',
       '2014-12'], dtype='datetime64[M]')

In [35]:
np.unique(order_date_monthly)

array(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05', '2011-06',
       '2011-07', '2011-08', '2011-09', '2011-10', '2011-11', '2011-12',
       '2012-01', '2012-02', '2012-03', '2012-04', '2012-05', '2012-06',
       '2012-07', '2012-08', '2012-09', '2012-10', '2012-11', '2012-12',
       '2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06',
       '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12',
       '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06',
       '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12'],
      dtype='datetime64[M]')

## Working with the Pandas DatetimeIndex

### 기존 변수를 사용하여 인덱스 설정

In [36]:
base.set_index('Order Date', inplace=True)

base.head()

Unnamed: 0_level_0,Category,Sales
Order Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-01-04,Office Supplies,16.448
2011-01-05,Office Supplies,288.06
2011-01-06,Office Supplies,19.536
2011-01-07,Furniture,2573.82
2011-01-07,Office Supplies,685.34


In [37]:
base.index

DatetimeIndex(['2011-01-04', '2011-01-05', '2011-01-06', '2011-01-07',
               '2011-01-07', '2011-01-07', '2011-01-08', '2011-01-08',
               '2011-01-10', '2011-01-10',
               ...
               '2014-12-28', '2014-12-29', '2014-12-29', '2014-12-29',
               '2014-12-30', '2014-12-30', '2014-12-30', '2014-12-31',
               '2014-12-31', '2014-12-31'],
              dtype='datetime64[ns]', name='Order Date', length=2864, freq=None)

### Subsetting data

이제 DatetimeIndex가 있으며 이를 사용하여 데이터 하위 집합을 선택할 수 있습니다.

In [38]:
base['2011'].head()

Unnamed: 0_level_0,Category,Sales
Order Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-01-04,Office Supplies,16.448
2011-01-05,Office Supplies,288.06
2011-01-06,Office Supplies,19.536
2011-01-07,Furniture,2573.82
2011-01-07,Office Supplies,685.34


In [42]:
base[base['Category'] == 'Office Supplies']['2012-01':'2012-02'].head()

Unnamed: 0_level_0,Category,Sales
Order Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-01-02,Office Supplies,139.08
2012-01-03,Office Supplies,17.424
2012-01-04,Office Supplies,72.24
2012-01-05,Office Supplies,233.688
2012-01-06,Office Supplies,31.538


### Datetime Components

Pandas Datetime 변수에는 여러 가지 유용한 구성 요소가 있습니다. DatetimeIndex를 사용하여 월, 연도, 요일, 분기 등과 같은 항목을 추출할 수 있습니다.

In [44]:
base.index.day

Int64Index([ 4,  5,  6,  7,  7,  7,  8,  8, 10, 10,
            ...
            28, 29, 29, 29, 30, 30, 30, 31, 31, 31],
           dtype='int64', name='Order Date', length=2864)

In [45]:
base.index.week

Int64Index([ 1,  1,  1,  1,  1,  1,  1,  1,  2,  2,
            ...
            52,  1,  1,  1,  1,  1,  1,  1,  1,  1],
           dtype='int64', name='Order Date', length=2864)

In [47]:
base.index.dayofweek   # Day of Week: Monday=0, Sunday=6

Int64Index([1, 2, 3, 4, 4, 4, 5, 5, 0, 0,
            ...
            6, 0, 0, 0, 1, 1, 1, 2, 2, 2],
           dtype='int64', name='Order Date', length=2864)

In [48]:
base['DayofWeek'] = base.index.dayofweek
base.head()

Unnamed: 0_level_0,Category,Sales,DayofWeek
Order Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011-01-04,Office Supplies,16.448,1
2011-01-05,Office Supplies,288.06,2
2011-01-06,Office Supplies,19.536,3
2011-01-07,Furniture,2573.82,4
2011-01-07,Office Supplies,685.34,4
