## Setup

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning for the rest of the session. Note that this also
# hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

# Pandas - Date & Time

In [None]:
# Load the stock price data from the local CSV file
df = pd.read_csv("./all_stocks_2006-01-01_to_2018-01-01.csv")
df.head()

In [None]:
# Show the loaded DataFrame as the cell output
df

In [None]:
# Closing price per ticker: one row per Date, one column per Name
df = df.pivot(index='Date', columns='Name', values='Close')
df.head()

In [None]:
# Keep only the Apple and Amazon columns
df = df.loc[:, ['AAPL', 'AMZN']]
df.head()

In [None]:
# Reset the index
df = df.reset_index()
# After the reset, the former index comes back as a regular column.
# First five rows
df.head()

In [None]:
# Use tail() to look at the end of the data
df.tail()

In [None]:
# Check the column dtypes
df.info()

In [None]:
# Convert the 'Date' column from string to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
df.info()

In [None]:
# Make the date column the index
df = df.set_index('Date')
df.head()

In [None]:
# Plot every column with the pandas plotting API
df.plot()

### Timestamp

In [None]:
# Create a Timestamp from a date-time string
ts = pd.Timestamp("2022-03-25 07:23:59")
ts

In [None]:
# Timestamp attributes: print the calendar and clock fields of `ts`
print("year:", ts.year)  # year
print("month:", ts.month)  # month
print("quarter:", ts.quarter)  # quarter
print("day:", ts.day)  # day of the month
# `Timestamp.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
# The ISO calendar tuple is (year, week, weekday); element 1 is the same
# week-of-year number and is available on every pandas/Python version.
print("weekofyear:", ts.isocalendar()[1])  # week number within the year
print("dayofyear:", ts.dayofyear)  # day number within the year
print("dayofweek:", ts.dayofweek)  # day number within the week
print("weekday:", ts.weekday())
print("hour:", ts.hour)
print("minute:", ts.minute)
print("second:", ts.second)

### date range  
https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases

In [None]:
# Month-end dates from 2020-01-01 through 2021-06-30.
# An explicit offset object replaces the "M" alias, which pandas 2.2
# deprecated in favour of "ME"; the object form works on old and new versions.
date_ids = pd.date_range(start="2020-01-01", end="2021-06-30", freq=pd.offsets.MonthEnd())
date_ids

In [None]:
# Every second month-end (a 2-month step, not 2 weeks).
# MonthEnd(2) is the version-independent equivalent of the "2M" alias,
# which pandas 2.2 deprecated in favour of "2ME".
date_ids = pd.date_range(start="2020-01-01", end="2021-06-30", freq=pd.offsets.MonthEnd(2))
date_ids

In [None]:
# First element of the date index
date_ids[0]

### time delta

In [None]:
# Time difference: subtract element 0 from element 5
days = date_ids[5] - date_ids[0]
days

In [None]:
# Extract only the number of days from the difference
days.days

In [None]:
# Construct a one-day Timedelta directly
pd.Timedelta(days=1)

### (DateTimeIndex) Slicing

In [None]:
# The frame can be sliced by time through its date index.
df.head()

In [None]:
# Label-based row selection: keep only the rows from 2006-03 through 2006-05
df.loc['2006-03':'2006-05']

In [None]:
# Label-based row selection: all rows from 2006 through 2007
df.loc['2006':'2007']

# 결측값 처리

In [None]:
# Create artificial missing values in a single-column AAPL frame.
# .copy() makes `apple` an independent frame, so the positional assignment
# below cannot touch `df` and does not trigger SettingWithCopyWarning.
apple = df[['AAPL']].copy()

np.random.seed(20)  # fixed seed so the same positions are blanked on every run
# randint draws positions with replacement, so duplicates are possible and
# fewer than 300 distinct rows may end up as NaN.
apple.iloc[np.random.randint(len(df), size=300), 0] = np.nan
apple.isnull().sum()

### mean / median

In [None]:
# Fill the missing values with the mean of the observed values.
apple_mean = apple.copy()
mean_price = apple_mean['AAPL'].mean()
apple_mean['AAPL'] = apple_mean['AAPL'].fillna(mean_price)
apple_mean.isnull().sum()

In [None]:
# Fill the missing values with the median of the observed values.
apple_median = apple.copy()
median_price = apple_median['AAPL'].median()
apple_median['AAPL'] = apple_median['AAPL'].fillna(median_price)
apple_median.isnull().sum()

In [None]:
# Overlay the density estimates of the series with gaps and of the
# mean- and median-filled versions on a single axes for comparison.
fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(data=apple, linewidth=5, ax=ax, label="original")
sns.kdeplot(data=apple_mean, linewidth=2, linestyle='--', ax=ax, label="mean")
sns.kdeplot(data=apple_median, linewidth=2, linestyle=':', ax=ax, label="median")
plt.legend()
plt.show()

### ffill / bfill

In [None]:
# Small example series with one missing value between 3 and 4
test = pd.Series([1, 2, 3, np.nan, 4, 5])
test

In [None]:
# Forward fill: propagate the last valid value into the gap.
# Series.ffill() replaces fillna(method='ffill'), deprecated since pandas 2.1.
test.ffill()

In [None]:
# Backward fill: pull the next valid value back into the gap.
# Series.bfill() replaces fillna(method='bfill'), deprecated since pandas 2.1.
test.bfill()

In [None]:
# Forward-fill the gaps in the AAPL series.
# Series.ffill() replaces fillna(method='ffill'), deprecated since pandas 2.1.
apple_ffill = apple.copy()
apple_ffill['AAPL'] = apple_ffill['AAPL'].ffill()
apple_ffill.isnull().sum()

In [None]:
# Backward-fill the gaps in the AAPL series.
# Series.bfill() replaces fillna(method='bfill'), deprecated since pandas 2.1.
apple_bfill = apple.copy()
apple_bfill['AAPL'] = apple_bfill['AAPL'].bfill()
apple_bfill.isnull().sum()

In [None]:
# Overlay the density estimates of the series with gaps and of the
# forward- and backward-filled versions on a single axes for comparison.
fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(data=apple, linewidth=5, ax=ax, label="original")
sns.kdeplot(data=apple_ffill, linewidth=2, linestyle='--', ax=ax, label="ffill")
sns.kdeplot(data=apple_bfill, linewidth=2, linestyle=':', ax=ax, label="bfill")
plt.legend()
plt.show()

### interpolate

In [None]:
# Series running from 1 to 5 with the four values in between set to missing
test = pd.Series([1, np.nan, np.nan, np.nan, np.nan, 5])
test

In [None]:
# Fill the gap by interpolating between the two known end points
test.interpolate(method='linear')
# linear: linear interpolation

In [None]:
# Same gap, filled with the 'nearest' interpolation method instead
test.interpolate(method='nearest')

In [None]:
# Linear interpolation of the AAPL gaps, then count any remaining NaNs
apple_linear = apple.copy()
apple_linear['AAPL'] = apple_linear['AAPL'].interpolate(method='linear')
apple_linear.isnull().sum()

In [None]:
# 'nearest' interpolation of the AAPL gaps, then count any remaining NaNs
apple_nearest = apple.copy()
apple_nearest['AAPL'] = apple_nearest['AAPL'].interpolate(method='nearest')
apple_nearest.isnull().sum()

In [None]:
# Overlay the density estimates of the series with gaps and of the
# linear- and nearest-interpolated versions on a single axes for comparison.
fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(data=apple, linewidth=5, ax=ax, label="original")
sns.kdeplot(data=apple_linear, linewidth=2, linestyle='--', ax=ax, label="linear")
sns.kdeplot(data=apple_nearest, linewidth=2, linestyle=':', ax=ax, label="nearest")
plt.legend()
plt.show()

### ML 모델

In [None]:
from sklearn.impute import KNNImputer

# KNN imputation of the AAPL gaps for several neighbourhood sizes.
apple_knn = apple.copy()

# KNNImputer measures distance on the features that are observed in both rows.
# With AAPL as the only feature, a row whose AAPL is missing has no defined
# distance to any other row, so every k falls back to the column mean and the
# three results are identical. Adding the row position as a fully observed
# second feature makes "nearest" mean "closest in time".
knn_features = np.column_stack(
    [np.arange(len(apple_knn)), apple_knn["AAPL"].to_numpy()]
)

for k in [3, 5, 7]:
    imp = KNNImputer(n_neighbors=k)
    # Column 1 of the result is the imputed AAPL series.
    apple_knn[f"k={k}"] = imp.fit_transform(knn_features)[:, 1]

In [None]:
# Overlay the density estimates of the series with gaps and of the
# KNN-imputed columns (k = 3, 5, 7) on a single axes for comparison.
fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(data=apple, linewidth=5, ax=ax, label="original")
sns.kdeplot(data=apple_knn['k=3'], linewidth=2, linestyle='--', ax=ax, label="k=3")
sns.kdeplot(data=apple_knn['k=5'], linewidth=2, linestyle=':', ax=ax, label="k=5")
sns.kdeplot(data=apple_knn['k=7'], linewidth=2, linestyle='-.', ax=ax, label="k=7")
plt.legend()
plt.show()

# lag

In [None]:
# Shift the series down by one row (lag 1)
apple_linear['AAPL'].shift(1)
# Each row now holds the value from the immediately preceding time step

In [None]:
# Negative shift: each row holds the value from two rows later (lead 2)
apple_linear['AAPL'].shift(-2)

In [None]:
# Add lagged copies of the AAPL column (1 and 2 rows back) as new columns.
apple_lags = apple_linear.copy()
for n_back in (1, 2):
    apple_lags[f'lag_{n_back}'] = apple_lags['AAPL'].shift(n_back)
apple_lags

In [None]:
# Row-over-row change: current value minus the lag-1 value
apple_lags['diff_lag_1'] = apple_lags['AAPL'] - apple_lags['lag_1']
apple_lags

In [None]:
# Plot each column in its own subplot
apple_lags.plot(figsize=(12, 8), subplots=True)

In [None]:
# Plot the manually computed lag-1 difference
apple_lags['diff_lag_1'].plot()

In [None]:
# diff(): built-in difference between each row and the row one step before

apple_lags['AAPL'].diff(1).plot()

In [None]:
# Percent change from one row to the next
apple_lags['AAPL'].pct_change().plot()

# Resampling

### asfreq 

In [None]:
# Show the linearly interpolated AAPL frame used in the cells below
apple_linear

In [None]:
# Convert the index to daily frequency ('D')
apple_linear.asfreq('D')

In [None]:
# Convert the index to a 10-day frequency
apple_linear.asfreq('10d')

### resample

In [None]:
# resample() on its own only builds the grouping object; no aggregation is applied here
apple_linear.resample('D')

In [None]:
# Mean per daily bin
apple_linear.resample('D').mean()

In [None]:
# Mean per 10-day bin
apple_linear.resample('10d').mean()

In [None]:
# Maximum per 10-day bin
apple_linear.resample('10d').max()

In [None]:
# First value in each 10-day bin
apple_linear.resample('10d').first()

In [None]:
# Monthly (month-end) min / max / mean / std.
# The MonthEnd offset object replaces the "M" alias, which pandas 2.2
# deprecated in favour of "ME"; the object form works on old and new versions.
apple_linear.resample(pd.offsets.MonthEnd()).agg(['min', 'max', 'mean', 'std'])

In [None]:
# Smoothing: average the series over month, quarter and year bins.
# Explicit offset objects replace the "M" / "Q" / "Y" aliases, which pandas 2.2
# deprecated in favour of "ME" / "QE" / "YE"; the objects work on every version.
# QuarterEnd(startingMonth=12) is what the plain "Q" alias resolves to.
monthly_apple = apple_linear.resample(pd.offsets.MonthEnd())["AAPL"].mean()
quarterly_apple = apple_linear.resample(pd.offsets.QuarterEnd(startingMonth=12))["AAPL"].mean()
yearly_apple = apple_linear.resample(pd.offsets.YearEnd())["AAPL"].mean()

# Draw the raw series and the three smoothed versions on one figure.
plt.figure(figsize=(14, 5))
apple_linear['AAPL'].plot(label='original')
monthly_apple.plot(label='monthly')
quarterly_apple.plot(label='quarterly')
yearly_apple.plot(label='yearly')
plt.legend()

# div 비교

In [None]:
# Back to the two-ticker price frame: first five rows
df.head()

In [None]:
# Raw prices of both tickers on one chart
df.plot(figsize=(15, 4))

In [None]:
# Normalise every column by its first-row value so both tickers start at 1.
first_row = df.iloc[0]
df_div = df.div(first_row)
df_div.head()

In [None]:
# Plot the normalised prices for a relative comparison
df_div.plot(figsize=(15, 4))

# rolling window

In [None]:
# rolling() on its own only builds the 5-row window object; no aggregation is applied here
df.rolling(window=5)

In [None]:
# Mean over a sliding window of 5 rows
df.rolling(window=5).mean()

In [None]:
# Moving averages over 5-row and 120-row windows of the price frame.
moving_average_day5 = df.rolling(window=5).mean()
moving_average_day120 = df.rolling(window=120).mean()

plt.figure(figsize=(14, 5))
# Plot the same series the averages were computed from. The earlier
# `apple_linear` frame holds artificially removed and interpolated values,
# so it is not the "original" for these curves.
df['AAPL'].plot(label='original')
moving_average_day5['AAPL'].plot(label='ma_5d')
moving_average_day120['AAPL'].plot(label='ma_120d')
plt.legend()

# expanding window

In [None]:
# Cumulative sum via an expanding window
df["AAPL"].expanding(min_periods=1).sum()  # min_periods: sets the initial window size

In [None]:
# Cumulative sum computed with cumsum(), for comparison with the expanding-window version
df["AAPL"].cumsum()

In [None]:
# Cumulative minimum via an expanding window
df["AAPL"].expanding(min_periods=1).min() 

In [None]:
# Cumulative minimum computed with cummin(), for comparison with the expanding-window version
df["AAPL"].cummin()

In [None]:
# Cumulative maximum via an expanding window
df["AAPL"].expanding(min_periods=1).max() 

In [None]:
# Cumulative maximum computed with cummax(), for comparison with the expanding-window version
df["AAPL"].cummax()

In [None]:
# Visualization: AAPL price together with its running maximum and minimum
plt.figure(figsize=(14, 5))
aapl_price = df['AAPL']
aapl_price.plot(label='original')
aapl_price.cummax().plot(label='max')
aapl_price.cummin().plot(label='min')
plt.legend()