# Dates and times in pandas

Pandas can handle dates and times thanks to specific data types.

It has objects for points in time as well as for periods of time, with methods and attributes that reflect time-related details.

Sequences of dates and periods can be found as:
- series
- dataframe columns
- indexes

# Timestamp, Period, Frequency, date_range

In [None]:
import pandas as pd 
from datetime import datetime 
import seaborn as sns
import matplotlib.pyplot as plt

# Build a Timestamp from a standard-library datetime and check that it
# equals the one parsed from the ISO date string.
new_year = datetime(2024, 1, 1)
time_stamp = pd.Timestamp(new_year)
time_stamp == pd.Timestamp('2024-01-01')

In [None]:
type(time_stamp)

In [None]:
time_stamp

In [None]:
time_stamp.year

In [None]:
time_stamp.month

In [None]:
time_stamp.day_of_week

In [None]:
period = pd.Period('2024-01')
period

In [None]:
period.asfreq('D')

In [None]:
period.to_timestamp()

In [None]:
period.to_timestamp().to_period('M')

In [None]:
period + 2

In [None]:
index = pd.date_range(start='2017-1-1', periods=12, freq='ME')
index

In [None]:
index[0]

In [None]:
index.to_period()

There are many frequency aliases besides 'ME' and 'D':

| Period   | Alias |
|----------|-------|
| Hour     | h     |
| Day      | D     |
| Week     | W     |
| Month    | ME    |
| Quarter  | QE    |
| Year     | YE    |

These may be further differentiated by beginning/end of period, or business-specific definition.

You can also access these `pd.Timestamp()` attributes:

| attribute                            |
|--------------------------------------|
| .second, .minute, .hour              |
| .day, .month, .quarter, .year        |
| .weekday() (method)                  |
| .dayofweek                           |
| .weekofyear                          |
| .dayofyear                           |

In [None]:
# Parsing strings into datetime

In [None]:
google = pd.read_csv('../data/stock_data/google.csv')
google.head()

In [None]:
google.info()

In [None]:
google['Date'] = pd.to_datetime(google['Date'])

In [None]:
google.set_index('Date', inplace=True)

In [None]:
google.head()

In [None]:
sns.set()
google.plot(title='Google Stock Price')
plt.tight_layout()
plt.show()

In [None]:
google.loc['2015-1'].info()

In [None]:
google.loc['2015':'2016'].describe()

In [None]:
# We can add frequency information to the dataframe with asfreq().
# asfreq() returns a new DataFrame instead of modifying in place, so the
# result is assigned back; otherwise `google` keeps its original rows.
google = google.asfreq('D')

In [None]:
google.info()

In [None]:
google.head()

We have empty values in our dataframe due to the **upsampling** (the daily frequency also includes weekends)

**'B'** frequency stands for business day

In [None]:
google.asfreq('B').head()

# Timeseries Calculations 

## Shifting and Lagging

In [None]:
google = pd.read_csv('../data/stock_data/google.csv', parse_dates=['Date'], index_col='Date')
google.info()

In [None]:
# shift(1) moves each closing price one row down (the previous row's value),
# shift(-1) moves it one row up (the next row's value).
close = google['Close']
google['shifted'] = close.shift(periods=1)
google['lagged'] = close.shift(periods=-1)
google

## Calculate one-period percent change

In [None]:
# Ratio of each close to the previous one (x_t / x_{t-1}); 1.0 means "no change".
google['change'] = google.Close.div(google.shifted)
# Convert the ratio into the one-period percent change: (ratio - 1) * 100.
google['return'] = google['change'].sub(1).mul(100)
google

## Diff

In [None]:
google['diff'] = google.Close.diff() 
google

## Percent Change

In [None]:
google['pct_change'] = google.Close.pct_change()
google

## Compare timeseries growth rate

A common approach is to divide the whole series by its first data point (and optionally multiply by 100), so that every series starts at the same value.


In [None]:
google.Close.iloc[0]

In [None]:
normalized = google.Close.div(google.Close.iloc[0]).mul(100)
normalized

In [None]:
normalized.plot()

In [None]:
prices = pd.read_csv('../data/stock_data/stock_data.csv', parse_dates=['Date'], index_col='Date')[['AAPL', 'AMZN', 'TEF']]
prices

In [None]:
prices.plot()

In [None]:
prices = prices.div(prices.iloc[0])

In [None]:
prices.plot()

In [None]:
sp500 = pd.read_csv('../data/stock_data/sp500.csv', parse_dates=['date'], index_col='date')
sp500.info()

In [None]:
prices.info()

In [None]:
sp500 = sp500.loc['2010-01-04':'2016-12-30']

In [None]:
sp500.info()

In [None]:
prices = pd.concat([prices, sp500], axis=1)

In [None]:
prices

In [None]:
prices.isna().sum()

In [None]:
prices = prices.dropna()

In [None]:
prices.isna().sum()

In [None]:
prices['SP500'] = prices['SP500'].div(prices['SP500'].iloc[0])

In [None]:
prices

In [None]:
prices.plot()

## Resampling

When we change the frequency of a dataset it affects the data itself too.

When upsampling, you have to tell pandas how to fill the newly created data points (fill or interpolate).

When downsampling you have to specify how to aggregate the values too.

In [None]:
# Set start and end dates
start = '2016-1-1'
end = '2016-2-29'

# Create monthly_dates here
monthly_dates = pd.date_range(start=start, end=end, freq='M')

# Create and print monthly here
monthly = pd.Series(data=[1,2], index=monthly_dates)
print(monthly)

# Create weekly_dates here
weekly_dates = pd.date_range(start=start, end=end, freq='W')

# Print monthly, reindexed using weekly_dates
print(monthly.reindex(weekly_dates))
print(monthly.reindex(weekly_dates, method='bfill'))
print(monthly.reindex(weekly_dates, method='ffill'))

In [None]:
# Load the CSV with its 'date' column parsed and used as the index
unrate_path = '../data/stock_data/unrate_2000.csv'
data = pd.read_csv(unrate_path, parse_dates=['date'], index_col='date')

# Weekly frequency without any fill: show the first five rows
weekly_plain = data.asfreq('W')
print(weekly_plain.head())

# Weekly frequency, back-filled: show the first five rows
weekly_bfill = data.asfreq('W', method='bfill')
print(weekly_bfill.head())

# Weekly frequency, forward-filled: keep it and show the first five rows
weekly_ffill = data.asfreq('W', method='ffill')
print(weekly_ffill.head())

# Plot weekly_ffill from 2015 onwards
from_2015 = weekly_ffill.loc['2015':]
from_2015.plot()
plt.show()

In [None]:
# Import & inspect data here.
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
data = pd.read_csv('../data/stock_data/debt_unemployment.csv', parse_dates=['date'], index_col='date')
data.info()

# Interpolate the missing values (default method) and inspect the result
interpolated = data.interpolate()
interpolated.info()

# Plot interpolated data, with 'Unemployment' on a secondary y-axis
interpolated.plot(secondary_y='Unemployment')
plt.show()

# Downsampling and aggregation methods

Downsampling is about reducing the frequency: daily to monthly, hourly to daily...

Mean? Median? Last value? 



In [None]:
ozone = pd.read_csv('../data/air_quality_data/ozone_nyc.csv', parse_dates=['date'], index_col='date')
ozone

In [None]:
ozone.info()

In [None]:
# asfreq() returns a new DataFrame and leaves `ozone` untouched, so info()
# is called on the returned daily-frequency frame to inspect the result.
ozone.asfreq('D').info()

In [None]:
# Convert to monthly by averaging each month. 'ME' is the month-end alias
# already used earlier in this notebook; the older 'M' spelling is deprecated.
ozone.resample('ME').mean().head()

In [None]:
# We can aggregate in several ways at once: monthly mean and standard
# deviation ('ME' = month end, matching the alias used earlier).
ozone.resample('ME').agg(['mean', 'std']).head()

In [None]:
# Keep the data from 2016 onwards and plot it
ozone = ozone.loc['2016':]
ax = ozone.plot()

# Overlay the monthly averages on the same axes ('ME' = month end, matching
# the alias used earlier); the suffix keeps the legend entries distinct.
monthly = ozone.resample('ME').mean()
monthly.add_suffix('_monthly').plot(ax=ax)

# Window Functions

Windows specify sub periods of your time series

Calculate metrics for sub periods inside the window

Create a new time series of metrics

There are two main window types: 
- Rolling
- Expanding

## Rolling Windows

In [None]:
google = pd.read_csv('../data/stock_data/google.csv', parse_dates=['Date'], index_col='Date')
ax = google.plot()

In [None]:
ax = google.plot()
google.rolling(window='30D').mean().plot(ax=ax)
plt.show()

In [None]:
r = google.Close.rolling('30D').agg(['mean', 'std'])
r.plot(subplots=True)

In [None]:
# Resample to daily frequency, interpolate the gaps and inspect the result.
# The inspection targets `ozone` (the frame just rebuilt), and info() is
# called directly because it prints its own output and returns None.
ozone = ozone.resample('D').interpolate()
ozone.info()

# Create the rolling window over 60 observations of the Ozone column
rolling = ozone.Ozone.rolling(60)

# Add the rolling 10%, 50% and 90% quantiles as new columns
ozone['q10'] = rolling.quantile(0.1)
ozone['q50'] = rolling.quantile(0.5)
ozone['q90'] = rolling.quantile(0.9)

# Plot the data
ozone.plot()
plt.show()

## Expanding Windows

Calculate metrics for periods up to current date

New time series reflects all historical values

Two options in pandas
- *.expanding()* (like rolling)
- *.cumsum()*, *.cumprod()*, *.cummin()*, *.cummax()*


In [None]:
# Toy series 0..4 showing that an expanding-window sum and cumsum() give
# the same running total.
values = pd.Series(range(5), name='data')
df = pd.DataFrame({
    'data': values,
    'expanding sum': values.expanding().sum(),
    'cumulative sum': values.cumsum(),
})

df

In [None]:
google = pd.read_csv('../data/stock_data/google.csv', parse_dates=['Date'], index_col='Date')
google['running min'] = google.Close.expanding().min() 
google['running max'] = google.Close.expanding().max() 

In [None]:
google.plot()

# SP500 price simulation



In [None]:
from numpy.random import normal, seed 
from scipy.stats import norm

seed(42)

random_returns = normal(loc=0, scale=0.01, size=1000)
sns.displot(random_returns, kde=True)

In [None]:
return_series = pd.Series(random_returns)
random_prices = return_series.add(1).cumprod().sub(1)
random_prices.mul(100).plot()

In [None]:
data = pd.read_csv('../data/stock_data/sp500.csv', parse_dates=['date'], index_col='date')
data

In [None]:
data['returns'] = data.SP500.pct_change() 
data.plot(subplots=True) 


In [None]:
sns.displot(data.returns.dropna().mul(100), kde=True)

In [None]:
from numpy.random import choice

# Observed returns with missing values removed, and how many there are
sample = data.returns.dropna()
n_obs = data.returns.count()

# Draw n_obs values at random from the observed returns and attach the
# sample's index to them
random_walk = pd.Series(choice(sample, size=n_obs), index=sample.index)
random_walk.head()

# Relationships between time series: Correlation

**Correlation coefficient**: how similar is the pairwise movement (covariance) of two variables around their averages


In [None]:
data = pd.read_csv('../data/stock_data/asset_classes.csv', parse_dates=['DATE'], index_col='DATE')
data.info()

In [None]:
daily_returns = data.pct_change()
daily_returns

In [None]:
sns.jointplot(x='SP500', y='Bonds', data=daily_returns)

In [None]:
sns.jointplot(x='Gold', y='Oil', data=daily_returns)

In [None]:
sns.jointplot(x='Bonds', y='Oil', data=daily_returns)

In [None]:
correlations = daily_returns.corr()
correlations

In [None]:
sns.heatmap(correlations, annot=True)