In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import pandas as pd
import numpy as np
from pandas import DataFrame

## [ ], loc, iloc

### Use `loc[]` to choose rows and columns by label
### Use `iloc[]` to choose rows and columns by position

In [None]:
# 6x4 frame of standard-normal draws: rows labelled a-f, columns labelled A-D.
df1 = DataFrame(
    data=np.random.randn(6, 4),
    index=['a', 'b', 'c', 'd', 'e', 'f'],
    columns=['A', 'B', 'C', 'D'],
)
df1

### Integers are valid labels

In [None]:
# Replace the row labels with integers; label 1 occurs three times and label 2 twice.
df1.index = [1,2,1,2,1,3]
df1

In [None]:
# loc looks rows up by label, not by position: label 1 matches every row labelled 1.
df1.loc[1]
# Label 3 occurs only once in the index.
df1.loc[3]

## `[]` v. `loc` and `iloc` 

In [None]:
# Put the letter labels a-f back on the rows.
df1.index = list('abcdef')
df1

In [None]:
# Two ways to select column 'A': label-based indexing with loc (all rows, column 'A') ...
df1.loc[:,'A']
# ... and plain [] with a column name.
df1['A']

In [None]:
# With an integer slice, [] selects rows by position, just like iloc.
df1[1:3]
df1.iloc[1:3]
# df1.loc[1:3] would raise a TypeError: loc slices by label, and the row labels here are strings.

## `[]` does not work in the following situations:
* select a single row with labels
* select a list of rows with labels
* slice columns

In [None]:
# Switch IPython to its compact "Plain" traceback format.
%xmode Plain

In [None]:
# loc selects the single row labelled 'a'.
df1.loc['a']

In [None]:
# Plain [] with a single label looks the label up among the *columns*, so the
# row label 'a' is not found. The KeyError is the point of this cell: catch it
# and show the message so the notebook still runs top to bottom.
try:
    df1['a']
except KeyError as err:
    print(f"KeyError: {err}")

In [None]:
# loc accepts a list of row labels and returns those rows.
df1.loc[['a','b']]

In [None]:
# Plain [] with a list of labels looks them up among the *columns*, so the row
# labels 'a' and 'b' are not found. The KeyError is the point of this cell:
# catch it and show the message so the notebook still runs top to bottom.
try:
    df1[['a', 'b']]
except KeyError as err:
    print(f"KeyError: {err}")

In [None]:
# loc slices columns by label; both end labels are included.
df1.loc[:, 'A':'D']
# With a slice, plain [] is applied to the row index, so this does NOT return columns A-D.
df1['A':'D']

## `[]` is intended to get *columns* with particular names.

In [None]:
# Show the current contents of df1.
df1

In [None]:
# Chained indexing: df1[0:1] first produces an intermediate object, and the
# assignment is then applied to that object rather than to df1 directly.
# NOTE(review): whether df1 itself ends up changed depends on the pandas version
# (view vs. copy, copy-on-write), and pandas may emit a warning here — confirm.
df1[0:1]['A'] = 1000
df1

In [None]:
# One loc call with (row label, column label) assigns directly into df1.
df1.loc['a','A'] = 100
df1

## Date and Time

 - create a date range
 - work with timestamp data
 - convert string data to a timestamp
 - index and slice time series data

In [None]:
from datetime import datetime

# Hourly timestamps from 1 Oct 2019 00:00 through 8 Oct 2019 00:00, both ends included.
# ISO-formatted dates remove the month/day ambiguity of '10/1/2019', and the
# lowercase 'h' alias is the hourly frequency accepted by current pandas
# (the uppercase 'H' alias is deprecated).
date_rng = pd.date_range(start='2019-10-01', end='2019-10-08', freq='h')
date_rng

In [None]:
# Inspect the type of a single element of the date range.
type(date_rng[0])

In [None]:
# One row per timestamp, plus a column of random integers drawn from [0, 100).
df = pd.DataFrame({
    'date': date_rng,
    'data': np.random.randint(0, 100, size=len(date_rng)),
})
df.head(15)

### To do time series manipulation, we need a datetime index.

In [None]:
# Add a parsed copy of the 'date' column; it becomes the row index below.
df['datetime'] = pd.to_datetime(df['date'])
df.head()
# Index by 'datetime' and discard the now-redundant 'date' column in one chain.
df2 = df.set_index('datetime').drop(columns='date')
df2.head()
df2.shape

In [None]:
# Render every timestamp as plain text, then peek at the first ten strings.
string_date_rng = list(map(str, date_rng))
string_date_rng[:10]

## Convert the strings to timestamps

In [None]:
# Parse the strings back into timestamps. The strings were produced by str() on
# pandas Timestamps ('YYYY-MM-DD HH:MM:SS'), so give the format explicitly; this
# replaces the misspelled `binfer_datetime_format` keyword, which raised a
# TypeError (the intended `infer_datetime_format` option is itself deprecated).
timestamp_date_rng = pd.to_datetime(string_date_rng, format='%Y-%m-%d %H:%M:%S')
timestamp_date_rng

In [250]:
# Dates written as <month name>-<day>-<year>; let pandas work out the format itself.
string_date_rng_2 = [
    'June-01-2018',
    'June-02-2018',
    'June-03-2018',
]
pd.to_datetime(string_date_rng_2)

DatetimeIndex(['2018-06-01', '2018-06-02', '2018-06-03'], dtype='datetime64[ns]', freq=None)

In [280]:
# Option 1: the standard library parses one string at a time -> list of datetime objects.
timestamp_date_rng_2 = [
    datetime.strptime(text, '%B-%d-%Y')
    for text in string_date_rng_2
]
timestamp_date_rng_2
# Option 2: pandas parses the whole list with the same format -> DatetimeIndex.
timestamp_date_rng_2 = pd.to_datetime(string_date_rng_2, format='%B-%d-%Y')
timestamp_date_rng_2

[datetime.datetime(2018, 6, 1, 0, 0),
 datetime.datetime(2018, 6, 2, 0, 0),
 datetime.datetime(2018, 6, 3, 0, 0)]

DatetimeIndex(['2018-06-01', '2018-06-02', '2018-06-03'], dtype='datetime64[ns]', freq=None)

In [274]:
# Build a one-column frame from the timestamps parsed earlier ...
df3 = pd.DataFrame({'date': timestamp_date_rng_2})
df3
# ... and the same frame again, parsing the raw strings on the fly.
df3 = pd.DataFrame({'date': pd.to_datetime(string_date_rng_2)})
df3

Unnamed: 0,date
0,2018-06-01
1,2018-06-02
2,2018-06-03


Unnamed: 0,date
0,2018-06-01
1,2018-06-02
2,2018-06-03


## Parse on timestamp index

In [None]:
# Boolean mask on the index: keep the rows whose timestamp falls on day 5 of a month.
df2[df2.index.day == 5]

In [None]:
# Select every row of 5 Oct 2019 via partial-string indexing. This is a row
# lookup, so it has to go through loc: plain [] treats a single string as a
# column name (current pandas raises KeyError for df2['2019-10-05']).
df2.loc['2019-10-05']

In [None]:
# All rows from the start of 4 Oct 2019 through the end of 5 Oct 2019.
df2.loc['2019-10-04':'2019-10-05']

## How to get daily average?

### `DataFrame.resample`: a convenience method for frequency conversion and resampling of time series.

In [324]:
# Daily mean, with each bin labelled by its left edge.
daily_mean = df2.resample('D', label='left').mean()
# Move every bin label forward by one second. The `loffset` argument that used
# to do this was removed from resample(), so add the offset to the index instead.
daily_mean.index = daily_mean.index + pd.Timedelta(seconds=1)
daily_mean

Unnamed: 0_level_0,data,rolling_sum,rolling_sum_backfilled
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-10-01 00:00:01,51.791667,161.318182,160.375
2019-10-02 00:00:01,47.833333,142.208333,142.208333
2019-10-03 00:00:01,47.541667,139.958333,139.958333
2019-10-04 00:00:01,50.041667,152.958333,152.958333
2019-10-05 00:00:01,48.25,143.875,143.875
2019-10-06 00:00:01,45.916667,140.708333,140.708333
2019-10-07 00:00:01,59.125,169.916667,169.916667
2019-10-08 00:00:01,31.0,179.0,179.0


In [322]:
# Sum of the current and two preceding 'data' values; the first two rows are NaN
# because their window is incomplete. Roll over the 'data' column explicitly:
# rolling the whole frame returns one column per existing column, which cannot
# be assigned to a single column once df2 has more than one (e.g. on re-run).
df2['rolling_sum'] = df2['data'].rolling(3).sum()
df2.head(10)

Unnamed: 0_level_0,data,rolling_sum,rolling_sum_backfilled
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-10-01 00:00:00,48,,150.0
2019-10-01 01:00:00,7,,150.0
2019-10-01 02:00:00,95,150.0,150.0
2019-10-01 03:00:00,15,117.0,117.0
2019-10-01 04:00:00,17,127.0,127.0
2019-10-01 05:00:00,76,108.0,108.0
2019-10-01 06:00:00,3,96.0,96.0
2019-10-01 07:00:00,66,145.0,145.0
2019-10-01 08:00:00,80,149.0,149.0
2019-10-01 09:00:00,74,220.0,220.0


In [None]:
# Fill each missing rolling sum with the next valid value below it.
# .bfill() replaces the deprecated fillna(method='backfill') spelling.
df2['rolling_sum_backfilled'] = df2['rolling_sum'].bfill()
df2.head(10)

In [302]:
# 700 one-second timestamps starting at midnight on 1 Jan 2012, each paired with
# a random integer from [0, 500). The lowercase 's' alias is the per-second
# frequency accepted by current pandas (the uppercase 'S' alias is deprecated).
rng = pd.date_range('2012-01-01', periods=700, freq='s')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.head(10)
# Sum into 5-second bins that are closed on their right edge.
ts.resample('5s', closed='right').sum()

2012-01-01 00:00:00    237
2012-01-01 00:00:01    313
2012-01-01 00:00:02    435
2012-01-01 00:00:03     34
2012-01-01 00:00:04    408
2012-01-01 00:00:05    299
2012-01-01 00:00:06    194
2012-01-01 00:00:07    432
2012-01-01 00:00:08    184
2012-01-01 00:00:09    148
Freq: S, dtype: int64

2011-12-31 23:59:55     237
2012-01-01 00:00:00    1489
2012-01-01 00:00:05    1072
2012-01-01 00:00:10    1395
2012-01-01 00:00:15    1568
                       ... 
2012-01-01 00:11:15    1365
2012-01-01 00:11:20    1435
2012-01-01 00:11:25    1214
2012-01-01 00:11:30     967
2012-01-01 00:11:35    1314
Freq: 5S, Length: 141, dtype: int64

## Divide a given date into features – `pd.Series.dt`

* `pandas.Series.dt.year` returns the year of the date time.
* `pandas.Series.dt.month` returns the month of the date time.
* `pandas.Series.dt.day` returns the day of the date time.
* `pandas.Series.dt.hour` returns the hour of the date time.
* `pandas.Series.dt.minute` returns the minute of the date time.



In [334]:
# Five consecutive days as a Series whose values (and index) are the timestamps.
s = pd.date_range('2019-01-01', '2019-01-05').to_series()
# Day-of-week names. .dt.day_name() replaces the removed .dt.weekday_name attribute.
s.dt.day_name()
# Resample to business days (right-closed bins), keep the last timestamp in each
# bin, and show its weekday name.
s.resample('B', closed='right').last().dt.day_name()

2019-01-01      Tuesday
2019-01-02    Wednesday
2019-01-03     Thursday
2019-01-04       Friday
2019-01-05     Saturday
Freq: D, dtype: object

2018-12-31      Tuesday
2019-01-01    Wednesday
2019-01-02     Thursday
2019-01-03       Friday
2019-01-04     Saturday
Freq: B, dtype: object

In [None]:
# Create date and time with dataframe 
rng = pd.DataFrame() 
rng['date'] = pd.date_range('1/1/2019', periods = 72, freq ='H') 

In [248]:
# Preview the first five rows
rng.head(5)

# Split each timestamp into year, month, day, hour, and minute feature columns
date_parts = rng['date'].dt
rng['year'] = date_parts.year
rng['month'] = date_parts.month
rng['day'] = date_parts.day
rng['hour'] = date_parts.hour
rng['minute'] = date_parts.minute

# Show the first three rows with the new feature columns
rng.head(3)

Unnamed: 0,date,year,month,day,hour,minute
0,2019-01-01 00:00:00,2019,1,1,0,0
1,2019-01-01 01:00:00,2019,1,1,1,0
2,2019-01-01 02:00:00,2019,1,1,2,0
3,2019-01-01 03:00:00,2019,1,1,3,0
4,2019-01-01 04:00:00,2019,1,1,4,0


Unnamed: 0,date,year,month,day,hour,minute
0,2019-01-01 00:00:00,2019,1,1,0,0
1,2019-01-01 01:00:00,2019,1,1,1,0
2,2019-01-01 02:00:00,2019,1,1,2,0
