In [134]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [135]:
import pandas as pd
import numpy as np
from pandas import DataFrame

## [ ], loc, iloc

### Use `loc[]` to choose rows and columns by label
### Use `iloc[]` to choose rows and columns by position

In [136]:
df1 = DataFrame(np.random.randn(6,4),
                index=list('abcdef'),
                columns=list('ABCD'))
df1

Unnamed: 0,A,B,C,D
a,0.139698,-0.271281,1.89674,0.613193
b,-1.144205,-0.959415,1.078328,-0.321508
c,-1.003033,0.930326,-1.332825,1.54396
d,-1.313167,-0.822822,0.979113,0.137711
e,-0.287618,-0.842127,-0.935227,1.886676
f,0.798003,0.014693,-1.726211,-0.275323


### Integers are valid labels

In [137]:
df1.index = [1,2,1,2,1,3]
df1

Unnamed: 0,A,B,C,D
1,0.139698,-0.271281,1.89674,0.613193
2,-1.144205,-0.959415,1.078328,-0.321508
1,-1.003033,0.930326,-1.332825,1.54396
2,-1.313167,-0.822822,0.979113,0.137711
1,-0.287618,-0.842127,-0.935227,1.886676
3,0.798003,0.014693,-1.726211,-0.275323


In [140]:
df1.loc[1]
df1.loc[3]

Unnamed: 0,A,B,C,D
1,0.139698,-0.271281,1.89674,0.613193
1,-1.003033,0.930326,-1.332825,1.54396
1,-0.287618,-0.842127,-0.935227,1.886676


A    0.798003
B    0.014693
C   -1.726211
D   -0.275323
Name: 3, dtype: float64

## `[]` v. `loc` and `iloc` 

In [141]:
df1.index = list('abcdef')
df1

Unnamed: 0,A,B,C,D
a,0.139698,-0.271281,1.89674,0.613193
b,-1.144205,-0.959415,1.078328,-0.321508
c,-1.003033,0.930326,-1.332825,1.54396
d,-1.313167,-0.822822,0.979113,0.137711
e,-0.287618,-0.842127,-0.935227,1.886676
f,0.798003,0.014693,-1.726211,-0.275323


In [144]:
df1.loc[:,'A']
df1['A']

a    0.139698
b   -1.144205
c   -1.003033
d   -1.313167
e   -0.287618
f    0.798003
Name: A, dtype: float64

a    0.139698
b   -1.144205
c   -1.003033
d   -1.313167
e   -0.287618
f    0.798003
Name: A, dtype: float64

In [145]:
df1[1:3]
df1.iloc[1:3]
# df1.loc[1:3] 
# will throw error 

Unnamed: 0,A,B,C,D
b,-1.144205,-0.959415,1.078328,-0.321508
c,-1.003033,0.930326,-1.332825,1.54396


Unnamed: 0,A,B,C,D
b,-1.144205,-0.959415,1.078328,-0.321508
c,-1.003033,0.930326,-1.332825,1.54396


TypeError: cannot do slice indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [1] of <class 'int'>

## `[]` does not work in the following situations:
* select a single row with labels
* select a list of rows with labels
* slice columns

In [146]:
%xmode Plain

Exception reporting mode: Plain


In [147]:
df1.loc['a']

A    0.139698
B   -0.271281
C    1.896740
D    0.613193
Name: a, dtype: float64

In [151]:
df1['a']

KeyError: 'a'

In [152]:
df1.loc[['a','b']]

Unnamed: 0,A,B,C,D
a,0.139698,-0.271281,1.89674,0.613193
b,-1.144205,-0.959415,1.078328,-0.321508


In [153]:
df1[['a','b']]

KeyError: "None of [Index(['a', 'b'], dtype='object')] are in the [columns]"

In [155]:
df1.loc[:, 'A':'C']

Unnamed: 0,A,B,C
a,0.139698,-0.271281,1.89674
b,-1.144205,-0.959415,1.078328
c,-1.003033,0.930326,-1.332825
d,-1.313167,-0.822822,0.979113
e,-0.287618,-0.842127,-0.935227
f,0.798003,0.014693,-1.726211


In [157]:
df1['A':'D']

Unnamed: 0,A,B,C,D
a,0.139698,-0.271281,1.89674,0.613193
b,-1.144205,-0.959415,1.078328,-0.321508
c,-1.003033,0.930326,-1.332825,1.54396


## `[] is intended` to get *columns* with particular names.

In [158]:
df2 = df1[:]

Unnamed: 0,A,B,C,D
a,0.139698,-0.271281,1.89674,0.613193
b,-1.144205,-0.959415,1.078328,-0.321508
c,-1.003033,0.930326,-1.332825,1.54396
d,-1.313167,-0.822822,0.979113,0.137711
e,-0.287618,-0.842127,-0.935227,1.886676
f,0.798003,0.014693,-1.726211,-0.275323


In [159]:
df1[0:1]['A'] = 1000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [160]:
df1

Unnamed: 0,A,B,C,D
a,0.139698,-0.271281,1.89674,0.613193
b,-1.144205,-0.959415,1.078328,-0.321508
c,-1.003033,0.930326,-1.332825,1.54396
d,-1.313167,-0.822822,0.979113,0.137711
e,-0.287618,-0.842127,-0.935227,1.886676
f,0.798003,0.014693,-1.726211,-0.275323


In [161]:
df1.loc['a','A'] = 10000
df1

Unnamed: 0,A,B,C,D
a,10000.0,-0.271281,1.89674,0.613193
b,-1.144205,-0.959415,1.078328,-0.321508
c,-1.003033,0.930326,-1.332825,1.54396
d,-1.313167,-0.822822,0.979113,0.137711
e,-0.287618,-0.842127,-0.935227,1.886676
f,0.798003,0.014693,-1.726211,-0.275323


## Date and Time

 - create a date range
 - work with timestamp data
 - convert string data to a timestamp
 - index and slice time series data

In [166]:
from datetime import datetime

date_rng = pd.date_range(start='10/1/2019', end='10/18/2019', freq='H')
date_rng
type(date_rng[0])

DatetimeIndex(['2019-10-01 00:00:00', '2019-10-01 01:00:00',
               '2019-10-01 02:00:00', '2019-10-01 03:00:00',
               '2019-10-01 04:00:00', '2019-10-01 05:00:00',
               '2019-10-01 06:00:00', '2019-10-01 07:00:00',
               '2019-10-01 08:00:00', '2019-10-01 09:00:00',
               ...
               '2019-10-17 15:00:00', '2019-10-17 16:00:00',
               '2019-10-17 17:00:00', '2019-10-17 18:00:00',
               '2019-10-17 19:00:00', '2019-10-17 20:00:00',
               '2019-10-17 21:00:00', '2019-10-17 22:00:00',
               '2019-10-17 23:00:00', '2019-10-18 00:00:00'],
              dtype='datetime64[ns]', length=409, freq='H')

pandas._libs.tslibs.timestamps.Timestamp

In [None]:
type(date_rng[0])

In [171]:
df = pd.DataFrame(date_rng, columns=['date'])
df

Unnamed: 0,date
0,2019-10-01 00:00:00
1,2019-10-01 01:00:00
2,2019-10-01 02:00:00
3,2019-10-01 03:00:00
4,2019-10-01 04:00:00
...,...
404,2019-10-17 20:00:00
405,2019-10-17 21:00:00
406,2019-10-17 22:00:00
407,2019-10-17 23:00:00


In [172]:
df['data'] = np.random.randint(0,100,size=(len(date_rng)))
df.head(15)

Unnamed: 0,date,data
0,2019-10-01 00:00:00,13
1,2019-10-01 01:00:00,51
2,2019-10-01 02:00:00,83
3,2019-10-01 03:00:00,26
4,2019-10-01 04:00:00,53
5,2019-10-01 05:00:00,91
6,2019-10-01 06:00:00,21
7,2019-10-01 07:00:00,83
8,2019-10-01 08:00:00,22
9,2019-10-01 09:00:00,77


### In order to do time series manipulation, we'll need to have date time index. 

In [169]:
# df['datetime'] = pd.to_datetime(df['date'])
# df.head()

Unnamed: 0,date,data,datetime
0,2019-10-01 00:00:00,99,2019-10-01 00:00:00
1,2019-10-01 01:00:00,11,2019-10-01 01:00:00
2,2019-10-01 02:00:00,31,2019-10-01 02:00:00
3,2019-10-01 03:00:00,38,2019-10-01 03:00:00
4,2019-10-01 04:00:00,42,2019-10-01 04:00:00


In [173]:
df2 = df.set_index('date')
# df2.drop(columns='date', inplace=True)
#Or: df.drop(['date'], axis=1, inplace=True)
df2.head()
df2.shape

Unnamed: 0_level_0,data
date,Unnamed: 1_level_1
2019-10-01 00:00:00,13
2019-10-01 01:00:00,51
2019-10-01 02:00:00,83
2019-10-01 03:00:00,26
2019-10-01 04:00:00,53


(409, 1)

In [174]:
string_date_rng = [str(x) for x in date_rng]
string_date_rng[:10]

['2019-10-01 00:00:00',
 '2019-10-01 01:00:00',
 '2019-10-01 02:00:00',
 '2019-10-01 03:00:00',
 '2019-10-01 04:00:00',
 '2019-10-01 05:00:00',
 '2019-10-01 06:00:00',
 '2019-10-01 07:00:00',
 '2019-10-01 08:00:00',
 '2019-10-01 09:00:00']

## Convert the strings to timestamps

In [175]:
timestamp_date_rng = pd.to_datetime(string_date_rng, infer_datetime_format=True)
timestamp_date_rng
type(timestamp_date_rng[0])

DatetimeIndex(['2019-10-01 00:00:00', '2019-10-01 01:00:00',
               '2019-10-01 02:00:00', '2019-10-01 03:00:00',
               '2019-10-01 04:00:00', '2019-10-01 05:00:00',
               '2019-10-01 06:00:00', '2019-10-01 07:00:00',
               '2019-10-01 08:00:00', '2019-10-01 09:00:00',
               ...
               '2019-10-17 15:00:00', '2019-10-17 16:00:00',
               '2019-10-17 17:00:00', '2019-10-17 18:00:00',
               '2019-10-17 19:00:00', '2019-10-17 20:00:00',
               '2019-10-17 21:00:00', '2019-10-17 22:00:00',
               '2019-10-17 23:00:00', '2019-10-18 00:00:00'],
              dtype='datetime64[ns]', length=409, freq=None)

pandas._libs.tslibs.timestamps.Timestamp

In [176]:
string_date_rng_2 = ['June-01-2018', 'June-02-2018', 'June-03-2018']
pd.to_datetime(string_date_rng_2)

DatetimeIndex(['2018-06-01', '2018-06-02', '2018-06-03'], dtype='datetime64[ns]', freq=None)

In [None]:
timestamp_date_rng_2 = [datetime.strptime(x,'%B-%d-%Y') for x in string_date_rng_2]
timestamp_date_rng_2

In [None]:
# timestamp_date_rng_2 = pd.to_datetime(string_date_rng_2, format='%B-%d-%Y')
# timestamp_date_rng_2

In [178]:
string_date_rng_2 = ['June-01-2018', 'June-02-2018', 'June-03-2018']
# timestamp_date_rng_2 = [datetime.strptime(x,'%B-%d-%Y') for x in string_date_rng_2]
# timestamp_date_rng_2
timestamp_date_rng_2 = pd.to_datetime(string_date_rng_2, format='%B-%d-%Y')
df3 = pd.DataFrame(timestamp_date_rng_2, columns=['date'])
df3

Unnamed: 0,date
0,2018-06-01
1,2018-06-02
2,2018-06-03


In [179]:
string_date_rng_2 = ['June-01-2018', 'June-02-2018', 'June-03-2018']
df3 = pd.DataFrame(pd.to_datetime(string_date_rng_2), columns=['date'])
df3

Unnamed: 0,date
0,2018-06-01
1,2018-06-02
2,2018-06-03


## Parse on timestamp index

In [180]:
df2

Unnamed: 0_level_0,data
date,Unnamed: 1_level_1
2019-10-01 00:00:00,13
2019-10-01 01:00:00,51
2019-10-01 02:00:00,83
2019-10-01 03:00:00,26
2019-10-01 04:00:00,53
...,...
2019-10-17 20:00:00,80
2019-10-17 21:00:00,98
2019-10-17 22:00:00,37
2019-10-17 23:00:00,73


In [184]:
df2.index.day

Int64Index([1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            ...
            3, 3, 3, 3, 3, 3, 3, 3, 3, 4],
           dtype='int64', name='date', length=409)

In [185]:
df2[df2.index.day == 8]

Unnamed: 0_level_0,data
date,Unnamed: 1_level_1
2019-10-08 00:00:00,17
2019-10-08 01:00:00,30
2019-10-08 02:00:00,78
2019-10-08 03:00:00,9
2019-10-08 04:00:00,22
2019-10-08 05:00:00,49
2019-10-08 06:00:00,97
2019-10-08 07:00:00,10
2019-10-08 08:00:00,28
2019-10-08 09:00:00,74


In [187]:
df2.loc['2019-10-05']

Unnamed: 0_level_0,data
date,Unnamed: 1_level_1
2019-10-05 00:00:00,64
2019-10-05 01:00:00,44
2019-10-05 02:00:00,22
2019-10-05 03:00:00,55
2019-10-05 04:00:00,57
2019-10-05 05:00:00,55
2019-10-05 06:00:00,72
2019-10-05 07:00:00,14
2019-10-05 08:00:00,71
2019-10-05 09:00:00,97


In [188]:
df2.loc['2019-10-04':'2019-10-05']

Unnamed: 0_level_0,data
date,Unnamed: 1_level_1
2019-10-04 00:00:00,79
2019-10-04 01:00:00,71
2019-10-04 02:00:00,24
2019-10-04 03:00:00,84
2019-10-04 04:00:00,80
2019-10-04 05:00:00,34
2019-10-04 06:00:00,70
2019-10-04 07:00:00,7
2019-10-04 08:00:00,87
2019-10-04 09:00:00,54


## How to get daily average?

### `pd.resample`: convenience method for frequency conversion and resampling of time series.

In [189]:
df2

Unnamed: 0_level_0,data
date,Unnamed: 1_level_1
2019-10-01 00:00:00,13
2019-10-01 01:00:00,51
2019-10-01 02:00:00,83
2019-10-01 03:00:00,26
2019-10-01 04:00:00,53
...,...
2019-10-17 20:00:00,80
2019-10-17 21:00:00,98
2019-10-17 22:00:00,37
2019-10-17 23:00:00,73


In [193]:
df2.resample('w').mean()

Unnamed: 0_level_0,data
date,Unnamed: 1_level_1
2019-10-06,52.986111
2019-10-13,48.577381
2019-10-20,48.237113


In [194]:
df2['rolling_sum'] = df2.rolling(4).sum()
df2.head(10)

Unnamed: 0_level_0,data,rolling_sum
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-10-01 00:00:00,13,
2019-10-01 01:00:00,51,
2019-10-01 02:00:00,83,
2019-10-01 03:00:00,26,173.0
2019-10-01 04:00:00,53,213.0
2019-10-01 05:00:00,91,253.0
2019-10-01 06:00:00,21,191.0
2019-10-01 07:00:00,83,248.0
2019-10-01 08:00:00,22,217.0
2019-10-01 09:00:00,77,203.0


In [195]:
df2['rolling_sum_backfilled'] = df2['rolling_sum'].fillna(method='backfill')
df2.head(10)

Unnamed: 0_level_0,data,rolling_sum,rolling_sum_backfilled
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-10-01 00:00:00,13,,173.0
2019-10-01 01:00:00,51,,173.0
2019-10-01 02:00:00,83,,173.0
2019-10-01 03:00:00,26,173.0,173.0
2019-10-01 04:00:00,53,213.0,213.0
2019-10-01 05:00:00,91,253.0,253.0
2019-10-01 06:00:00,21,191.0,191.0
2019-10-01 07:00:00,83,248.0,248.0
2019-10-01 08:00:00,22,217.0,217.0
2019-10-01 09:00:00,77,203.0,203.0


In [199]:
rng = pd.date_range('1/1/2012', periods=700, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.head(10)

2012-01-01 00:00:00    168
2012-01-01 00:00:01    473
2012-01-01 00:00:02    408
2012-01-01 00:00:03    249
2012-01-01 00:00:04    430
2012-01-01 00:00:05    436
2012-01-01 00:00:06     95
2012-01-01 00:00:07    470
2012-01-01 00:00:08    231
2012-01-01 00:00:09    148
Freq: S, dtype: int64

In [213]:
ts.resample('5S', closed='left', loffset='5min').sum().head()

2012-01-01 00:05:00    1728
2012-01-01 00:05:05    1380
2012-01-01 00:05:10    1010
2012-01-01 00:05:15     997
2012-01-01 00:05:20    1167
Freq: 5S, dtype: int64

In [205]:
ts.resample('5S', closed='right').sum().head()

2011-12-31 23:59:55     168
2012-01-01 00:00:00    1996
2012-01-01 00:00:05    1235
2012-01-01 00:00:10    1168
2012-01-01 00:00:15     890
Freq: 5S, dtype: int64

## Divide a given date into features – `pd.Series.dt`

* `pandas.Series.dt.year` returns the year of the date time.
* `pandas.Series.dt.month` returns the month of the date time.
* `pandas.Series.dt.day` returns the day of the date time.
* `pandas.Series.dt.hour` returns the hour of the date time.
* `pandas.Series.dt.minute` returns the minute of the date time.



In [207]:
s = pd.date_range('2019-01-01', '2019-01-05').to_series()
s

2019-01-01   2019-01-01
2019-01-02   2019-01-02
2019-01-03   2019-01-03
2019-01-04   2019-01-04
2019-01-05   2019-01-05
Freq: D, dtype: datetime64[ns]

In [208]:
# Create date and time with dataframe 
rng = pd.DataFrame() 
rng['date'] = pd.date_range('1/1/2019', periods = 72, freq ='H') 
rng

Unnamed: 0,date
0,2019-01-01 00:00:00
1,2019-01-01 01:00:00
2,2019-01-01 02:00:00
3,2019-01-01 03:00:00
4,2019-01-01 04:00:00
...,...
67,2019-01-03 19:00:00
68,2019-01-03 20:00:00
69,2019-01-03 21:00:00
70,2019-01-03 22:00:00


In [209]:
# Print the dates in dd-mm-yy format 
rng[:5] 

# Create features for year, month, day, hour, and minute 
rng['year'] = rng['date'].dt.year 
rng['month'] = rng['date'].dt.month 
rng['day'] = rng['date'].dt.day 
rng['hour'] = rng['date'].dt.hour 
rng['minute'] = rng['date'].dt.minute 

# Print the dates divided into features 
rng[:3]

Unnamed: 0,date
0,2019-01-01 00:00:00
1,2019-01-01 01:00:00
2,2019-01-01 02:00:00
3,2019-01-01 03:00:00
4,2019-01-01 04:00:00


Unnamed: 0,date,year,month,day,hour,minute
0,2019-01-01 00:00:00,2019,1,1,0,0
1,2019-01-01 01:00:00,2019,1,1,1,0
2,2019-01-01 02:00:00,2019,1,1,2,0
