In [1]:
import numpy as np
import pandas as pd

pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 7)
pd.set_option('precision', 7)

# useful for date/time manipulations
import datetime
from datetime import datetime

# And some items for matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 
pd.options.display.mpl_style = 'default'

mpl_style had been deprecated and will be removed in a future version.
Use `matplotlib.pyplot.style.use` instead.

  exec(code_obj, self.user_global_ns, self.user_ns)


# Time-series data and the DatetimeIndex

In [2]:
# create a a DatetimeIndex from an array of datetime's
dates = [datetime(2014, 8, 1), datetime(2014, 8, 2)]
dti = pd.DatetimeIndex(dates)
dti

DatetimeIndex(['2014-08-01', '2014-08-02'], dtype='datetime64[ns]', freq=None)

In [3]:
# a Series given a datetime list will automatically create
# a DatetimeIndex as its index
np.random.seed(123456)
ts = pd.Series(np.random.randn(2), dates)
type(ts.index)

pandas.tseries.index.DatetimeIndex

In [4]:
# retrieve a value using a datetime object
ts[datetime(2014, 8, 2)]

-0.28286334432866328

In [5]:
# this can also be performed with a string
ts['2014-8-2']

-0.28286334432866328

In [6]:
# create a Series with a DatetimeIndex using strings as dates
np.random.seed(123456)
dates = ['2014-08-01', '2014-08-02']
ts = pd.Series(np.random.randn(2), dates)
ts

2014-08-01    0.4691123
2014-08-02   -0.2828633
dtype: float64

In [7]:
# convert a list of items to a DatetimeIndex
dti = pd.to_datetime(['Aug 1, 2014', '2014-08-02', 
                      '2014.8.3', None])
dti

DatetimeIndex(['2014-08-01', '2014-08-02', '2014-08-03', 'NaT'], dtype='datetime64[ns]', freq=None)

In [8]:
# watch out as a failure to convert an item on the list
# to a date/time will result in the return value being a
# NumPy array instead of a DatetimeIndex
dti2 = pd.to_datetime(['Aug 1, 2014', 'foo'])
type(dti2)

ValueError: Unknown string format

In [9]:
# coerce pandas to convert all to datetime and a DatetimeIndex
# substituting NaT where values can not be converted
pd.to_datetime(['Aug 1, 2014', 'foo'], coerce=True)

  This is separate from the ipykernel package so we can avoid doing imports until


DatetimeIndex(['2014-08-01', 'NaT'], dtype='datetime64[ns]', freq=None)

In [10]:
# demonstrate two representations of the same date, one 
# month first, the other day first, converting to the 
# same date representation in pandas
dti1 = pd.to_datetime(['8/1/2014'])
dti2 = pd.to_datetime(['1/8/2014'], dayfirst=True)
dti1[0], dti2[0]

(Timestamp('2014-08-01 00:00:00'), Timestamp('2014-08-01 00:00:00'))

In [12]:
# create a Series with a DatetimeIndex starting at 8/1/2014
# and consisting of 10 consequtive days
np.random.seed(123456)
dates = pd.date_range('8/1/2014', periods=10)
s1 = pd.Series(np.random.randn(10), dates)
s1[:5]

2014-08-01    0.4691123
2014-08-02   -0.2828633
2014-08-03   -1.5090585
2014-08-04   -1.1356324
2014-08-05    1.2121120
Freq: D, dtype: float64

In [13]:
# for examples of data retrieval / slicing, we will use the 
# following data from Yahoo! Finance
import pandas.io.data as web
msft = web.DataReader("MSFT", 'yahoo', '2012-1-1', '2013-12-30')
msft.head(5)

The pandas.io.data module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version.
After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.


                 Open       High        Low      Close    Volume  Adj Close
Date                                                                       
2012-01-03  26.549999  26.959999  26.389999  26.770000  64731500  23.461752
2012-01-04  26.820000  27.469999  26.780001  27.400000  80516100  24.013895
2012-01-05  27.379999  27.730000  27.290001  27.680000  56081400  24.259293
2012-01-06  27.530001  28.190001  27.530001  28.110001  99455500  24.636154
2012-01-09  28.049999  28.100000  27.719999  27.740000  59706800  24.311878

In [14]:
# extract just the Adj Close values
msftAC = msft['Adj Close']
msftAC.head(3)

Date
2012-01-03    23.461752
2012-01-04    24.013895
2012-01-05    24.259293
Name: Adj Close, dtype: float64

In [15]:
# slicing using a DatetimeIndex nicely works with dates 
# passed as strings
msft['2012-01-01':'2012-01-05']

                 Open       High        Low  Close    Volume  Adj Close
Date                                                                   
2012-01-03  26.549999  26.959999  26.389999  26.77  64731500  23.461752
2012-01-04  26.820000  27.469999  26.780001  27.40  80516100  24.013895
2012-01-05  27.379999  27.730000  27.290001  27.68  56081400  24.259293

In [16]:
# returns a Series representing all the values of the 
# single row indexed by the column names
msft.loc['2012-01-03']

Open         2.6549999e+01
High         2.6959999e+01
Low          2.6389999e+01
Close        2.6770000e+01
Volume       6.4731500e+07
Adj Close    2.3461752e+01
Name: 2012-01-03 00:00:00, dtype: float64

In [17]:
# this is an error as this tries to retrieve a column
# named '2012-01-03'
# msft['2012-01-03'] # commented to prevent killing the notebook

In [18]:
# this is a Series, so the lookup works
msftAC['2012-01-03']

23.461752000000001

In [19]:
# we can lookup using partial date specifications
# such as only year and month
msft['2012-02'].head(5)

                 Open       High        Low      Close    Volume  Adj Close
Date                                                                       
2012-02-01  29.790001  30.049999  29.760000  29.889999  67409900  26.196180
2012-02-02  29.900000  30.170000  29.709999  29.950001  52223300  26.248766
2012-02-03  30.139999  30.400000  30.090000  30.240000  41838500  26.502927
2012-02-06  30.040001  30.219999  29.969999  30.200001  28039700  26.467871
2012-02-07  30.150000  30.490000  30.049999  30.350000  39242400  26.599334

In [20]:
# slice starting at the beginning of Feb 2012 and 
# end on Feb 9 2012
msft['2012-02':'2012-02-09'][:5]

                 Open       High        Low      Close    Volume  Adj Close
Date                                                                       
2012-02-01  29.790001  30.049999  29.760000  29.889999  67409900  26.196180
2012-02-02  29.900000  30.170000  29.709999  29.950001  52223300  26.248766
2012-02-03  30.139999  30.400000  30.090000  30.240000  41838500  26.502927
2012-02-06  30.040001  30.219999  29.969999  30.200001  28039700  26.467871
2012-02-07  30.150000  30.490000  30.049999  30.350000  39242400  26.599334

# Creating time-series with specific frequencies

In [21]:
# create a time-series with one minute frequency
bymin = pd.Series(np.arange(0, 90*60*24),
                  pd.date_range('2014-08-01', 
                                '2014-10-29 23:59:00',
                                freq='T'))
bymin

2014-08-01 00:00:00         0
2014-08-01 00:01:00         1
2014-08-01 00:02:00         2
                        ...  
2014-10-29 23:57:00    129597
2014-10-29 23:58:00    129598
2014-10-29 23:59:00    129599
Freq: T, dtype: int64

In [22]:
# slice at the minute level
bymin['2014-08-01 12:30':'2014-08-01 12:59']

2014-08-01 12:30:00    750
2014-08-01 12:31:00    751
2014-08-01 12:32:00    752
                      ... 
2014-08-01 12:57:00    777
2014-08-01 12:58:00    778
2014-08-01 12:59:00    779
Freq: T, dtype: int64

# Representing intervals of time using periods

In [23]:
# create a period representing a start of 
# 2014-08 and for a duration of one month
aug2014 = pd.Period('2014-08', freq='M')
aug2014

Period('2014-08', 'M')

In [24]:
# pandas determined the following start and end
# for the period
aug2014.start_time, aug2014.end_time

(Timestamp('2014-08-01 00:00:00'), Timestamp('2014-08-31 23:59:59.999999999'))

In [25]:
# what is the one month period following the given period?
sep2014 = aug2014 + 1
sep2014

Period('2014-09', 'M')

In [26]:
# the calculated start and end are
sep2014.start_time, sep2014.end_time

(Timestamp('2014-09-01 00:00:00'), Timestamp('2014-09-30 23:59:59.999999999'))

In [27]:
# create a pandas PeriodIndex
mp2013 = pd.period_range('1/1/2013', '12/31/2013', freq='M')
mp2013

PeriodIndex(['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06',
             '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'],
            dtype='int64', freq='M')

In [28]:
# dump all the calculated periods
for p in mp2013: 
    print "{0} {1} {2} {3}".format(p, 
                                   p.freq, 
                                   p.start_time, 
                                   p.end_time)

2013-01 <MonthEnd> 2013-01-01 00:00:00 2013-01-31 23:59:59.999999999
2013-02 <MonthEnd> 2013-02-01 00:00:00 2013-02-28 23:59:59.999999999
2013-03 <MonthEnd> 2013-03-01 00:00:00 2013-03-31 23:59:59.999999999
2013-04 <MonthEnd> 2013-04-01 00:00:00 2013-04-30 23:59:59.999999999
2013-05 <MonthEnd> 2013-05-01 00:00:00 2013-05-31 23:59:59.999999999
2013-06 <MonthEnd> 2013-06-01 00:00:00 2013-06-30 23:59:59.999999999
2013-07 <MonthEnd> 2013-07-01 00:00:00 2013-07-31 23:59:59.999999999
2013-08 <MonthEnd> 2013-08-01 00:00:00 2013-08-31 23:59:59.999999999
2013-09 <MonthEnd> 2013-09-01 00:00:00 2013-09-30 23:59:59.999999999
2013-10 <MonthEnd> 2013-10-01 00:00:00 2013-10-31 23:59:59.999999999
2013-11 <MonthEnd> 2013-11-01 00:00:00 2013-11-30 23:59:59.999999999
2013-12 <MonthEnd> 2013-12-01 00:00:00 2013-12-31 23:59:59.999999999


In [29]:
# and now create a Series using the PeriodIndex
np.random.seed(123456)
ps = pd.Series(np.random.randn(12), mp2013)
ps

2013-01    0.4691123
2013-02   -0.2828633
2013-03   -1.5090585
             ...    
2013-10   -2.1045692
2013-11   -0.4949293
2013-12    1.0718038
Freq: M, dtype: float64

# Shifting and lagging time-series data

In [30]:
# refresh our memory on the data in the MSFT closing prices Series
msftAC[:5]

Date
2012-01-03    23.461752
2012-01-04    24.013895
2012-01-05    24.259293
2012-01-06    24.636154
2012-01-09    24.311878
Name: Adj Close, dtype: float64

In [31]:
# shift the prices one index position forward
shifted_forward = msftAC.shift(1)
shifted_forward[:5]

Date
2012-01-03          NaN
2012-01-04    23.461752
2012-01-05    24.013895
2012-01-06    24.259293
2012-01-09    24.636154
Name: Adj Close, dtype: float64

In [32]:
# the last item is also shifted away 
msftAC.tail(5), shifted_forward.tail(5)

(Date
 2013-12-23    34.000757
 2013-12-24    34.427858
 2013-12-26    34.762106
 2013-12-27    34.622837
 2013-12-30    34.622837
 Name: Adj Close, dtype: float64, Date
 2013-12-23    34.167882
 2013-12-24    34.000757
 2013-12-26    34.427858
 2013-12-27    34.762106
 2013-12-30    34.622837
 Name: Adj Close, dtype: float64)

In [33]:
# shift backwards 2 index labels
shifted_backwards = msftAC.shift(-2)
shifted_backwards[:5]

Date
2012-01-03    24.259293
2012-01-04    24.636154
2012-01-05    24.311878
2012-01-06    24.399520
2012-01-09    24.294349
Name: Adj Close, dtype: float64

In [34]:
# this has resulted in 2 NaN values at 
# the end of the resulting Series
shifted_backwards.tail(5)

Date
2013-12-23    34.762106
2013-12-24    34.622837
2013-12-26    34.622837
2013-12-27          NaN
2013-12-30          NaN
Name: Adj Close, dtype: float64

In [35]:
# shift by a different frequency does not realign
# and ends up essentially changing the index labels by
# the specific amount of time
msftAC.shift(1, freq="S")

Date
2012-01-03 00:00:01    23.461752
2012-01-04 00:00:01    24.013895
2012-01-05 00:00:01    24.259293
                         ...    
2013-12-26 00:00:01    34.762106
2013-12-27 00:00:01    34.622837
2013-12-30 00:00:01    34.622837
Name: Adj Close, dtype: float64

In [36]:
# resulting Series has one day added to all index labels
msftAC.tshift(1, freq="D")

Date
2012-01-04    23.461752
2012-01-05    24.013895
2012-01-06    24.259293
                ...    
2013-12-27    34.762106
2013-12-28    34.622837
2013-12-31    34.622837
Name: Adj Close, dtype: float64

In [37]:
# calculate the percentage change in closing price
msftAC / msftAC.shift(1) - 1

Date
2012-01-03          NaN
2012-01-04    0.0235337
2012-01-05    0.0102190
                ...    
2013-12-26    0.0097086
2013-12-27   -0.0040063
2013-12-30    0.0000000
Name: Adj Close, dtype: float64

# Frequency conversion of time-series data

In [38]:
# take a two item sample of the msftAC data for demonstrations
sample = msftAC[:2]
sample

Date
2012-01-03    23.461752
2012-01-04    24.013895
Name: Adj Close, dtype: float64

In [39]:
# demonstrate resampling to hour intervals
# realignment causes many NaN's
sample.asfreq("H")

Date
2012-01-03 00:00:00    23.461752
2012-01-03 01:00:00          NaN
2012-01-03 02:00:00          NaN
                         ...    
2012-01-03 22:00:00          NaN
2012-01-03 23:00:00          NaN
2012-01-04 00:00:00    24.013895
Freq: H, Name: Adj Close, dtype: float64

In [40]:
# fill NaN's with the last know non-NaN valuen
sample.asfreq("H", method="ffill")

Date
2012-01-03 00:00:00    23.461752
2012-01-03 01:00:00    23.461752
2012-01-03 02:00:00    23.461752
                         ...    
2012-01-03 22:00:00    23.461752
2012-01-03 23:00:00    23.461752
2012-01-04 00:00:00    24.013895
Freq: H, Name: Adj Close, dtype: float64

In [41]:
# fill with the "next known" value
sample.asfreq("H", method="bfill")

Date
2012-01-03 00:00:00    23.461752
2012-01-03 01:00:00    24.013895
2012-01-03 02:00:00    24.013895
                         ...    
2012-01-03 22:00:00    24.013895
2012-01-03 23:00:00    24.013895
2012-01-04 00:00:00    24.013895
Freq: H, Name: Adj Close, dtype: float64

## Up and down resampling of time-series

In [42]:
# calculate the cumulative daily returns for MSFT
msft_cum_ret = (1 + (msftAC / msftAC.shift() - 1)).cumprod()
msft_cum_ret

Date
2012-01-03          NaN
2012-01-04    1.0235337
2012-01-05    1.0339932
                ...    
2013-12-26    1.4816500
2013-12-27    1.4757140
2013-12-30    1.4757140
Name: Adj Close, dtype: float64

In [43]:
# resample to a monthly cumulative return
msft_monthly_cum_ret = msft_cum_ret.resample("M")
msft_monthly_cum_ret

use .resample(...).mean() instead of .resample(...)
  return getattr(obj, attr, default)


DatetimeIndexResampler [freq=<MonthEnd>, axis=0, closed=right, label=right, convention=start, base=0]

In [44]:
# verify the monthly average for 2012-01
msft_cum_ret['2012-01'].mean()

1.0686746375884233

In [45]:
# verify that the default resample techique is mean
msft_cum_ret.resample("M", how="mean")

the new syntax is .resample(...).mean()
  from ipykernel import kernelapp as app


Date
2012-01-31    1.0686746
2012-02-29    1.1556975
2012-03-31    1.2105696
                ...    
2013-10-31    1.3503983
2013-11-30    1.4719147
2013-12-31    1.4823624
Freq: M, Name: Adj Close, dtype: float64

In [46]:
# resample to monthly and give us open, high, low, close
msft_cum_ret.resample("M", how="ohlc")[:5]

the new syntax is .resample(...).ohlc()
  from ipykernel import kernelapp as app


                 open       high        low      close
Date                                                  
2012-01-31  1.0235337  1.1105715  1.0235337  1.1031005
2012-02-29  1.1165483  1.1983493  1.1165483  1.1934610
2012-03-31  1.2141417  1.2351983  1.1866928  1.2130136
2012-04-30  1.2141417  1.2190298  1.1411955  1.2039894
2012-05-31  1.2036133  1.2036133  1.0998598  1.1047800

In [47]:
# this will return an index with periods instead of timestamps
by_periods = msft_cum_ret.resample("M", how="mean", kind="period")
for i in by_periods.index[:5]: 
    print ("{0}:{1} {2}".format(i.start_time, 
                                i.end_time, 
                                by_periods[i]))

2012-01-01 00:00:00:2012-01-31 23:59:59.999999999 1.06867463759
2012-02-01 00:00:00:2012-02-29 23:59:59.999999999 1.1556974688
2012-03-01 00:00:00:2012-03-31 23:59:59.999999999 1.21056958615
2012-04-01 00:00:00:2012-04-30 23:59:59.999999999 1.18464359141
2012-05-01 00:00:00:2012-05-31 23:59:59.999999999 1.14051593225


the new syntax is .resample(...).mean()
  from ipykernel import kernelapp as app


In [48]:
# upsampling will be demonstrated using the second
# and third values (first is NaN)
sample = msft_cum_ret[1:3]
sample

Date
2012-01-04    1.0235337
2012-01-05    1.0339932
Name: Adj Close, dtype: float64

In [49]:
# upsampling this will have a lot of NaN's
by_hour = sample.resample("H")
by_hour

DatetimeIndexResampler [freq=<Hour>, axis=0, closed=left, label=left, convention=start, base=0]

In [50]:
by_hour.interpolate()

Date
2012-01-04 00:00:00    1.0235337
2012-01-04 01:00:00    1.0239696
2012-01-04 02:00:00    1.0244054
                         ...    
2012-01-04 22:00:00    1.0331216
2012-01-04 23:00:00    1.0335574
2012-01-05 00:00:00    1.0339932
Freq: H, Name: Adj Close, dtype: float64

    Always check for edge cases and corner cases when you are coding and try to provide a bug free solution.
    Be prepared to improve your solution when asked.
    Make sure to practice the core CS concepts like arrays, linked list, stacks, queues, hash maps, binary trees and graphs, searching and sorting, recursion and parity. 

In [11]:
#linked lists
# http://www.openbookproject.net/thinkcs/python/english2e/ch18.html

#As usual when writing a new class, we’ll start with the initialization and __str__ methods so that we can test the basic mechanism of creating and displaying the new type:

class Node:
    def __init__(self, cargo=None, next=None):
        self.cargo = cargo
        self.next  = next

    def __str__(self):
        return str(self.cargo)


In [14]:
node = Node("test")
print(node)

test


In [15]:
node1 = Node(1)
node2 = Node(2)
node3 = Node(3)

In [16]:
node1.next = node2

In [17]:
node2.next = node3

In [20]:
def print_list(node):
    while node:
        print(node,
        node = node.next)
    print
    

In [21]:
print_list(node1)

TypeError: 'node' is an invalid keyword argument for this function

In [23]:
# lists and recursion
def print_backward(list):
    if list == None: return
    head = list
    tail = list.next
    print_backward(tail)
    print(head,)

In [24]:
print_backward(node1)

3
2
1


In [25]:
#### Stacks 
# http://www.openbookproject.net/thinkcs/python/english2e/ch19.html

#an implementation of the Stack ADT that uses a Python list:

class Stack :
    def __init__(self):
        self.items = []

    def push(self, item):
        self.items.append(item)

    def pop(self):
        return self.items.pop()

    def is_empty(self):
        return (self.items == [])


In [26]:
#Pushing and popping
#A stack is a generic data structure, which means that we can add any type of item to it. The following example pushes two integers and a string onto the stack:

s = Stack()

In [27]:
s.push(54)

In [32]:
s.push(45)

In [29]:
s.push("+")

In [31]:
while not s.is_empty():
    print(s.pop(),)

+
45
54
