## Using pandas to read datetime objects

read_csv() function
- Can parse date strings into datetime objects
- Need to specify `parse_dates=True` (together with `index_col`) to get a DatetimeIndex

In [15]:
import pandas as pd

In [16]:
# Load the February 2015 sales log, using the 'Date' column as the index
# and asking pandas to parse that index into datetimes.
sales = pd.read_csv(
    'sales-feb-2015.csv',
    index_col='Date',
    parse_dates=True,
)

In [17]:
# Display the whole frame; 'Date' is the index, not a regular column.
sales

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-02 08:30:00,Hooli,Software,3
2015-02-02 21:00:00,Mediacore,Hardware,9
2015-02-03 14:00:00,Initech,Software,13
2015-02-04 15:30:00,Streeplex,Software,13
2015-02-04 22:00:00,Acme Coporation,Hardware,14
2015-02-11 20:00:00,Initech,Software,7
2015-02-11 23:00:00,Hooli,Software,4


In [18]:
# Print the index type, per-column dtypes and non-null counts.
sales.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7 entries, 2015-02-02 08:30:00 to 2015-02-11 23:00:00
Data columns (total 3 columns):
Company    7 non-null object
Product    7 non-null object
Units      7 non-null int64
dtypes: int64(1), object(2)
memory usage: 224.0+ bytes


### Selecting single datetime

In [19]:
# Exact timestamp + column label -> a single scalar value.
sales.loc['2015-02-04 22:00:00', 'Company']

'Acme Coporation'

### Selecting whole day

In [20]:
# A date-only string selects every row whose timestamp falls on that day.
sales.loc['2015-02-04']

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-04 15:30:00,Streeplex,Software,13
2015-02-04 22:00:00,Acme Coporation,Hardware,14


## Partial datetime string selection
- Alternative formats: - sales.loc['February 5, 2015'] - sales.loc['2015-Feb-5']
- Whole month: sales.loc['2015-2']
- Whole year: sales.loc['2015']

### Selecting whole month

In [21]:
# A year-month string selects every row in that month.
sales.loc['2015-02']

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-02 08:30:00,Hooli,Software,3
2015-02-02 21:00:00,Mediacore,Hardware,9
2015-02-03 14:00:00,Initech,Software,13
2015-02-04 15:30:00,Streeplex,Software,13
2015-02-04 22:00:00,Acme Coporation,Hardware,14
2015-02-11 20:00:00,Initech,Software,7
2015-02-11 23:00:00,Hooli,Software,4


### Slicing using dates/times

In [22]:
# Slice between two partial date strings; rows on the end date are included.
sales.loc['2015-02-03':'2015-02-04']

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-03 14:00:00,Initech,Software,13
2015-02-04 15:30:00,Streeplex,Software,13
2015-02-04 22:00:00,Acme Coporation,Hardware,14


### Convert strings to datetime

In [23]:
# Hourly timestamps from 20:00 to 23:00 on 11 Feb 2015, built as strings
# and converted to a DatetimeIndex in one call.
evening_2_11 = pd.to_datetime(
    [f'2015-2-11 {hour}:00' for hour in range(20, 24)]
)

In [24]:
# Show the resulting DatetimeIndex.
evening_2_11

DatetimeIndex(['2015-02-11 20:00:00', '2015-02-11 21:00:00',
               '2015-02-11 22:00:00', '2015-02-11 23:00:00'],
              dtype='datetime64[ns]', freq=None)

### Reindexing DataFrame

In [25]:
# Align sales to the four hourly timestamps; timestamps with no matching
# row come back empty (NaN), which is why Units is shown as float.
sales.reindex(evening_2_11)

Unnamed: 0,Company,Product,Units
2015-02-11 20:00:00,Initech,Software,7.0
2015-02-11 21:00:00,,,
2015-02-11 22:00:00,,,
2015-02-11 23:00:00,Hooli,Software,4.0


### Filling missing values

In [27]:
# Forward fill: each missing timestamp takes the values of the previous row.
sales.reindex(evening_2_11, method='ffill')

Unnamed: 0,Company,Product,Units
2015-02-11 20:00:00,Initech,Software,7
2015-02-11 21:00:00,Initech,Software,7
2015-02-11 22:00:00,Initech,Software,7
2015-02-11 23:00:00,Hooli,Software,4


In [28]:
# Backward fill: each missing timestamp takes the values of the next row.
sales.reindex(evening_2_11, method='bfill')

Unnamed: 0,Company,Product,Units
2015-02-11 20:00:00,Initech,Software,7
2015-02-11 21:00:00,Hooli,Software,4
2015-02-11 22:00:00,Hooli,Software,4
2015-02-11 23:00:00,Hooli,Software,4


# Resampling time series data

In [29]:
import pandas as pd

In [30]:
# Read the sales CSV again with 'Date' parsed as the DatetimeIndex.
sales = pd.read_csv('sales-feb-2015.csv', index_col='Date', parse_dates=True)

In [31]:
# Preview the first five rows.
sales.head()

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-02 08:30:00,Hooli,Software,3
2015-02-02 21:00:00,Mediacore,Hardware,9
2015-02-03 14:00:00,Initech,Software,13
2015-02-04 15:30:00,Streeplex,Software,13
2015-02-04 22:00:00,Acme Coporation,Hardware,14


In [32]:
# Mean of the numeric columns only. Company and Product are text, and
# pandas >= 2.0 raises TypeError instead of silently dropping them, so the
# restriction is made explicit; the result is still just `Units`.
sales.mean(numeric_only=True)

Units    9.0
dtype: float64

In [33]:
# Column-wise sum; for the text columns this concatenates the strings,
# so only the Units total is meaningful.
sales.sum()

Company    HooliMediacoreInitechStreeplexAcme CoporationI...
Product    SoftwareHardwareSoftwareSoftwareHardwareSoftwa...
Units                                                     63
dtype: object

In [34]:
# Number of non-null entries in each column.
sales.count()

Company    7
Product    7
Units      7
dtype: int64

### Aggregating means

In [35]:
# Daily mean of units sold. Only the numeric `Units` column is resampled:
# averaging the text columns is meaningless and raises TypeError on
# pandas >= 2.0. Days without sales yield NaN.
daily_mean = sales[['Units']].resample('D').mean()

In [36]:
# One row per calendar day; days with no sales are NaN.
daily_mean

Unnamed: 0_level_0,Units
Date,Unnamed: 1_level_1
2015-02-02,6.0
2015-02-03,13.0
2015-02-04,13.5
2015-02-05,
2015-02-06,
2015-02-07,
2015-02-08,
2015-02-09,
2015-02-10,
2015-02-11,5.5


### Verifying

In [37]:
# Resampled mean for 2 Feb, to be checked against the raw rows below.
daily_mean.loc['2015-02-02']

Units    6.0
Name: 2015-02-02 00:00:00, dtype: float64

In [38]:
# Raw Units values recorded on 2 Feb.
sales.loc['2015-02-02', 'Units']

Date
2015-02-02 08:30:00    3
2015-02-02 21:00:00    9
Name: Units, dtype: int64

In [40]:
# Mean of those raw values; matches the resampled figure for the same day.
sales.loc['2015-02-02', 'Units'].mean()

6.0

### Method chaining

In [41]:
# Daily total of units sold. Restricting to `Units` keeps the result to the
# single numeric column on every pandas version; newer releases would
# otherwise also aggregate the Company/Product text columns.
sales[['Units']].resample('D').sum()

Unnamed: 0_level_0,Units
Date,Unnamed: 1_level_1
2015-02-02,12.0
2015-02-03,13.0
2015-02-04,27.0
2015-02-05,
2015-02-06,
2015-02-07,
2015-02-08,
2015-02-09,
2015-02-10,
2015-02-11,11.0


In [42]:
# Chain the aggregations: daily totals first, then the largest daily total.
# Only `Units` is used so the text columns never enter the sum or the max.
sales[['Units']].resample('D').sum().max()

Units    27.0
dtype: float64

### Resampling strings

In [43]:
# 'W' groups rows into weekly bins; count() gives the number of rows per bin.
sales.resample('W').count()

Unnamed: 0_level_0,Company,Product,Units
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-02-08,5,5,5
2015-02-15,2,2,2


### Multiplying frequencies

In [44]:
# An integer prefix multiplies the base frequency: '2W' = two-week bins.
sales.loc[:, 'Units'].resample('2W').sum()

Date
2015-02-08    52
2015-02-22    11
Freq: 2W-SUN, Name: Units, dtype: int64

### Upsampling

In [45]:
# Units for 4-5 Feb; the data only has rows on the 4th, so two values result.
two_days = sales.loc['2015-02-04': '2015-02-05', 'Units']

In [46]:
# Show the selected Series.
two_days

Date
2015-02-04 15:30:00    13
2015-02-04 22:00:00    14
Name: Units, dtype: int64

### Upsampling and filling

In [47]:
# Upsample to 4-hour bins and forward-fill. The first bin starts before any
# observation, so it has nothing to carry forward and stays NaN.
# Lowercase 'h' is used because the uppercase 'H' alias is deprecated in
# recent pandas, while 'h' is accepted by older releases as well.
two_days.resample('4h').ffill()

Date
2015-02-04 12:00:00     NaN
2015-02-04 16:00:00    13.0
2015-02-04 20:00:00    13.0
Freq: 4H, Name: Units, dtype: float64

# Manipulating time series data

In [48]:
import pandas as pd

In [51]:
# Reload WITHOUT index_col: 'Date' stays an ordinary datetime column here
# (default integer index), which the .dt examples below operate on.
# NOTE(review): this rebinds `sales` to a differently shaped frame than the
# earlier sections used.
sales = pd.read_csv('sales-feb-2015.csv',
                   parse_dates=['Date'])

### String methods

In [52]:
# Vectorized string method: upper-case every company name.
sales['Company'].str.upper()

0              HOOLI
1          MEDIACORE
2            INITECH
3          STREEPLEX
4    ACME COPORATION
5            INITECH
6              HOOLI
Name: Company, dtype: object

### Substring matching

In [53]:
# Boolean Series: True where the product name contains the substring 'ware'.
sales['Product'].str.contains('ware')

0    True
1    True
2    True
3    True
4    True
5    True
6    True
Name: Product, dtype: bool

### Boolean arithmetic

In [54]:
# In arithmetic, True counts as 1 and False as 0.
True + False

1

In [55]:
# 1 + 1
True + True

2

In [56]:
# 0 + 0
False + False

0

### Boolean reduction

In [59]:
# Summing a boolean Series counts its True values, i.e. the matching rows.
sales['Product'].str.contains('ware').sum()

7

### Datetime methods

In [60]:
# .dt exposes datetime components; here the hour of each timestamp.
sales['Date'].dt.hour

0     8
1    21
2    14
3    15
4    22
5    20
6    23
Name: Date, dtype: int64

### Set timezone

In [61]:
# Attach the US/Central time zone to the timestamps; the clock times
# themselves are left unchanged.
central = sales['Date'].dt.tz_localize('US/Central')

In [62]:
# Show the tz-aware Series (note the UTC offset on each value).
central

0   2015-02-02 08:30:00-06:00
1   2015-02-02 21:00:00-06:00
2   2015-02-03 14:00:00-06:00
3   2015-02-04 15:30:00-06:00
4   2015-02-04 22:00:00-06:00
5   2015-02-11 20:00:00-06:00
6   2015-02-11 23:00:00-06:00
Name: Date, dtype: datetime64[ns, US/Central]

### Convert timezone

In [64]:
# Express the same instants in US/Eastern; the clock times shift accordingly.
central.dt.tz_convert('US/Eastern')

0   2015-02-02 09:30:00-05:00
1   2015-02-02 22:00:00-05:00
2   2015-02-03 15:00:00-05:00
3   2015-02-04 16:30:00-05:00
4   2015-02-04 23:00:00-05:00
5   2015-02-11 21:00:00-05:00
6   2015-02-12 00:00:00-05:00
Name: Date, dtype: datetime64[ns, US/Eastern]

### Method chaining

In [66]:
# Same two steps chained: localize to Central, then convert to Eastern.
sales['Date'].dt.tz_localize('US/Central').dt.tz_convert('US/Eastern')

0   2015-02-02 09:30:00-05:00
1   2015-02-02 22:00:00-05:00
2   2015-02-03 15:00:00-05:00
3   2015-02-04 16:30:00-05:00
4   2015-02-04 23:00:00-05:00
5   2015-02-11 21:00:00-05:00
6   2015-02-12 00:00:00-05:00
Name: Date, dtype: datetime64[ns, US/Eastern]

## World population

In [68]:
# Load world population with the 'Year' column as a parsed datetime index.
# NOTE(review): the data appears sparser than yearly (see the NaNs after
# resampling below) - confirm against the CSV.
population = pd.read_csv('world_population.csv', 
                        parse_dates=True,
                        index_col='Year')

### Upsample population

In [70]:
# Upsample to one row per year (labels fall on 31 Dec); years without an
# observation become NaN.
# NOTE(review): recent pandas releases deprecate the 'A' alias in favour of
# 'YE' - confirm against the installed version before changing it.
population.resample('A').first().head()

Unnamed: 0_level_0,Total Population
Year,Unnamed: 1_level_1
1960-12-31,3034971000.0
1961-12-31,
1962-12-31,
1963-12-31,
1964-12-31,


### Interpolate missing data

In [72]:
# Fill the missing years by linear interpolation between known values.
# NOTE(review): 'A' is deprecated in recent pandas ('YE') - confirm version.
population.resample('A').first().interpolate('linear').head()

Unnamed: 0_level_0,Total Population
Year,Unnamed: 1_level_1
1960-12-31,3034971000.0
1961-12-31,3099956000.0
1962-12-31,3164941000.0
1963-12-31,3229926000.0
1964-12-31,3294911000.0


# Time series visualization

## Topics
- Line types
- Plot types
- Subplots

## S&P 500 Data

In [73]:
import pandas as pd

In [74]:
import matplotlib.pyplot as plt

In [None]:
# Load S&P 500 data with the 'Date' column as a parsed datetime index.
sp500 = pd.read_csv('sp500.csv', parse_dates=True,
                   index_col='Date')

In [None]:
# Preview the first five rows.
sp500.head()

In [None]:
# Line plot of the 'Close' column against the datetime index.
sp500['Close'].plot()

In [None]:
# Render the current figure.
plt.show()

In [None]:
# Plot, label and render in ONE cell. With the notebook's inline backend a
# figure is finalized when its cell finishes, so a y-label set from a later
# cell would land on a new, empty figure instead of this chart.
# Series.plot returns the Axes, which is labelled directly.
ax = sp500['Close'].plot(title='S&P 500')
ax.set_ylabel('Closing Price (US Dollars)')
plt.show()

### One week

In [None]:
# Zoom in on one week of closing prices. The y-label is set in the same cell
# as the plot (a label applied from a separate cell targets a different
# figure under the inline backend) and uses the same full text as the
# full-range chart instead of the truncated 'C'.
ax = sp500.loc['2012-04-01':'2012-04-07', 'Close'].plot(title='S&P 500')
ax.set_ylabel('Closing Price (US Dollars)')