# Data

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Switch matplotlib to its built-in 'ggplot' style sheet.
plt.style.use('ggplot')

## Working with Series

In [5]:
# Integers 5 through 9; with no index given, pandas labels them 0..4.
x = pd.Series(range(5, 10))

In [6]:
# Echo the Series: values on the right, the default integer index on the left.
x

0    5
1    6
2    7
3    8
4    9
dtype: int64

### We can treat Series objects much like numpy vectors

In [7]:
# Each reduction collapses the Series to a single number. std() is the sample
# standard deviation (ddof=1): sqrt(2.5) ~= 1.58, not the population sqrt(2).
x.sum(), x.mean(), x.std()

(35, 7.0, 1.5811388300841898)

In [8]:
# Arithmetic is applied element-wise and the index is carried over unchanged.
x**2

0    25
1    36
2    49
3    64
4    81
dtype: int64

In [9]:
# `x >= 8` builds a boolean mask; indexing with it keeps only the True rows,
# along with their original index labels (3 and 4).
x[x >= 8]

3    8
4    9
dtype: int64

### Series can also contain more information than numpy vectors

In [11]:
# You can always use standard positional slicing; as in numpy, the stop
# position (4) is excluded.
x[1:4]

1    6
2    7
3    8
dtype: int64

## Series index
### But you can also assign labeled indexes.

In [12]:
# Assigning to .index relabels the Series in place: one label per element.
x.index = list('abcde')
x

a    5
b    6
c    7
d    8
e    9
dtype: int64

### Note that when slicing by label, the end label is included

In [13]:
# Label-based slice: unlike positional slicing, the stop label 'c' is included.
x['a':'c']

a    5
b    6
c    7
dtype: int64

### Even when you have a labeled index, positional slicing still works

In [14]:
# Use .iloc for position-based slicing so the intent is explicit and does not
# depend on how [] interprets integers for this index type. The stop position
# is excluded, so this selects the rows labelled b, c and d.
x.iloc[1:4]

b    6
c    7
d    8
dtype: int64

In [15]:
# Attribute-style access is shorthand for x['a'], x['c'], x['e']. It only works
# for labels that are valid Python identifiers and do not clash with an
# existing Series attribute or method.
x.a, x.c, x.e

(5, 7, 9)

## Working with missing data

### Missing data is indicated with NaN (not a number).

In [16]:
# np.nan marks the two missing entries; because NaN is a float, the whole
# Series is stored as float64.
y = pd.Series([10, np.nan, np.nan, 13, 14])
y

0    10.0
1     NaN
2     NaN
3    13.0
4    14.0
dtype: float64

### Concatenating two series

In [17]:
# Stack x and y end to end. Each keeps its own labels (a..e, then 0..4), and
# the result is float64 because y is float64.
z = pd.concat([x, y])
z

a     5.0
b     6.0
c     7.0
d     8.0
e     9.0
0    10.0
1     NaN
2     NaN
3    13.0
4    14.0
dtype: float64

### Reset index to default

In [18]:
# drop=True throws the mixed labels away instead of keeping them as data,
# leaving a fresh 0..9 integer index.
z = z.reset_index(drop=True)
z

0     5.0
1     6.0
2     7.0
3     8.0
4     9.0
5    10.0
6     NaN
7     NaN
8    13.0
9    14.0
dtype: float64

### `pandas` aggregate functions ignore missing data

In [19]:
# The two NaN entries are skipped, so these are statistics of the 8 valid
# values (mean = 72 / 8 = 9), not of all 10 rows.
z.sum(), z.mean(), z.std()

(72.0, 9.0, 3.2071349029490928)

### Selecting missing values

In [20]:
# isnull() gives a boolean mask that is True where the value is NaN.
z[z.isnull()]

6   NaN
7   NaN
dtype: float64

### Selecting non-missing values

In [21]:
# notnull() is the inverse mask; the surviving rows keep their original labels,
# so the index now has a gap at 6 and 7.
z[z.notnull()]

0     5.0
1     6.0
2     7.0
3     8.0
4     9.0
5    10.0
8    13.0
9    14.0
dtype: float64

### Replacement of missing values

In [22]:
# Replace every NaN with a constant. fillna returns a new Series; z itself
# still contains the NaNs.
z.fillna(0)

0     5.0
1     6.0
2     7.0
3     8.0
4     9.0
5    10.0
6     0.0
7     0.0
8    13.0
9    14.0
dtype: float64

In [23]:
# Forward fill: each NaN takes the last valid value that precedes it.
# Series.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated in recent pandas releases.
z.ffill()

0     5.0
1     6.0
2     7.0
3     8.0
4     9.0
5    10.0
6    10.0
7    10.0
8    13.0
9    14.0
dtype: float64

In [24]:
# Backward fill: each NaN takes the next valid value that follows it.
# Series.bfill() replaces fillna(method='bfill'), whose `method` keyword is
# deprecated in recent pandas releases.
z.bfill()

0     5.0
1     6.0
2     7.0
3     8.0
4     9.0
5    10.0
6    13.0
7    13.0
8    13.0
9    14.0
dtype: float64

In [25]:
# Fill with the mean of the non-missing values (9.0); z is left unchanged.
z.fillna(z.mean())

0     5.0
1     6.0
2     7.0
3     8.0
4     9.0
5    10.0
6     9.0
7     9.0
8    13.0
9    14.0
dtype: float64

### Working with dates / times

In [26]:
# Replace the integer index with one timestamp per row, starting 1 Jan 2016.
# No freq is given, so date_range uses its default daily step.
z.index = pd.date_range('01-Jan-2016', periods=len(z))

In [27]:
# Echo z to show the new DatetimeIndex and its daily frequency (Freq: D).
z

2016-01-01     5.0
2016-01-02     6.0
2016-01-03     7.0
2016-01-04     8.0
2016-01-05     9.0
2016-01-06    10.0
2016-01-07     NaN
2016-01-08     NaN
2016-01-09    13.0
2016-01-10    14.0
Freq: D, dtype: float64

### Intelligent aggregation over datetime ranges

In [30]:
# Group the daily values into weekly bins and sum each bin. 'W' bins end on
# Sunday, so the first bin holds only 1-3 Jan; NaNs are skipped by sum().
z.resample('W').sum()

2016-01-03    18.0
2016-01-10    54.0
Freq: W-SUN, dtype: float64

### Formatting datetime objects (see http://strftime.org)

In [31]:
# Render each timestamp as text, e.g. 'Jan 01, 2016'
# (%b = abbreviated month name, %d = zero-padded day, %Y = four-digit year).
z.index.strftime('%b %d, %Y')

Index(['Jan 01, 2016', 'Jan 02, 2016', 'Jan 03, 2016', 'Jan 04, 2016',
       'Jan 05, 2016', 'Jan 06, 2016', 'Jan 07, 2016', 'Jan 08, 2016',
       'Jan 09, 2016', 'Jan 10, 2016'],
      dtype='object')