In [1]:
pip install -U fortran-magic

Requirement already up-to-date: fortran-magic in c:\users\hazael_pc\anaconda3\lib\site-packages (0.7)
Note: you may need to restart the kernel to use updated packages.


In [2]:
%reload_ext fortranmagic

In [3]:
%matplotlib inline
%load_ext fortranmagic

import sys; sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rc('figure', figsize=(12, 7))

ran_the_first_cell = True

jan2017 = pd.to_datetime(['2017-01-03 00:00:00+00:00',
 '2017-01-04 00:00:00+00:00',
 '2017-01-05 00:00:00+00:00',
 '2017-01-06 00:00:00+00:00',
 '2017-01-09 00:00:00+00:00',
 '2017-01-10 00:00:00+00:00',
 '2017-01-11 00:00:00+00:00',
 '2017-01-12 00:00:00+00:00',
 '2017-01-13 00:00:00+00:00',
 '2017-01-17 00:00:00+00:00',
 '2017-01-18 00:00:00+00:00',
 '2017-01-19 00:00:00+00:00',
 '2017-01-20 00:00:00+00:00',
 '2017-01-23 00:00:00+00:00',
 '2017-01-24 00:00:00+00:00',
 '2017-01-25 00:00:00+00:00',
 '2017-01-26 00:00:00+00:00',
 '2017-01-27 00:00:00+00:00',
 '2017-01-30 00:00:00+00:00',
 '2017-01-31 00:00:00+00:00',
 '2017-02-01 00:00:00+00:00'])
calendar = jan2017.values.astype('datetime64[D]')

event_dates = pd.to_datetime(['2017-01-06 00:00:00+00:00', 
                             '2017-01-07 00:00:00+00:00', 
                             '2017-01-08 00:00:00+00:00']).values.astype('datetime64[D]')
event_values = np.array([10, 15, 20])

The fortranmagic extension is already loaded. To reload it, use:
  %reload_ext fortranmagic


<center>
  <h1>The PyData Toolbox</h1>
  <h3>Scott Sanderson (Twitter: @scottbsanderson, GitHub: ssanderson)</h3>
  <h3><a href="https://github.com/ssanderson/pydata-toolbox">https://github.com/ssanderson/pydata-toolbox</a></h3>
</center>

# About Me:

<img src="images/me.jpg" alt="Drawing" style="width: 300px;"/>

- Senior Engineer at [Quantopian](https://www.quantopian.com)
- Background in Mathematics and Philosophy
- **Twitter:** [@scottbsanderson](https://twitter.com/scottbsanderson)
- **GitHub:** [ssanderson](https://github.com/ssanderson)

## Outline

- Built-in Data Structures
- Numpy `array`
- Pandas `Series`/`DataFrame`
- Plotting and "Real-World" Analyses

# Data Structures

> Rule 5. Data dominates. If you've chosen the right data structures and organized things well, the algorithms
will almost always be self-evident. Data structures, not algorithms, are central to programming.

- *Notes on Programming in C*, by Rob Pike.

# Lists

In [4]:
assert ran_the_first_cell, "Oh noes!"

In [5]:
l = [1, 'two', 3.0, 4, 5.0, "six"]
l

[1, 'two', 3.0, 4, 5.0, 'six']

In [6]:
lis = ["hello",2,4,3.4,9]
lis

['hello', 2, 4, 3.4, 9]

In [7]:
# Lists can be indexed like C-style arrays.
first = l[0]
second = l[1]
print("first:", first)
print("second:", second)

first: 1
second: two


In [8]:
first = lis[0]
second = lis[1]
print("first:", first)
print("second:", second)

first: hello
second: 2


In [9]:
# Negative indexing gives elements relative to the end of the list.
last = l[-1]
penultimate = l[-2]
print("last:", last)
print("second to last:", penultimate)

last: six
second to last: 5.0


In [10]:

last = lis[-1]
penultimate = lis[-2]
print("last:", last)
print("second to last:", penultimate)

last: 9
second to last: 3.4


In [11]:
# Lists can also be sliced, which makes a copy of elements between 
# start (inclusive) and stop (exclusive)
sublist = l[1:3]
sublist

['two', 3.0]

In [12]:

sublist2 = lis[2:4]
sublist2

[4, 3.4]

In [13]:
# l[:N] is equivalent to l[0:N].
first_three = l[:3]
first_three

[1, 'two', 3.0]

In [14]:

first_three2 = lis[:3]
first_three2

['hello', 2, 4]

In [15]:
# l[3:] is equivalent to l[3:len(l)].
after_three = l[3:]
after_three

[4, 5.0, 'six']

In [16]:

after_three2 = lis[3:]
after_three2

[3.4, 9]

In [17]:
# There's also a third parameter, "step", which gets every Nth element.
l = ['a', 'b', 'c', 'd', 'e', 'f', 'g','h']
l[1:7:2]

['b', 'd', 'f']

In [18]:
lis = [1,2,3,4,5,6,7,8,9,10]
lis[1:9:3]

[2, 5, 8]

In [19]:
# This is a cute way to reverse a list.
l[::-1]

['h', 'g', 'f', 'e', 'd', 'c', 'b', 'a']

In [20]:

lis[::-1]

[10, 9, 8, 7, 6, 5, 4, 3, 2, 1]

In [21]:
# Lists can be grown efficiently (in O(1) amortized time).
l = [1, 2, 3, 4, 5]
print("Before:", l)
l.append('six')
print("After:", l)

Before: [1, 2, 3, 4, 5]
After: [1, 2, 3, 4, 5, 'six']


In [22]:

lis= ["un","dos","tres"]
print("Before:", lis)
lis.append('cuatro')
print("After:", lis)

Before: ['un', 'dos', 'tres']
After: ['un', 'dos', 'tres', 'cuatro']


In [23]:
# Comprehensions let us perform elementwise computations.
l = [1, 2, 3, 4, 5]
[x * 2 for x in l]

[2, 4, 6, 8, 10]

In [24]:
lis = [10,2,3,4,1,20,30]
[x /2 for x in lis]

[5.0, 1.0, 1.5, 2.0, 0.5, 10.0, 15.0]

## Review: Python Lists

- Zero-indexed sequence of arbitrary Python values.
- Slicing syntax: `l[start:stop:step]` copies elements at regular intervals from `start` to `stop`.
- Efficient (`O(1)`) appends and removes from end.
- Comprehension syntax: `[f(x) for x in l if cond(x)]`.

# Dictionaries

In [25]:
# Dictionaries are key-value mappings.
philosophers = {'David': 'Hume', 'Immanuel': 'Kant', 'Bertrand': 'Russell'}
philosophers

{'David': 'Hume', 'Immanuel': 'Kant', 'Bertrand': 'Russell'}

In [26]:
dicty = {"voc":"a","let":"b","num":1}
dicty

{'voc': 'a', 'let': 'b', 'num': 1}

In [27]:
# Like lists, dictionaries are size-mutable.
philosophers['Ludwig'] = 'Wittgenstein'
philosophers

{'David': 'Hume',
 'Immanuel': 'Kant',
 'Bertrand': 'Russell',
 'Ludwig': 'Wittgenstein'}

In [28]:
dicty['sign'] = '-'
dicty

{'voc': 'a', 'let': 'b', 'num': 1, 'sign': '-'}

In [29]:
del philosophers['David']
philosophers

{'Immanuel': 'Kant', 'Bertrand': 'Russell', 'Ludwig': 'Wittgenstein'}

In [30]:
del dicty['num']
dicty

{'voc': 'a', 'let': 'b', 'sign': '-'}

In [31]:
# No slicing.
print("philosophers['Bertrand':'Immanuel'] > unhashable type: 'slice'")

philosophers['Bertrand':'Immanuel'] > unhashable type: 'slice'


In [32]:
print("dicty['voc':'sign'] > unhashable type: 'slice'")

dicty['voc':'sign'] > unhashable type: 'slice'


## Review: Python Dictionaries

- Unordered key-value mapping from (almost) arbitrary keys to arbitrary values.
- Efficient (`O(1)`) lookup, insertion, and deletion.
- No slicing (would require a notion of order).

<center><img src="images/pacino.gif" alt="Drawing" style="width: 100%;"/></center>


In [33]:
# Suppose we have some matrices...
a = [[1, 2, 3],
     [2, 3, 4],
     [5, 6, 7],
     [1, 1, 1]]

b = [[1, 2, 3, 4],
     [2, 3, 4, 5]]

In [34]:
a2 = [[1, 2, 2],
     [2, 3, 2],
     [5, 2, 3]]

b2 = [[1, 2, 3],
     [1, 2, 4]]

In [35]:
def matmul(A, B):
    """Multiply matrix A by matrix B, both given as lists of row lists.

    The result has ``len(A)`` rows and ``len(B[0])`` columns. The summation
    (inner) dimension is taken from ``len(B)``; no check is made that the rows
    of ``A`` have exactly that many entries, so any extra columns of ``A`` are
    silently ignored.
    """
    n_rows, n_cols, n_inner = len(A), len(B[0]), len(B)

    product = []
    for i in range(n_rows):
        product_row = []
        for j in range(n_cols):
            # Accumulate term by term, starting from the integer 0.
            acc = 0
            for k in range(n_inner):
                acc += A[i][k] * B[k][j]
            product_row.append(acc)
        product.append(product_row)
    return product

<center><img src="images/gross.gif" alt="Drawing" style="width: 50%;"/></center>


In [36]:
%%time

matmul(a, b)

Wall time: 0 ns


[[5, 8, 11, 14], [8, 13, 18, 23], [17, 28, 39, 50], [3, 5, 7, 9]]

In [37]:
%%time

matmul(a2, b2)

Wall time: 0 ns


[[3, 6, 11], [5, 10, 18], [7, 14, 23]]

In [38]:
import random
def random_matrix(m, n):
    """Return an m-by-n matrix (a list of m rows, each a list of n floats).

    Every entry is drawn independently with ``random.random()``; rows are
    filled one after another, left to right.
    """
    return [[random.random() for _ in range(n)] for _ in range(m)]

randm = random_matrix(2, 3)
randm

[[0.4062880272336713, 0.6843761615190506, 0.10191537471135692],
 [0.5770288388662136, 0.599190983132029, 0.559414963285566]]

In [39]:
randm2 = random_matrix(3, 3)
randm2

[[0.9126752243791495, 0.787349187684806, 0.7567479229009],
 [0.03064680606691117, 0.8888822373710992, 0.6528569718993195],
 [0.847556358022181, 0.9501800677488107, 0.5498638870663264]]

In [40]:
%%time
randa = random_matrix(600, 100)
randb = random_matrix(100, 600)
x = matmul(randa, randb)

Wall time: 10.5 s


In [41]:
%%time
randa2 = random_matrix(400,500)
randb2 = random_matrix(500,400)
x2 = matmul(randa2, randb2)

Wall time: 24.4 s


In [42]:
# Maybe that's not that bad?  Let's try a simpler case.
def python_dot_product(xs, ys):
    """Return the dot product of two sequences using only built-in Python.

    Elements are paired with ``zip``, so iteration stops at the end of the
    shorter input.
    """
    pairwise_products = (a * b for a, b in zip(xs, ys))
    return sum(pairwise_products)

In [43]:
%%fortran
subroutine fortran_dot_product(xs, ys, result)
    ! Compute the dot product of two double-precision vectors.
    !   xs, ys : input vectors (assumed-shape, read-only)
    !   result : output scalar receiving the sum of the elementwise products
    ! NOTE(review): assumes xs and ys have the same length; this is not checked here.
    double precision, intent(in) :: xs(:)
    double precision, intent(in) :: ys(:)
    double precision, intent(out) :: result
    
    ! Whole-array multiply followed by a reduction; no explicit loop is written.
    result = sum(xs * ys)
end

RuntimeError: f2py failed, see output

In [None]:
list_data = [float(i) for i in range(100000)]
array_data = np.array(list_data)

In [None]:
list_data2 = [float(i) for i in range(500000)]
array_data2 = np.array(list_data2)

In [None]:
%%time
python_dot_product(list_data, list_data)

In [None]:
%%time
python_dot_product(list_data2, list_data2)

In [None]:
%%time
fortran_dot_product(array_data, array_data)

In [None]:
%%time
fortran_dot_product(array_data2, array_data2)

<center><img src="images/sloth.gif" alt="Drawing" style="width: 1080px;"/></center>


## Why is the Python Version so Much Slower?

In [None]:
# Dynamic typing.
def mul_elemwise(xs, ys):
    """Multiply two sequences pairwise and return the products as a list.

    Each pair is combined with ``*``, so the result type of every element
    depends on the operand types of that pair. Pairing uses ``zip`` and
    therefore stops at the shorter input.
    """
    products = []
    for left, right in zip(xs, ys):
        products.append(left * right)
    return products

mul_elemwise([1, 2, 3, 4], [1, 2 + 0j, 3.0, 'four'])
#[type(x) for x in _]

In [None]:
# Inspect the type of each product. The multiplication is repeated explicitly
# instead of reading IPython's `_` (last output), which silently refers to a
# different value if cells are re-run or executed out of order.
[type(x) for x in mul_elemwise([1, 2, 3, 4], [1, 2 + 0j, 3.0, 'four'])]

In [None]:
mul_elemwise([3, 2, 3, 4], ["tres",73,2+10.1,3.4])


In [None]:
# Interpretation overhead.
source_code = 'a + b * c'
bytecode = compile(source_code, '', 'eval')
import dis; dis.dis(bytecode)

In [None]:
source_code2 = 'a**b'
bytecode2 = compile(source_code2, '', 'eval')
dis.dis(bytecode2)

## Why is the Python Version so Slow?
- Dynamic typing means that every single operation requires dispatching on the input type.
- Having an interpreter means that every instruction is fetched and dispatched at runtime.
- Other overheads:
  - Arbitrary-size integers.
  - Reference-counted garbage collection.

> This is the paradox that we have to work with when we're doing scientific or numerically-intensive Python. What makes Python fast for development -- this high-level, interpreted, and dynamically-typed aspect of the language -- is exactly what makes it slow for code execution.

- Jake VanderPlas, [*Losing Your Loops: Fast Numerical Computing with NumPy*](https://www.youtube.com/watch?v=EEUXKG97YRw)

# What Do We Do?

<center><img src="images/runaway.gif" alt="Drawing" style="width: 50%;"/></center>

<center><img src="images/thisisfine.gif" alt="Drawing" style="width: 1080px;"/></center>

- Python is slow for numerical computation because it performs dynamic dispatch on every operation we perform...

- ...but often, we just want to do the same thing over and over in a loop!

- If we don't need Python's dynamism, we don't want to pay (much) for it.

- **Idea:** Dispatch **once per operation** instead of **once per element**.

In [None]:
import numpy as np

data = np.array([1, 2, 3, 4])
data

In [None]:
data2 = np.array([3,4,5,6,7,8])
data2

In [None]:
data + data

In [None]:
data2 + data2

In [None]:
%%time
# Naive dot product
(array_data * array_data).sum()

In [None]:
%%time

(array_data2 * array_data2).sum()

In [None]:
%%time
# Built-in dot product.
array_data.dot(array_data)

In [None]:
%%time

array_data2.dot(array_data2)

In [None]:
%%time
fortran_dot_product(array_data, array_data)

In [None]:
%%time
fortran_dot_product(array_data2, array_data2)

In [None]:
# Numpy won't allow us to write a string into an int array.
print("data[0] ='foo' > ValueError: invalid literal for int() with base 10: 'foo'" )

In [None]:
print('data[3] = "notString" > ValueError: invalid literal for int() with base 10: "notString"')

In [None]:
# We also can't grow an array once it's created.
print(" data.append(3) > AttributeError: 'numpy.ndarray' object has no attribute 'append'")

In [None]:
print(" data2.append(34) > AttributeError: 'numpy.ndarray' object has no attribute 'append'")

In [None]:
# We **can** reshape an array though.
two_by_two = data.reshape(2, 2)
two_by_two

In [None]:
d2resh = data2.reshape(3, 2)
d2resh

Numpy arrays are:

- Fixed-type

- Size-immutable

- Multi-dimensional

- Fast\*

\* If you use them correctly.

# What's in an Array?

In [None]:
arr = np.array([1, 2, 3, 4, 5, 6], dtype='int16').reshape(2, 3)
print("Array:\n", arr, sep='')
print("===========")
print("DType:", arr.dtype)
print("Shape:", arr.shape)
print("Strides:", arr.strides)
print("Data:", arr.data.tobytes())

In [None]:
arr2 = np.array([10,20,30,40,50,55,1,2,3], dtype='int16').reshape(3, 3)
print("Array:\n", arr2, sep='')
print("===========")
print("DType:", arr2.dtype)
print("Shape:", arr2.shape)
print("Strides:", arr2.strides)
print("Data:", arr2.data.tobytes())

# Core Operations

- Vectorized **ufuncs** for elementwise operations.
- Fancy indexing and masking for selection and filtering.
- Aggregations across axes.
- Broadcasting

# UFuncs

UFuncs (universal functions) are functions that operate elementwise on one or more arrays.

In [None]:
data = np.arange(15).reshape(3, 5)
data

In [None]:
data2 = np.arange(9).reshape(3, 3)
data2

In [None]:
# Binary operators.
data * data

In [None]:
data2 * data2

In [None]:
# Unary functions.
np.sqrt(data)

In [None]:
# Unary functions.
np.sqrt(data2)

In [None]:
# Comparison operations
(data % 3) == 0

In [None]:
# Comparison operations
data2 !=3

In [None]:
# Boolean combinators.
((data % 2) == 0) & ((data % 3) == 0)

In [None]:
((data2 % 2) == 0) | ((data2 % 3) == 0)

In [None]:
# as of python 3.5, @ is matrix-multiply
data @ data.T

In [None]:
data2 @ data2.T

# UFuncs Review

- UFuncs provide efficient elementwise operations applied across one or more arrays.
- Arithmetic Operators (`+`, `*`, `/`)
- Comparisons (`==`, `>`, `!=`)
- Boolean Operators (`&`, `|`, `^`)
- Trigonometric Functions (`sin`, `cos`)
- Transcendental Functions (`exp`, `log`)

# Selections

We often want to perform an operation on just a subset of our data.

In [None]:
sines = np.sin(np.linspace(0, 3.14, 10))
cosines = np.cos(np.linspace(0, 3.14, 10))
sines

In [None]:
tang =  np.tan(np.linspace(0, 3.14, 5))
cosines2 = np.cos(np.linspace(0, 3.14, 5))
tang

In [None]:
# Slicing works with the same semantics as Python lists.
sines[0]

In [None]:
cosines[0]

In [None]:
sines[:3]  # First three elements  

In [None]:
cosines[:4]  

In [None]:
sines[5:]  # Elements from 5 on.

In [None]:
cosines2[3:]  

In [None]:
sines[::2]  # Every other element.

In [None]:
cosines[::2]  

In [None]:
# More interesting: we can index with boolean arrays to filter by a predicate.
print("sines:\n", sines)
print("sines > 0.5:\n", sines > 0.5)
print("sines[sines > 0.5]:\n", sines[sines > 0.5])

In [None]:
# Boolean-mask filtering on `cosines`; each label names exactly the
# expression that is printed next to it.
print("cosines:\n", cosines)
print("cosines > 0.3:\n", cosines > 0.3)
print("cosines[cosines > 0.3]:\n", cosines[cosines > 0.3])

In [None]:
# We index with lists/arrays of integers to select values at those indices.
print(sines)
sines[[0, 4, 7]]

In [None]:
# We index with lists/arrays of integers to select values at those indices.
print(cosines)
cosines[[0,2,3,5,7,8]]

In [None]:
# Index arrays are often used for sorting one or more arrays.
unsorted_data = np.array([1, 3, 2, 12, -1, 5, 2])

In [None]:
# NOTE(review): the last entry is written `2-3`, which evaluates to -1 and
# yields a 6-element array. If two separate values (2 and -3) were intended,
# a comma is missing -- confirm.
desord2 = np.array([5,31,45,6,-12,2-3])

In [None]:
sort_indices = np.argsort(unsorted_data)
sort_indices

In [None]:
sort = np.argsort(desord2)
sort

In [None]:
unsorted_data[sort_indices]

In [None]:
desord2[sort]

In [None]:
market_caps = np.array([12, 6, 10, 5, 6])  # Presumably in dollars?
assets = np.array(['A', 'B', 'C', 'D', 'E'])

In [None]:
nums = np.array([15,23,46,2,1,-2,-98])  
assets2 = np.array(['A', 'B', 'C', 'D', 'E','F','G'])

In [None]:
# Sort assets by market cap by using the permutation that would sort market caps on ``assets``.
sort_by_mcap = np.argsort(market_caps)
assets[sort_by_mcap]

In [None]:
sort_by_num = np.argsort(nums)
assets2[sort_by_num]

In [None]:
# Indexers are also useful for aligning data.
print("Dates:\n", repr(event_dates))
print("Values:\n", repr(event_values))
print("Calendar:\n", repr(calendar))

In [None]:
print("Raw Dates:", event_dates)
print("Indices:", calendar.searchsorted(event_dates))
print("Forward-Filled Dates:", calendar[calendar.searchsorted(event_dates)])

On multi-dimensional arrays, we can slice along each axis independently.

In [None]:
data = np.arange(25).reshape(5, 5)
data

In [None]:
data2 = np.arange(16).reshape(4,4)
data2

In [None]:
data[:2, :2]  # First two rows and first two columns.

In [None]:
data2[:3, :3]  

In [None]:
data[:2, [0, -1]]  # First two rows, first and last columns.

In [None]:
data2[:3, [0, -2]] 

In [None]:
data[(data[:, 0] % 2) == 0]  # Rows where the first column is divisible by two.

In [None]:
data2[(data2[:, 3] % 3) == 0]  # Rows where the column at index 3 (the last column of this 4x4 array) is divisible by three.

# Selections Review

- Indexing with an integer removes a dimension.
- Slicing operations work on Numpy arrays the same way they do on lists.
- Indexing with a boolean array filters to True locations.
- Indexing with an integer array selects indices along an axis.
- Multidimensional arrays can apply selections independently along different axes.

## Reductions

Functions that reduce an array to a scalar.

$Var(X) = \frac{1}{N}\sum_{i=1}^N (x_i - \bar{x})^2$

In [None]:
def variance(x):
    """Return the population variance of ``x`` (mean squared deviation).

    Parameters
    ----------
    x : numpy.ndarray
        Array of numeric values. All elements are treated as one sample,
        whatever the array's shape.

    Returns
    -------
    The sum of squared deviations from ``x.mean()`` divided by the total
    number of elements.
    """
    # np.size counts every element; len(x) would only count the first axis
    # and therefore divide by the wrong N for multi-dimensional input.
    return ((x - x.mean()) ** 2).sum() / np.size(x)

In [None]:
variance(np.random.standard_normal(1000))

In [None]:
variance(np.random.standard_normal(500))

- `sum()` and `mean()` are both **reductions**.

- In the simplest case, we use these to reduce an entire array into a single value...

In [None]:
data = np.arange(30)
data.mean()

In [None]:
data2 = np.arange(100)
data2.mean()

- ...but we can do more interesting things with multi-dimensional arrays.

In [None]:
data = np.arange(30).reshape(3, 10)
data

In [None]:
data2 = np.arange(100).reshape(5, 20)
data2

In [None]:
data.mean()

In [None]:
data2.mean()

In [None]:
data.mean(axis=0)

In [None]:
data2.mean(axis=0)

In [None]:
data.mean(axis=1)

In [None]:
data2.mean(axis=1)

## Reductions Review

- Reductions allow us to perform efficient aggregations over arrays.
- We can do aggregations over a single axis to collapse a single dimension.
- Many built-in reductions (`mean`, `sum`, `min`, `max`, `median`, ...).

# Broadcasting

In [None]:
row = np.array([1, 2, 3, 4])
column = np.array([[1], [2], [3]])
print("Row:\n", row, sep='')
print("Column:\n", column, sep='')

In [None]:
row2 = np.array([5,3,2,1,5,6])
column2 = np.array([[1], [2], [3], [1], [2], [3], [1]])
print("Row:\n", row2, sep='')
print("Column:\n", column2, sep='')

In [None]:
row + column

In [None]:
row2 + column2

<center><img src="images/broadcasting.png" alt="Drawing" style="width: 60%;"/></center>

<h5>Source: http://www.scipy-lectures.org/_images/numpy_broadcasting.png</h5>

In [None]:
# Broadcasting is particularly useful in conjunction with reductions.
print("Data:\n", data, sep='')
print("Mean:\n", data.mean(axis=0), sep='')
print("Data - Mean:\n", data - data.mean(axis=0), sep='')

In [None]:
# Broadcasting is particularly useful in conjunction with reductions.
print("Data:\n", data2, sep='')
print("Mean:\n", data2.mean(axis=0), sep='')
print("Data - Mean:\n", data2 - data2.mean(axis=0), sep='')

# Broadcasting Review

- Numpy operations can work on arrays of different dimensions as long as the arrays' shapes are still "compatible".
- Broadcasting works by "tiling" the smaller array along the missing dimension.
- The result of a broadcasted operation is always at least as large in each dimension as the largest array in that dimension.

# Numpy Review

- Numerical algorithms are slow in pure Python because the overhead of dynamic dispatch dominates our runtime.

- Numpy solves this problem by:
  1. Imposing additional restrictions on the contents of arrays.
  2. Moving the inner loops of our algorithms into compiled C code.

- Using Numpy effectively often requires reworking an algorithm to use vectorized operations instead of for-loops, but the resulting operations are usually simpler, clearer, and faster than the pure Python equivalent.

<center><img src="images/unicorn.jpg" alt="Drawing" style="width: 75%;"/></center>

Numpy is great for many things, but...

- Sometimes our data is equipped with a natural set of **labels**:
  - Dates/Times
  - Stock Tickers
  - Field Names (e.g. Open/High/Low/Close)

- Sometimes we have **more than one type of data** that we want to keep grouped together.
  - Tables with a mix of real-valued and categorical data.

- Sometimes we have **missing** data, which we need to ignore, fill, or otherwise work around.

<center><img src="images/panda-wrangling.gif" alt="Drawing" style="width: 75%;"/></center>

<center><img src="images/pandas_logo.png" alt="Drawing" style="width: 75%;"/></center>


Pandas extends Numpy with more complex data structures:

- `Series`: 1-dimensional, homogeneously-typed, labelled array.
- `DataFrame`: 2-dimensional, semi-homogeneous, labelled table.

Pandas also provides many utilities for: 
- Input/Output
- Data Cleaning
- Rolling Algorithms
- Plotting

# Selection in Pandas

In [None]:
s = pd.Series(index=['a', 'b', 'c', 'd', 'e'], data=[1, 2, 3, 4, 5])
s

In [None]:
s2 = pd.Series(index=['1', '2', '3', '4'], data=[1, 2, 4, 8])
s2

In [None]:
# There are two pieces to a Series: the index and the values.
print("The index is:", s.index)
print("The values are:", s.values)

In [None]:
print("The index is:", s2.index)
print("The values are:", s2.values)

In [None]:
# We can look up values out of a Series by position...
s.iloc[0]

In [None]:
s2.iloc[0]

In [None]:
s.loc['a']

In [None]:
s2.loc['4']

In [None]:
# Slicing works as expected...
s.iloc[:2]

In [None]:

s2.iloc[:4]

In [None]:
# ...but it works with labels too!
s.loc[:'c']

In [None]:

s2.loc[:'3']

In [None]:
# Fancy indexing works the same as in numpy.
s.iloc[[0, -1]]

In [None]:

s2.iloc[[0, -1,-2]]

In [None]:
# As does boolean masking.
s.loc[s > 2]

In [None]:

s2.loc[s2 == 2]

In [None]:
# Element-wise operations are aligned by index.
other_s = pd.Series({'a': 10.0, 'c': 20.0, 'd': 30.0, 'z': 40.0})
other_s

In [None]:

other_s2 = pd.Series({'1': 10.1, '3': 20.0, '9': 30.4, '12': 40.0})
other_s2

In [None]:
s + other_s

In [None]:
s2 + other_s2

In [None]:
# We can fill in missing values with fillna().
(s + other_s).fillna(0.0)

In [None]:
(s2 + other_s2).fillna(0.0)

In [None]:
import pandas_datareader as pdr
import datetime

In [None]:
# Most real datasets are read in from an external file format.
aapl = pdr.get_data_yahoo('AAPL',start=datetime.datetime(2010,1,4),end=datetime.datetime(2012,2,1))
aapl.head(4)

In [None]:
# Slicing generalizes to two dimensions as you'd expect:
aapl.iloc[:2, :2]

In [None]:
aapl.iloc[:5, ]

In [None]:
aapl.loc[pd.Timestamp('2010-02-01'):pd.Timestamp('2010-02-04'), ['Close', 'Volume']]

In [None]:
aapl.loc[pd.Timestamp('2011-01-01'):pd.Timestamp('2011-01-10'), ['Open']]

# Rolling Operations

<center><img src="images/rolling.gif" alt="Drawing" style="width: 75%;"/></center>

In [None]:
aapl.rolling(5)[['Close', 'Adj Close']].mean().plot();

In [None]:
aapl.rolling(4)[['Close','Open','Adj Close']].mean().plot();

In [None]:
# Drop `Volume`, since it's way bigger than everything else.
aapl.drop('Volume', axis=1).resample('2W').max().plot();

In [None]:
aapl.drop("Volume", axis=1).resample('1A').first().plot();

In [None]:
# 30-day rolling exponentially-weighted stddev of returns.
aapl['Close'].pct_change().ewm(span=30).std().plot();

In [None]:
aapl['Adj Close'].pct_change().ewm(span=15).std().plot();

# "Real World" Data

In [None]:
from io import BytesIO
import os
from urllib.parse import urlencode

import requests
import numpy as np
import pandas as pd


def read_avocadata(start_date, end_date, cache_loc='avocadata.html'):
    """Download USDA retail avocado data and return it as a cleaned DataFrame.

    Parameters
    ----------
    start_date, end_date
        Anything accepted by ``pd.Timestamp``. They are sent to the report
        endpoint as ``repDate`` and ``endDate`` in ``MM/DD/YYYY`` format.
    cache_loc : str, optional
        Path where the raw download is stored. If this file already exists no
        request is made and the file is parsed as-is, so the dates are NOT
        used to select data in that case; delete the file to force a refresh.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``Unit`` is ``'each'``, with ``Organic`` converted to a
        boolean, the long greenskin variety name shortened to ``'GREENSKIN'``,
        ``Date`` parsed as UTC datetimes, ``Region`` normalised (``' U.S.'``
        removed, spaces replaced by underscores), and the ``Class``,
        ``Commodity``, ``Environment``, ``Unit`` and ``% Marked Local``
        columns dropped.

    Raises
    ------
    requests.HTTPError
        If the download is attempted and the server answers with an error
        status (via ``raise_for_status``).
    """
    start_date = pd.Timestamp(start_date)
    end_date = pd.Timestamp(end_date)
    base_url = 'https://www.marketnews.usda.gov/mnp/fv-report-retail'
    query_params = {
        'class': ['FRUITS'],
        'commodity': ['AVOCADOS'],
        'compareLy': ['No'],
        'endDate': [end_date.strftime("%m/%d/%Y")],
        'format': ['excel'],
        'organic': ['ALL'],
        'portal': ['fv'],
        'region': ['ALL'],
        'repDate': [start_date.strftime("%m/%d/%Y")],
        'type': ['retail'],
    }

    url = base_url + '?' + urlencode(query_params, doseq=True)

    if not os.path.exists(cache_loc):
        # Stream into a temporary file and rename it only once the download
        # has finished, so an interrupted transfer never leaves a truncated
        # file behind that later runs would mistake for a valid cache.
        partial_loc = str(cache_loc) + '.part'
        with requests.get(url, stream=True) as resp:
            resp.raise_for_status()
            with open(partial_loc, 'wb') as f:
                for block in resp.iter_content(chunk_size=4096):
                    f.write(block)
        os.replace(partial_loc, cache_loc)

    with open(cache_loc, 'rb') as f:
        frame = pd.read_html(f, header=0)[0]

    # Cleanup. Take an explicit copy of the filtered rows so the column
    # assignments below modify this frame rather than a view of the parsed one.
    frame = frame[frame['Unit'] == 'each'].copy()
    frame['Organic'] = (frame['Organic'] == 'Y')
    # Assign the result back instead of using a chained ``inplace=True``,
    # which is not guaranteed to write through to ``frame``.
    frame['Variety'] = frame['Variety'].replace(
        {'VARIOUS GREENSKIN VARIETIES': 'GREENSKIN'},
    )
    frame['Date'] = pd.to_datetime(frame['Date'].values, utc=True)

    # Literal (non-regex) replacements: the '.' characters in ' U.S.' must
    # not be interpreted as wildcards, whatever the pandas default is.
    frame['Region'] = frame['Region'].str.replace(' U.S.', '', regex=False)
    frame['Region'] = frame['Region'].str.replace(' ', '_', regex=False)

    # Drop useless columns.
    return frame.drop(
        ['Class', 'Commodity', 'Environment', 'Unit', '% Marked Local'],
        axis=1,
    )

In [None]:


avocados = read_avocadata('2014', '2016')
avocados.head()

In [None]:
# Unlike numpy arrays, pandas DataFrames can have a different dtype for each column.
avocados.dtypes

In [None]:
# What's the regional average price of a HASS avocado every day?
hass = avocados[avocados.Variety == 'HASS']
hass.groupby(['Date', 'Region'])['Weighted Avg Price'].mean().unstack().ffill().plot();

In [None]:
org = avocados[avocados.Organic == True]
org.groupby(['Date', 'Region'])['Number of Stores'].mean().unstack().ffill().plot();

In [None]:
def _organic_spread(group):

    if len(group.columns) != 2:
        return pd.Series(index=group.index, data=0.0)
    
    is_organic = group.columns.get_level_values('Organic').values.astype(bool)
    organics = group.loc[:, is_organic].squeeze()
    non_organics = group.loc[:, ~is_organic].squeeze()
    diff = organics - non_organics
    return diff

def organic_spread_by_region(df):
    """Compute the organic vs. non-organic price spread for every region.

    ``df`` is indexed by ``Date``, ``Region`` and ``Organic``; its
    ``'Weighted Avg Price'`` column is pivoted so each (Region, Organic) pair
    becomes a column, gaps are forward-filled, and ``_organic_spread`` is
    applied to each region's group of columns.
    """
    prices = df.set_index(['Date', 'Region', 'Organic'])['Weighted Avg Price']
    wide_prices = prices.unstack(level=['Region', 'Organic']).ffill()
    # NOTE(review): column-wise grouping via ``axis=1`` appears to be
    # deprecated in recent pandas releases -- confirm against the pandas
    # version this notebook targets.
    columns_by_region = wide_prices.groupby(level='Region', axis=1)
    return columns_by_region.apply(_organic_spread)

In [None]:
organic_spread_by_region(hass).plot();
plt.gca().set_title("Daily Regional Organic Spread");
plt.legend(bbox_to_anchor=(1, 1));

In [None]:
spread_correlation = organic_spread_by_region(hass).corr()
spread_correlation

In [None]:
import seaborn as sns
grid = sns.clustermap(spread_correlation, annot=True)
fig = grid.fig
axes = fig.axes
ax = axes[2]
ax.set_xticklabels(ax.get_xticklabels(), rotation=45);

## Ejemplo con datos abiertos del gobierno (https://www.datos.gov.co):

Con una api para leer archivos csv
Descargamos un archivo de Casos positivos de COVID-19 en Colombia y lo subimos a los archivos del notebook.

In [None]:
covid = pd.read_csv('Casos_positivos_de_COVID-19_en_Colombia.csv')


In [None]:
covid.head()

In [None]:
covid.dtypes

Creamos una gráfica con los primeros 600 registros que muestra la edad promedio de las personas recuperadas de COVID-19 en cada fecha de reporte, separada por sexo.

In [None]:
# Work on the first 600 records only. The boolean mask is built from the same
# 600-row frame it filters, so mask and frame share one index (a mask taken
# from the full `covid` frame would have to be re-aligned by pandas).
covid600 = covid.head(600)
recuperados = covid600[covid600.Recuperado == 'Recuperado']

# Mean age of recovered cases per report date, one line per sex.
recuperados.groupby(['fecha reporte web', 'Sexo'])['Edad'].mean().unstack().ffill().plot();


# Pandas Review

- Pandas extends numpy with more complex datastructures and algorithms.
- If you understand numpy, you understand 90% of pandas.
- `groupby`, `set_index`, and `unstack` are powerful tools for working with categorical data.
- Avocado prices are surprisingly interesting :)

# Thanks!