In [2]:
import pandas as pd
import numpy as np
import geopandas

In [3]:
#: Example dataset: the iris table from the seaborn-data repository, read
#: straight from its CSV so the seaborn package itself is not needed.
IRIS_CSV_URL = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
iris = pd.read_csv(IRIS_CSV_URL)
#: Preview the first five rows (five is the default for head())
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


# Selecting Data

`df[]` is overloaded and will do different things depending on what you pass to it:
1. **string**: returns all rows in the indicated **column** as a series
2. **list of strings**: returns all rows of the indicated **columns** as a single data frame. General case of #1, except it returns a dataframe instead of a series
3. **python-esque slices**: select **rows** (either by label or by index) (needs better example dataset)
4. **sequence of booleans**: all *rows* whose index matches the sequence index of a true value. This is where magic happens, because we can put conditional statements as the boolean sequence. The condition is evaluated on each row in the given column, and the resulting true/false value is passed to the indexing operator `[]` to select specific rows. 
   *The length of the sequence must match the number of rows in the dataframe.*

In [4]:
#: Single string: df['name'] selects that one column and returns it as a Series
iris['petal_length'].head(5)

0    1.4
1    1.4
2    1.3
3    1.5
4    1.4
Name: petal_length, dtype: float64

In [5]:
#: List of strings: df[['a', 'b']] selects those columns and returns a DataFrame
#: (note the double brackets: the inner pair is the Python list of column names)
iris[['petal_length', 'petal_width']].head(5)

Unnamed: 0,petal_length,petal_width
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2


In [6]:
#: Slicing: a Python-style slice selects rows rather than columns.
#: The stop value is exclusive, so 3:5 returns the rows at positions 3 and 4.
iris[3:5]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [7]:
#: Sequence of booleans: one True/False per row; only rows paired with True are kept.
#: A five-row frame is used so the sequence length matches the number of rows.
iris_head = iris.head(5)
sequence = [True, False, True, False, True]
iris_head[sequence]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [8]:
#: Conditional as sequence of booleans
#: Step 1: use [] as a column selector to pull out the series the condition will be tested on
series_to_test = iris_head['sepal_length']
series_to_test

0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
Name: sepal_length, dtype: float64

In [9]:
#: Step 2: evaluate the condition on that series. The comparison is applied to each
#: element and produces a boolean series with the same index.
test = series_to_test > 4.8
test

0     True
1     True
2    False
3    False
4     True
Name: sepal_length, dtype: bool

In [10]:
#: Step 3: pass the boolean series to [] as a sequence of booleans;
#: only the rows where it is True are kept
iris_head[test]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [11]:
#: The same three steps in one line: build the boolean series inline and select with it
iris_head[iris_head['sepal_length'] > 4.8]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [12]:
#: Compound conditionals combine boolean series with & (and), | (or) and ~ (not).
#: Each comparison needs its own parentheses because these operators bind more
#: tightly than comparisons such as >.
iris_head[(iris_head['sepal_length'] > 4.8) & (iris_head['sepal_width'] > 3.4)]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [13]:
#: Select all except certain values: isin() marks the rows whose value is in the
#: given collection, and ~ inverts that mask
excluded_by_list = iris[~iris['species'].isin(['setosa', 'virginica'])]
#: isin() can take other collections, like standalone series or series from other dataframes
other_values = pd.Series(['setosa', 'virginica'])
excluded_by_series = iris[~iris['species'].isin(other_values)]
#: Only the last expression in a cell is displayed, so the list-based result is kept in a
#: variable and checked against the series-based one instead of being silently discarded.
assert excluded_by_list.equals(excluded_by_series)
excluded_by_series

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
53,5.5,2.3,4.0,1.3,versicolor
54,6.5,2.8,4.6,1.5,versicolor
55,5.7,2.8,4.5,1.3,versicolor
56,6.3,3.3,4.7,1.6,versicolor
57,4.9,2.4,3.3,1.0,versicolor
58,6.6,2.9,4.6,1.3,versicolor
59,5.2,2.7,3.9,1.4,versicolor


# Grouping with groupby()

We can use the `.groupby()` method on a dataframe to divide it into specific groups and then run a function on each of those groups. This is a great way to get quick descriptive statistics about different groups within your dataset.

The simplest use of `.groupby()` is to pass a label or list of labels to define a series you want to group by. It then segregates your data based on the unique values within that series and runs whatever function you call afterwards on those groups. 

In [14]:
#: groupby: specify a grouping, optionally select a series using [], then call a function
#: that is evaluated once per group (here: the mean sepal_length for each species)
iris.groupby('species')['sepal_length'].mean()

species
setosa        5.006
versicolor    5.936
virginica     6.588
Name: sepal_length, dtype: float64

In [15]:
#: Normal - passing a column label from the same dataframe to determine groups
print(iris.groupby('species')['sepal_length'].count())
#: Does not give per-species counts - a series passed to groupby() is matched to the rows of
#: iris by index label, not by value. other_values only has the labels 0 and 1, so only those
#: two rows receive a group key and the result is two groups with one row each.
print(iris.groupby(other_values)['sepal_length'].count())
#: Works - we're explicitly passing the species series from the iris dataframe, so every row
#: has a key. Same output as the first call, different method of specifying groups.
print(iris.groupby(iris['species'])['sepal_length'].count())

species
setosa        50
versicolor    50
virginica     50
Name: sepal_length, dtype: int64
setosa       1
virginica    1
Name: sepal_length, dtype: int64
species
setosa        50
versicolor    50
virginica     50
Name: sepal_length, dtype: int64


# Selecting a Single Cell/Value

Sometimes you want to be able to access a single cell or value. If you have good labels, the `.loc[]` method allows you to access specific rows, columns, or cells: `df.loc['row', 'col']`

If you are thinking of a dataframe like a fancy 2D list, you may be tempted to use "chained indexing": `df.loc['row']['col']`. However, this is evaluated as two separate, chained operations: first, `df.loc['row']` returns the row from the dataframe, and then `['col']` selects the column from that row. pandas makes no guarantee about whether the first operation—`df.loc['row']`—returns a view of the original data or a copy. This may not matter for a one-off read operation, but will give inconsistent results when trying to write to the value. If the second operation got a copy, the copy will be updated and then thrown away while the original remains untouched.

If you want to assign values, use the `df.loc['row', 'col']` syntax.

In [16]:
#: Selecting single values for reading or writing

#: Work on a copy so the writes below do not change `iris` for any other cell
iris_demo = iris.copy()

#: print the second row
print(iris_demo[1:2])

#: BAD: Chained assignment
iris_demo.loc[1]['sepal_length'] = 20
#: iris_demo.loc[1] is evaluated first and returns an intermediate object, with no guarantee
#: whether it is a view or a copy; __setitem__ is then called on that object for 'sepal_length'.
#: When we look at the value again, maybe it will be set to 20, and maybe not
print(iris_demo[1:2])

#: Good: one .loc call with both row and column, so the write goes to the dataframe itself
iris_demo.loc[1, 'sepal_length'] = 40
print(iris_demo[1:2])

   sepal_length  sepal_width  petal_length  petal_width species
1           4.9          3.0           1.4          0.2  setosa
   sepal_length  sepal_width  petal_length  petal_width species
1           4.9          3.0           1.4          0.2  setosa
   sepal_length  sepal_width  petal_length  petal_width species
1          40.0          3.0           1.4          0.2  setosa


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [17]:
#: Contrast with plain Python: for nested lists, foo[0] hands back the inner list itself,
#: so chained indexing on the left of an assignment does change the original structure
foo = [
    ['a', 'b', 'c'],
    ['d', 'e', 'f'],
]
print(foo)
foo[0][0] = 'z'
print(foo)


[['a', 'b', 'c'], ['d', 'e', 'f']]
[['z', 'b', 'c'], ['d', 'e', 'f']]


# WIP

In [21]:
#: Check if value is in index:
print(1 in iris.index)
print(1000 in iris.index)

#: Check if value in column
#: Wrong: 'in df['column']' checks if it's in the index of that series, not in its values,
#: so this prints False even though 'setosa' is a value in the column
print('setosa' in iris['species'])
#: Right: use ser.array to get a zero-copy reference of the underlying data,
#: which `in` then searches by value
#: https://dev.pandas.io/pandas-blog/pandas-extension-arrays.html
print('setosa' in iris['species'].array)
print('foo' in iris['species'].array)


True
False
False
True
False


In [None]:
#: Get the data type of each column
iris.dtypes