In [2]:
import pandas
import numpy as np
import pprint

pretty = pprint.PrettyPrinter(indent=4)

obj2 = pandas.Series([4, 7, -5, 2], index=['a', 'b', 'c', 'd'])

sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj3 = pandas.Series(sdata)
obj4 = pandas.Series(sdata, index=states)  # Utah is left out

# print(obj2[['a', 'c']])
# print(obj2[obj2 > 2])
# print(pandas.isnull(obj4))
# print(pandas.notnull(obj4))

# print('\n\n')

data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
new_column_order = ['year', 'state', 'pop']
frame = pandas.DataFrame(data)
frame2 = pandas.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                          index=['one', 'two', 'three', 'four', 'five', 'six'])
# print(frame)
# simple to swap columns
frame = pandas.DataFrame(data, columns=new_column_order)
# print(frame)
states = frame['state']
# print('\nstates: \n', states)
# print('\n', frame2)

frame2['debt'] = 19
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,19
two,2001,Ohio,1.7,19
three,2002,Ohio,3.6,19
four,2001,Nevada,2.4,19
five,2002,Nevada,2.9,19
six,2003,Nevada,3.2,19


In [3]:
frame2['debt'] = np.arange(6.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [4]:
v = pandas.Series([-2,5,77], index=['one', 'four', 'six'])
frame2['debt'] = v
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,-2.0
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,5.0
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,77.0


In [5]:
# dynamically craete a new column using an expression to do vectorization <3
frame2['eastern'] = frame2['state'] == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,-2.0,True
two,2001,Ohio,1.7,,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,5.0,False
five,2002,Nevada,2.9,,False
six,2003,Nevada,3.2,77.0,False


In [6]:
# delete the previously created column
del frame2['eastern']
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [7]:
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}
}
frame3 = pandas.DataFrame(pop)
# Transpose
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [8]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [9]:
pdata = {
    'Ohio': frame3['Ohio'][:2],
    'Nevada': frame3['Nevada'][:-1]
}

pandas.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [10]:
# we can set the index and column names
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [11]:
# .values works w/dataframes and series
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [12]:
checks = [
    'Ohio' in frame3.columns, 2002 in frame3.index,
    'California' in frame3.columns, 2021 in frame3.index
]
checks

[True, True, False, False]

In [13]:
# pandas indexes are like fixed-width python sets, but allow dupes
idx = pandas.Index(['foo', 'foo', 'bar', 'bar', 'tro'])
idx2 = pandas.Index(['per', 'cep', 'bar', 'per'])

ops = [
    idx, idx2,
    # set like ops
    idx.append(idx2),
    idx.difference(idx2),
    idx.intersection(idx2),
    idx.union(idx2),
    idx.isin(idx2),
    idx.delete(2),
    idx.drop('foo'),
    #idx.insert('tro', 2), # not sure how to do this one
    idx.is_monotonic,
    idx.is_unique,
    idx.unique()
]
ops

[Index(['foo', 'foo', 'bar', 'bar', 'tro'], dtype='object'),
 Index(['per', 'cep', 'bar', 'per'], dtype='object'),
 Index(['foo', 'foo', 'bar', 'bar', 'tro', 'per', 'cep', 'bar', 'per'], dtype='object'),
 Index(['foo', 'tro'], dtype='object'),
 Index(['bar'], dtype='object'),
 Index(['bar', 'bar', 'cep', 'foo', 'foo', 'per', 'per', 'tro'], dtype='object'),
 array([False, False,  True,  True, False]),
 Index(['foo', 'foo', 'bar', 'tro'], dtype='object'),
 Index(['bar', 'bar', 'tro'], dtype='object'),
 False,
 False,
 Index(['foo', 'bar', 'tro'], dtype='object')]

In [14]:
obj = pandas.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [15]:
# re-index
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [16]:
obj3 = pandas.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [17]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [18]:
ra1 = np.arange(9).reshape(3,3)
frame = pandas.DataFrame(
    ra1, index=['a', 'c', 'd'], columns=['Ohio', 'Texas', 'California']
)
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [19]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [20]:
states = ['Texas', 'Utah', 'California']
frame.reindex(states)

Unnamed: 0,Ohio,Texas,California
Texas,,,
Utah,,,
California,,,


In [21]:
# frame.loc[['a','b','c','d'], states] # won't work anymore

obj = pandas.Series(np.arange(5.), index=['a','b','c','d','e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [22]:
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [23]:
obj.drop(['d', 'c'])
data = pandas.DataFrame(np.arange(16).reshape((4, 4)),
                index=['Ohio', 'Colorado', 'Utah', 'New York'],
                columns=['one', 'two', 'three', 'four'])
cdata = data.copy()
drop1 = data.drop('Utah')
# axis=1 means columns
drop2 = data.drop(['three', 'four'], axis=1)
drop3 = data.drop(['two'], axis=1)
# drop in place
data.drop(['Utah'], axis=0, inplace=True)
drop_set = [cdata, drop1, drop2, drop3, data]
drop_set

[          one  two  three  four
 Ohio        0    1      2     3
 Colorado    4    5      6     7
 Utah        8    9     10    11
 New York   12   13     14    15,
           one  two  three  four
 Ohio        0    1      2     3
 Colorado    4    5      6     7
 New York   12   13     14    15,
           one  two
 Ohio        0    1
 Colorado    4    5
 Utah        8    9
 New York   12   13,
           one  three  four
 Ohio        0      2     3
 Colorado    4      6     7
 Utah        8     10    11
 New York   12     14    15,
           one  two  three  four
 Ohio        0    1      2     3
 Colorado    4    5      6     7
 New York   12   13     14    15]

# Indexing, Selection, & Filtering for DataFrames and Series

In [24]:
obj = pandas.Series(np.arange(10) * 5.75, index=['a','b','c','d','e','f','g','h','i','j'])
index_select_filter = [
    obj, obj[1], obj[['b']],
    obj[['a','e','j']],
    obj[2:5],
    obj[obj > 5],
    obj['h':'j'],
]
obj['a':'c'] = 5
index_select_filter

[a     5.00
 b     5.00
 c     5.00
 d    17.25
 e    23.00
 f    28.75
 g    34.50
 h    40.25
 i    46.00
 j    51.75
 dtype: float64,
 5.75,
 b    5.75
 dtype: float64,
 a     0.00
 e    23.00
 j    51.75
 dtype: float64,
 c     5.00
 d    17.25
 e    23.00
 dtype: float64,
 b     5.75
 c    11.50
 d    17.25
 e    23.00
 f    28.75
 g    34.50
 h    40.25
 i    46.00
 j    51.75
 dtype: float64,
 h    40.25
 i    46.00
 j    51.75
 dtype: float64]

In [25]:
data = pandas.DataFrame(np.arange(16).reshape(4,4),
                     index=['Ohio', 'Colorado', 'Utah', 'New York'],
                     columns=['one', 'two', 'three', 'four'])
index_select_filter = [
    data,
    # select columns
    data[['two']],
    data[['three', 'four']],
    # a few special cases that'll select rows
    data[:2], # select 1st two rows
    data[data['three'] > 4], # boolean array
]
index_select_filter

[          one  two  three  four
 Ohio        0    1      2     3
 Colorado    4    5      6     7
 Utah        8    9     10    11
 New York   12   13     14    15,
           two
 Ohio        1
 Colorado    5
 Utah        9
 New York   13,
           three  four
 Ohio          2     3
 Colorado      6     7
 Utah         10    11
 New York     14    15,
           one  two  three  four
 Ohio        0    1      2     3
 Colorado    4    5      6     7,
           one  two  three  four
 Colorado    4    5      6     7
 Utah        8    9     10    11
 New York   12   13     14    15]

In [26]:
# boolean DataFrame
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [27]:
# set all vals < 5 to 0
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


## loc & iloc

In [28]:
data.loc['Ohio', 'two'] = 22
# select column `two` and `three` from the Colorado column index
loc_iloc = [
    data,
    data.loc['Colorado', ['two', 'three']],
    data.iloc[2, [3,0,1]],
    # select Utah row
    data.iloc[2],
    # select rows [Colorado, Utah] col [four, one, two]
    data.iloc[[1,2], [3,0,1]],
    data.loc[:'Utah', 'one':'three'],
    data.loc[:'Utah', 'two'],
    data.iloc[:, :3][data['three'] > 5]
]
loc_iloc

[          one  two  three  four
 Ohio        0   22      0     0
 Colorado    0    5      6     7
 Utah        8    9     10    11
 New York   12   13     14    15,
 two      5
 three    6
 Name: Colorado, dtype: int64,
 four    11
 one      8
 two      9
 Name: Utah, dtype: int64,
 one       8
 two       9
 three    10
 four     11
 Name: Utah, dtype: int64,
           four  one  two
 Colorado     7    0    5
 Utah        11    8    9,
           one  two  three
 Ohio        0   22      0
 Colorado    0    5      6
 Utah        8    9     10,
 Ohio        22
 Colorado     5
 Utah         9
 Name: two, dtype: int64,
           one  two  three
 Colorado    0    5      6
 Utah        8    9     10
 New York   12   13     14]

### Using pandas.DataFrame.add()

In [29]:
df1 = pandas.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = pandas.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [30]:
equivalents = [
    1 / df1,
    df1.rdiv(1)
]
equivalents

[       a         b         c         d
 0    inf  1.000000  0.500000  0.333333
 1  0.250  0.200000  0.166667  0.142857
 2  0.125  0.111111  0.100000  0.090909,
        a         b         c         d
 0    inf  1.000000  0.500000  0.333333
 1  0.250  0.200000  0.166667  0.142857
 2  0.125  0.111111  0.100000  0.090909]

### Operations between DataFrame and Series

In [32]:
arr = np.arange(12.).reshape(3,4)
np_broadcasting = [
    arr,
    arr[0],
    # broadcasting operation
    arr - arr[0]
]
np_broadcasting

[array([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.]]),
 array([0., 1., 2., 3.]),
 array([[0., 0., 0., 0.],
        [4., 4., 4., 4.],
        [8., 8., 8., 8.]])]

In [41]:
# broadcasting is similar with dataframes and series
frame = pandas.DataFrame(np.arange(12.).reshape((4, 3)),
                         columns=list('bde'),
                         index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]
series2 = pandas.Series(range(3), index=list('bef'))
series3 = frame['d']
frame_series_broadcasting = [
    frame, '','',
    series, '','',
    frame - series, '','',
    # will re-index and union
    frame + series2, '','',
    frame.sub(series3, axis='index')
]
frame_series_broadcasting

[          b     d     e
 Utah    0.0   1.0   2.0
 Ohio    3.0   4.0   5.0
 Texas   6.0   7.0   8.0
 Oregon  9.0  10.0  11.0,
 '',
 '',
 b    0.0
 d    1.0
 e    2.0
 Name: Utah, dtype: float64,
 '',
 '',
           b    d    e
 Utah    0.0  0.0  0.0
 Ohio    3.0  3.0  3.0
 Texas   6.0  6.0  6.0
 Oregon  9.0  9.0  9.0,
 '',
 '',
           b   d     e   f
 Utah    0.0 NaN   3.0 NaN
 Ohio    3.0 NaN   6.0 NaN
 Texas   6.0 NaN   9.0 NaN
 Oregon  9.0 NaN  12.0 NaN,
           b    d    e
 Utah   -1.0  0.0  1.0
 Ohio   -1.0  0.0  1.0
 Texas  -1.0  0.0  1.0
 Oregon -1.0  0.0  1.0]

## Function Application and Mapping

In [46]:
def max_minus_min(x):
    debug = 1
    return x.max() - x.min()

def min_max_series(x):
    debug = 1
    return pandas.Series([x.min(), x.max()], index=['min', 'max'])

lambda_format = lambda x: '%.2f' % x

frame = pandas.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                         index=['Utah', 'Ohio', 'Texas', 'Oregon'])
function_maps = [
    frame, '','',
    # np.abs() will make all the values positive
    #np.abs(frame), '','',
    # apply a function to each column of $frame
    frame.apply(max_minus_min, axis='index'), '','',
    # apply a function to each row of $frame
    frame.apply(max_minus_min, axis='columns'), '','',
    # apply a function that returns a series
    frame.apply(min_max_series), '','',
    # using .applymap(), this functions works on all cells in the df
    frame.applymap(lambda_format), '','',
    # compare the difference to .map(), this gets called on a single column
    frame['e'].map(lambda_format)
]
function_maps

[               b         d         e
 Utah   -0.720292  0.697719 -0.757862
 Ohio    0.442913  0.554588  1.218932
 Texas  -0.134889  0.013728  0.839697
 Oregon  0.827631 -1.418712  2.335043,
 '',
 '',
 b    1.547923
 d    2.116431
 e    3.092905
 dtype: float64,
 '',
 '',
 Utah      1.455581
 Ohio      0.776019
 Texas     0.974586
 Oregon    3.753755
 dtype: float64,
 '',
 '',
             b         d         e
 min -0.720292 -1.418712 -0.757862
 max  0.827631  0.697719  2.335043,
 '',
 '',
             b      d      e
 Utah    -0.72   0.70  -0.76
 Ohio     0.44   0.55   1.22
 Texas   -0.13   0.01   0.84
 Oregon   0.83  -1.42   2.34,
 '',
 '',
 Utah      -0.76
 Ohio       1.22
 Texas      0.84
 Oregon     2.34
 Name: e, dtype: object]

## Sorting & Ranking