In [2]:
import pandas as pd
import numpy as np

In [3]:
# Eight consecutive daily timestamps starting on 2000-01-01 form the row index
dates = pd.date_range('1/1/2000', periods=8)
# 8x4 frame of standard-normal draws (no seed is set, so values differ per run)
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.313735,-1.315145,-0.087447,-0.180478
2000-01-02,1.05114,0.762399,1.123555,0.581983
2000-01-03,0.225854,0.755792,-0.05396,0.156325
2000-01-04,0.514486,-1.374129,-0.736705,2.126524
2000-01-05,0.148738,-0.117015,0.910745,1.352871
2000-01-06,-1.372446,-0.593125,0.646785,1.242598
2000-01-07,-0.50295,0.122695,0.862757,1.117355
2000-01-08,-1.739605,0.199066,-1.079961,0.576653


In [4]:
s = df['A'] #selecting a single column returns a Series
s[dates[5]] #dates[5] is an index label (a Timestamp), so this looks the value up by label, not by position

-1.3724458969452928

In [5]:
# Pick columns A and B first, then take the first two rows positionally
df[['A', 'B']].iloc[:2]

Unnamed: 0,A,B
2000-01-01,-0.313735,-1.315145
2000-01-02,1.05114,0.762399


In [6]:
df.loc['20000102':'20000104',['A','B']] #first arg selects rows (label slice, end label included), second selects columns

Unnamed: 0,A,B
2000-01-02,1.05114,0.762399
2000-01-03,0.225854,0.755792
2000-01-04,0.514486,-1.374129


In [7]:
# Assigning a whole row through iloc using a dict of column -> value
x = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})
x.iloc[1] = dict(x=9, y=99)
x

Unnamed: 0,x,y
0,1,3
1,9,99
2,3,5


In [8]:
# NOTE: this rebinds df, replacing the date-indexed frame created earlier in the notebook
df = pd.DataFrame({'one': [1., 2., 3.]})
df.two = [4, 5, 6] #attribute access cannot create a new column: this only sets a Python attribute on the object; use df['two'] = ... to add a column

  


In [9]:
# Reverse the Series with a positional slice that steps backwards
s.iloc[::-1]

2000-01-08   -1.739605
2000-01-07   -0.502950
2000-01-06   -1.372446
2000-01-05    0.148738
2000-01-04    0.514486
2000-01-03    0.225854
2000-01-02    1.051140
2000-01-01   -0.313735
Freq: -1D, Name: A, dtype: float64

In [10]:
# Small labelled frame: rows 'a'..'f', columns 'A'..'D', standard-normal values
df1 = pd.DataFrame(
    np.random.randn(6, 4),
    index=['a', 'b', 'c', 'd', 'e', 'f'],
    columns=['A', 'B', 'C', 'D'],
)
df1

Unnamed: 0,A,B,C,D
a,0.977417,0.560201,0.115741,-0.795386
b,0.614536,0.926422,0.787046,-0.783104
c,-0.541117,-1.04802,0.713564,-1.491643
d,-0.600972,0.403198,-0.828831,-0.570005
e,0.681902,-1.216068,0.457316,-1.232398
f,-0.473426,-1.215047,0.078396,-0.615548


In [11]:
# Cross-section: pull out the row labelled 'a' as a Series
df1.xs('a', axis=0)

# Equivalent label-based selection (this is the value the cell displays)
df1.loc['a', :]

A    0.977417
B    0.560201
C    0.115741
D   -0.795386
Name: a, dtype: float64

In [12]:
# Boolean selection: row 'a' compared with 0 gives one True/False per column
df1.loc['a'].gt(0)
# Use that boolean Series to keep only the columns where row 'a' is positive
df1.loc[:, df1.loc['a'].gt(0)]

Unnamed: 0,A,B,C
a,0.977417,0.560201,0.115741
b,0.614536,0.926422,0.787046
c,-0.541117,-1.04802,0.713564
d,-0.600972,0.403198,-0.828831
e,0.681902,-1.216068,0.457316
f,-0.473426,-1.215047,0.078396


In [13]:
# Fetch a single scalar by row label and column label
df1.at['a', 'A']

0.9774173150204551

In [14]:
# Label slicing on an unsorted index follows the stored order of the labels,
# not their numeric order, and the stop label is included in the result.
s = pd.Series(['a', 'b', 'c', 'd', 'e'], index=[0, 3, 2, 5, 4])
s.loc[3:5]

3    b
2    c
5    d
dtype: object

In [15]:
# With the index sorted, the same label slice now returns labels 3, 4 and 5 in order
s = s.sort_index()
s.loc[3:5]

3    b
4    e
5    d
dtype: object

In [16]:
# Indexing by callable: the function receives the frame and returns a row mask
df1.loc[lambda frame: frame['A'] > 0, :]

Unnamed: 0,A,B,C,D
a,0.977417,0.560201,0.115741,-0.795386
b,0.614536,0.926422,0.787046,-0.783104
e,0.681902,-1.216068,0.457316,-1.232398


In [17]:
# A callable also works with iloc; here it returns the column positions to keep
df1.iloc[:, lambda frame: [0, 1]]

Unnamed: 0,A,B
a,0.977417,0.560201
b,0.614536,0.926422
c,-0.541117,-1.04802
d,-0.600972,0.403198
e,0.681902,-1.216068
f,-0.473426,-1.215047


In [18]:
# Callable indexing on a single column: keep the positive entries of column A
df1['A'].loc[lambda col: col > 0]

a    0.977417
b    0.614536
e    0.681902
Name: A, dtype: float64

In [19]:
#Reindexing
s = pd.Series([1, 2, 3]) #Default index of 0,1,2
s = s.reindex([1,2,3]) #Loses what was at index zero and adds index 3 (filled with NaN, so the dtype becomes float64)
s

1    2.0
2    3.0
3    NaN
dtype: float64

In [20]:
# Selecting random samples
s = pd.Series(range(6))
s.sample()  # with no arguments a single row is returned

1    1
dtype: int64

In [21]:
s.sample(n=3) #Returns 3 randomly chosen rows

5    5
1    1
2    2
dtype: int64

In [22]:
s.sample(frac=0.5)  #Returns a fraction of the rows (here half of the 6 rows, i.e. 3)

5    5
4    4
0    0
dtype: int64

In [23]:
# Default behavior is sampling without replacement; replace=True allows the same row to be drawn more than once
s.sample(n=3, replace=True)

1    1
0    0
0    0
dtype: int64

In [24]:
# Default behavior is for uniform probabilities but can be changed
s.sample(n=3, weights=[0,0,0.2,0.4,0.2,0.2]) # If they don't sum to 1 they will be re-normalized

3    3
5    5
2    2
dtype: int64

In [25]:
# For a DataFrame, the name of a column can be given as the sampling weights
df2 = pd.DataFrame(
    {
        'col1': [9, 8, 7, 6],
        'weight_column': [0.5, 0.4, 0.1, 0],
    }
)
df2.sample(n=3, weights='weight_column')

Unnamed: 0,col1,weight_column
0,9,0.5
2,7,0.1
1,8,0.4


In [26]:
# Sample columns instead of rows by pointing the axis argument at the columns
df3 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4]})
df3.sample(n=1, axis='columns')

Unnamed: 0,col1
0,1
1,2
2,3


In [27]:
# Set a seed for the random number generator. Useful for replicating results:
# the same random_state gives the same sample on every run
df3.sample(n=2, random_state=2)

Unnamed: 0,col1,col2
2,3,4
1,2,3


In [28]:
# Using map()
df2 = pd.DataFrame(
    {
        'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
        'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
        'c': np.random.randn(7),
    }
)

# map() applies the function to each element of column 'a', producing a
# boolean Series that is True where the string starts with 't'
criterion = df2['a'].map(lambda label: label.startswith('t'))
df2.loc[criterion]

Unnamed: 0,a,b,c
2,two,y,-0.893961
3,three,x,0.379189
4,two,y,-0.62668


In [29]:
# The same filter built with a list comprehension (slower than using map)
df2['a'][[label.startswith('t') for label in df2['a']]]

2      two
3    three
4      two
Name: a, dtype: object

In [30]:
# Combining multiple criteria with &; the column slice 'b':'c' includes both ends
df2.loc[criterion & df2['b'].eq('x'), 'b':'c']

Unnamed: 0,b,c
3,x,0.379189


In [31]:
# Indexing with isin(): values 0..4 stored under the reversed labels 4..0
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
# Membership test on the VALUES (6 is absent, so it simply never matches)
s.isin([2, 4, 6])
s.loc[s.isin([2, 4, 6])]

2    2
0    4
dtype: int64

In [32]:
# isin() can also be used on the index object to filter by label instead of by value
type(s.index) # not the last expression in the cell, so its value is not displayed
s[s.index.isin([2,4,6])]

4    0
2    2
dtype: int64

In [33]:
# Creating a MultiIndex from the cartesian product of two label lists
s_mi = pd.Series(
    np.arange(6),
    index=pd.MultiIndex.from_product([[0, 1], list('abc')]),
)
s_mi

0  a    0
   b    1
   c    2
1  a    3
   b    4
   c    5
dtype: int64

In [34]:
# Tuples are matched against full (level 0, level 1) index entries; (2,'b') is not in the index so it matches nothing
s_mi.iloc[s_mi.index.isin([(1,'a'),(2,'b'),(0,'c')])]

0  c    2
1  a    3
dtype: int64

In [35]:
# level restricts the membership test to one level of the MultiIndex (here level 1, the letters)
s_mi.iloc[s_mi.index.isin(['a','c','e'], level=1)]

0  a    0
   c    2
1  a    3
   c    5
dtype: int64

In [36]:
# Same idea on level 0: keep every entry whose first-level label is 0
s_mi.iloc[s_mi.index.isin([0], level=0)]

0  a    0
   b    1
   c    2
dtype: int64

In [37]:
# isin() for DataFrames
df = pd.DataFrame(
    {
        'vals': [1, 2, 3, 4],
        'ids': ['a', 'b', 'f', 'n'],
        'ids2': ['a', 'n', 'c', 'n'],
    }
)

values = ['a', 'b', 1, 3]

# A plain list is tested against every cell of every column
df.isin(values)

Unnamed: 0,vals,ids,ids2
0,True,True,True
1,False,True,False
2,True,False,False
3,False,False,False


In [38]:
# With a dict, each key names a column and its list holds the values accepted
# for that column; columns without a key (ids2 here) come back all False
values = dict(ids=['a', 'b'], vals=[1, 3])
df.isin(values)

Unnamed: 0,vals,ids,ids2
0,True,True,False
1,False,True,False
2,True,False,False
3,False,False,False


In [39]:
# Using all()
values = dict(ids=['a', 'b'], ids2=['a', 'c'], vals=[1, 3])

# Reduce across the columns (axis=1): a row passes only if every column matched
row_mask = df.isin(values).all(axis=1)

df.loc[row_mask]

Unnamed: 0,vals,ids,ids2
0,1,a,a


In [40]:
#Using any()
# axis is passed by keyword: DataFrame.any() no longer accepts positional
# arguments in pandas >= 2.0, so .any(1) raises a TypeError there
row_mask = df.isin(values).any(axis=1)

#In this case, True must be present in at least one column of the row
df[row_mask]

Unnamed: 0,vals,ids,ids2
0,1,a,a
1,2,b,n
2,3,f,c


In [41]:
# Using where() maintains shape of the data
s[s > 0] # will only return values where condition is True (not displayed: it is not the cell's last expression)

s.where(s > 0) # Returns all the data substituting NaN where False

4    NaN
3    1.0
2    2.0
1    3.0
0    4.0
dtype: float64

In [42]:
# For DataFrames, indexing with a boolean frame uses where() under the hood:
# the shape is kept and entries failing the condition become NaN
t_index = pd.date_range('01/01/2000', periods=8)
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=t_index,
    columns=list('ABCD'),
)
df[df < 0]

Unnamed: 0,A,B,C,D
2000-01-01,,,-0.045468,-0.667316
2000-01-02,,,,
2000-01-03,-0.570332,,-0.624371,
2000-01-04,,-0.528085,,
2000-01-05,,-0.822426,-0.748668,
2000-01-06,-0.633141,,-2.009539,
2000-01-07,,,-0.177538,
2000-01-08,,,-0.581658,-0.016356


In [43]:
# Show the full frame for comparison with the masked view from the previous cell
df

Unnamed: 0,A,B,C,D
2000-01-01,2.146978,0.874143,-0.045468,-0.667316
2000-01-02,1.252431,0.208039,0.085825,1.167649
2000-01-03,-0.570332,1.735689,-0.624371,1.471479
2000-01-04,0.098023,-0.528085,0.663614,0.81375
2000-01-05,0.121753,-0.822426,-0.748668,0.808512
2000-01-06,-0.633141,0.890745,-2.009539,1.073749
2000-01-07,0.10937,0.824949,-0.177538,1.239689
2000-01-08,0.558801,0.859138,-0.581658,-0.016356


In [44]:
# where() accepts an optional 'other' value that is substituted wherever the
# condition is False
df.where(df < 0, other='other')

Unnamed: 0,A,B,C,D
2000-01-01,other,other,-0.0454682,-0.667316
2000-01-02,other,other,other,other
2000-01-03,-0.570332,other,-0.624371,other
2000-01-04,other,-0.528085,other,other
2000-01-05,other,-0.822426,-0.748668,other
2000-01-06,-0.633141,other,-2.00954,other
2000-01-07,other,other,-0.177538,other
2000-01-08,other,other,-0.581658,-0.0163555


In [45]:
# Add inplace argument
# NOTE: inplace=True modifies df itself instead of returning a new frame, so the
# original values of the replaced entries are lost after this cell
df.where(df < 0, 'other', inplace=True)
df

Unnamed: 0,A,B,C,D
2000-01-01,other,other,-0.0454682,-0.667316
2000-01-02,other,other,other,other
2000-01-03,-0.570332,other,-0.624371,other
2000-01-04,other,-0.528085,other,other
2000-01-05,other,-0.822426,-0.748668,other
2000-01-06,-0.633141,other,-2.00954,other
2000-01-07,other,other,-0.177538,other
2000-01-08,other,other,-0.581658,-0.0163555


In [46]:
# Rebuild a fresh numeric frame (the previous df was overwritten in place)
t_index = pd.date_range('01/01/2000', periods=8)
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=t_index,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2000-01-01,0.307384,0.457322,0.340965,-0.86485
2000-01-02,2.407761,-0.469069,-0.347105,0.597627
2000-01-03,1.972368,-2.416447,-0.224599,-0.076663
2000-01-04,-0.084463,0.429262,1.106667,0.830873
2000-01-05,-0.871253,-0.899839,-0.697479,-0.889086
2000-01-06,-0.924763,1.026865,0.480545,0.198664
2000-01-07,2.010489,0.104186,0.277393,0.620773
2000-01-08,-0.095355,0.79712,-0.142874,0.196737


In [47]:
# Non-positive entries are replaced by the value of column A from the same row
# (the replacement Series is aligned along the row index)
df.where(df.gt(0), other=df['A'], axis=0)

Unnamed: 0,A,B,C,D
2000-01-01,0.307384,0.457322,0.340965,0.307384
2000-01-02,2.407761,2.407761,2.407761,0.597627
2000-01-03,1.972368,1.972368,1.972368,1.972368
2000-01-04,-0.084463,0.429262,1.106667,0.830873
2000-01-05,-0.871253,-0.871253,-0.871253,-0.871253
2000-01-06,-0.924763,1.026865,0.480545,0.198664
2000-01-07,2.010489,0.104186,0.277393,0.620773
2000-01-08,-0.095355,0.79712,-0.095355,0.196737


In [48]:
# mask() is the inverse of where(): entries where the condition is True become NaN
s.mask(s.ge(0))

4   NaN
3   NaN
2   NaN
1   NaN
0   NaN
dtype: float64

In [49]:
# Same on a DataFrame: hide every non-negative entry, keep the negatives
df.mask(df.ge(0))

Unnamed: 0,A,B,C,D
2000-01-01,,,,-0.86485
2000-01-02,,-0.469069,-0.347105,
2000-01-03,,-2.416447,-0.224599,-0.076663
2000-01-04,-0.084463,,,
2000-01-05,-0.871253,-0.899839,-0.697479,-0.889086
2000-01-06,-0.924763,,,
2000-01-07,,,,
2000-01-08,-0.095355,,-0.142874,


In [50]:
# Example of query() referring to the index by the name 'index'
df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
df.query('index > 2')

Unnamed: 0,A,B,C,D
3,1.292645,-0.24755,1.196289,1.365438
4,-0.301378,-0.710563,-0.434501,-1.895996
5,0.2222,-0.422146,0.244724,-0.0621
6,0.163037,-1.681223,0.171412,-0.078247
7,-0.840757,0.137123,-1.559797,-1.118832
8,0.040331,-0.582139,-0.460669,-0.94586
9,-0.041674,-0.815364,-0.433362,0.192458


In [51]:
# Rows where A is below B and B is below C (the chained comparison spelled out)
df.query('(A < B) & (B < C)')

Unnamed: 0,A,B,C,D
0,-0.864792,-0.117632,1.310392,-0.368756


In [52]:
# Draw n labels at random from the two colour names
n = 10
colors = np.random.choice(['red', 'green'], n)
len(colors)

10

In [53]:
# Removing duplicate data: sample frame with repeated values in 'a' and 'b'
df2 = pd.DataFrame(
    {
        'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
        'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
        'c': np.random.randn(7),
    }
)


In [87]:
# Flag rows whose 'a' value has already appeared earlier in the frame
df2.duplicated(subset='a')

0    False
1     True
2    False
3     True
4     True
5    False
6    False
dtype: bool

In [67]:
# keep chooses which occurrence counts as the original: with 'last', every
# occurrence except the final one is flagged as a duplicate
df2.duplicated(subset='a', keep='last')

0     True
1    False
2     True
3     True
4    False
5    False
6    False
dtype: bool

In [69]:
# Drop the flagged rows, keeping the last row for each distinct 'a' value
df2.drop_duplicates(subset='a', keep='last')

Unnamed: 0,a,b,c
1,one,y,-1.011641
4,two,x,0.802726
5,three,x,0.211288
6,four,x,0.017918


In [71]:
# With a list of columns, a row is a duplicate only if the whole ('a', 'b')
# combination has been seen before
df2.duplicated(subset=['a', 'b'])

0    False
1    False
2    False
3    False
4     True
5    False
6    False
dtype: bool

In [73]:
# Remove rows that repeat an earlier ('a', 'b') combination
df2.drop_duplicates(subset=['a', 'b'])

Unnamed: 0,a,b,c
0,one,x,-0.009653
1,one,y,-1.011641
2,two,x,0.233188
3,two,y,1.130403
5,three,x,0.211288
6,four,x,0.017918


In [74]:
# Duplicate indices: the labels 'a' and 'b' each occur more than once
df3 = pd.DataFrame(
    {'a': np.arange(6), 'b': np.random.randn(6)},
    index=list('aabcba'),
)

In [76]:
# Boolean array marking index labels that repeat an earlier label
df3.index.duplicated(keep='first')

array([False,  True, False, False,  True,  True])

In [77]:
# Keep only the last occurrence of each index label: duplicated(keep='last') flags the earlier ones, ~ inverts the mask
df3[~df3.index.duplicated(keep='last')]

Unnamed: 0,a,b
c,3,0.9309
b,4,-0.886173
a,5,1.197762


In [79]:
# keep=False flags every label that occurs more than once, so only labels that appear exactly once survive
df3[~df3.index.duplicated(keep=False)]

Unnamed: 0,a,b
c,3,0.9309


In [80]:
# Dict-like get(): look a label up and return its value
s = pd.Series([1, 2, 3], index=list('abc'))

s.get('a')

1

In [84]:
# A missing label returns the supplied fallback value
s.get('x', -1)

-1

In [85]:
# Label-based "lookup": pick one value per (row label, column label) pair.
# DataFrame.lookup() was deprecated in pandas 1.2 and removed in 2.0, so the
# same result is obtained with positional NumPy indexing instead.
df4 = pd.DataFrame(np.random.randn(20,4), columns = ['A','B','C','D'])

# i-th row label is paired with the i-th column label
lookup_rows = list(range(0,10,2))
lookup_cols = ['B','C','A','B','D']

# Translate labels to integer positions, then index the underlying array with
# the two position arrays to get a 1-D array with one element per pair
picked_values = df4.to_numpy()[df4.index.get_indexer(lookup_rows),
                               df4.columns.get_indexer(lookup_cols)]
picked_values

array([-0.35000426,  1.70487396, -0.79512162,  0.65272395,  0.81721816])

In [88]:
# Creating an Index from a list of labels
index = pd.Index(list('edab'))

In [89]:
# Creating an Index that also carries a name
index = pd.Index(list('edab'), name='something')

In [92]:
# One column of four random values, using the named Index as the row labels
df5 = pd.DataFrame(np.random.randn(4), index=index)
df5

Unnamed: 0_level_0,0
something,Unnamed: 1_level_1
e,-0.899451
d,1.137146
a,-0.929942
b,-1.267162


In [98]:
# Set operations on Index objects
a = pd.Index(['c','b','a'])
b = pd.Index(['c','e','d'])

#Union
# Use the named method: since pandas 2.0 the `a | b` operator on an Index is an
# element-wise logical operation, not a set union.
a.union(b)

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [99]:
#Intersection
# Use the named method: since pandas 2.0 the `a & b` operator on an Index is an
# element-wise logical operation, not a set intersection.
a.intersection(b)

Index(['c'], dtype='object')

In [95]:
#Difference: labels in a that are not in b
a.difference(b)

Index(['a', 'b'], dtype='object')

In [96]:
# XOR (symmetric difference)
# Use the named method: since pandas 2.0 the `a ^ b` operator on an Index is an
# element-wise logical operation, not a set operation.
a.symmetric_difference(b)

Index(['a', 'b', 'd', 'e'], dtype='object')

In [97]:
#XOR (symmetric difference): labels that appear in exactly one of a and b
a.symmetric_difference(b)

Index(['a', 'b', 'd', 'e'], dtype='object')

In [103]:
# Filling missing values on an Index object
idx1 = pd.Index([1, np.nan, 3, 4])
print(idx1)

# fillna returns a new Index with the NaN entry replaced by the given value
idx1.fillna(value=2)

Float64Index([1.0, nan, 3.0, 4.0], dtype='float64')


Float64Index([1.0, 2.0, 3.0, 4.0], dtype='float64')

In [107]:
# Setting an index from columns: start from a plain frame with columns a..d
data = pd.DataFrame(
    {
        'a': ['bar', 'bar', 'foo', 'foo'],
        'b': ['one', 'two', 'one', 'two'],
        'c': ['z', 'y', 'x', 'w'],
        'd': range(1, 5),
    },
    columns=['a', 'b', 'c', 'd'],
)
data

Unnamed: 0,a,b,c,d
0,bar,one,z,1
1,bar,two,y,2
2,foo,one,x,3
3,foo,two,w,4


In [110]:
# Promote column 'c' to the row index (a new frame is returned; data is unchanged)
data.set_index(keys='c')

Unnamed: 0_level_0,a,b,d
c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
z,bar,one,1
y,bar,two,2
x,foo,one,3
w,foo,two,4


In [112]:
# A list of columns produces a two-level MultiIndex (a, b)
data.set_index(keys=['a', 'b'])

Unnamed: 0_level_0,Unnamed: 1_level_0,c,d
a,b,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,z,1
bar,two,y,2
foo,one,x,3
foo,two,w,4


In [117]:
# append=True adds the new columns as extra index levels instead of replacing
# the existing index; drop=True removes 'c' from the columns (drop=False would
# keep 'c' as a column too)
frame = (
    data
    .set_index('c', drop=True)
    .set_index(['a', 'b'], append=True)
)

frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,d
c,a,b,Unnamed: 3_level_1
z,bar,one,1
y,bar,two,2
x,foo,one,3
w,foo,two,4


In [120]:
# Resetting the index is the inverse of set_index: the index levels become ordinary columns again
frame2 = frame.reset_index() #For a MultiIndex you can pass level=... to reset only some of the levels
frame2

Unnamed: 0,c,a,b,d
0,z,bar,one,1
1,y,bar,two,2
2,x,foo,one,3
3,w,foo,two,4
