In [3]:
# Show the kernel's current working directory (the path printed is specific to the machine it ran on).
%pwd

'/home/vivek/Documents/Projects/pandas'

In [2]:
import pandas as pd
import numpy as np

# Basic Indexing Techniques

In [5]:
# Demo frame: 10 rows x 3 columns of standard-normal samples,
# plus an 'alphabets' column holding one letter (a..j) per row.
df = pd.DataFrame(
    data=np.random.randn(10, 3),
    columns=['first', 'second', 'third'],
).assign(alphabets=list('abcdefghij'))

In [6]:
# When no explicit index is set, the row labels are the default integers 0..n-1,
# so looking rows up by label and by position happens to give the same rows.

# Select a single row by its label with .loc.
# (The old .ix accessor was deprecated in pandas 0.20 and removed in pandas 1.0,
#  where `df.ix[5]` raises AttributeError, so .loc is used instead.)
df.loc[5]

# .loc also accepts a list of labels; rows come back in the order requested
df.loc[[5, 3, 7]]

# .iloc selects by integer position and understands Python slices (start:stop:step);
# here: every second row among the first ten
df.iloc[0:10:2]

# These accessors feel like indexing a Python list, but they are not interchangeable.
# The difference becomes visible as soon as the index is not the default 0..n-1,
# for example when a str column is used as the index:
#
## - .loc  only takes labels, i.e. values that exist in the index
## - .iloc only takes integer positions, i.e. 0 to n-1
## - .ix   (removed) accepted either one, which made it ambiguous on integer indexes
#
# To simplify: use .loc whenever you index by label (e.g. a str index),
#  and .iloc whenever you index by position, i.e. 0 to n-1

Unnamed: 0,first,second,third,alphabets
0,-0.094436,2.056755,-2.168884,a
2,0.910477,-0.592088,-0.674787,c
4,1.172892,-1.142578,-0.456884,e
6,0.722584,-0.227838,-0.881484,g
8,-0.53818,-0.833391,-0.895439,i


In [7]:
# Move the default integer index into a regular 'index' column, then use the
# float values of 'first' as the row labels.
# The calls are chained and the result is rebound to `df` rather than mutating
# the frame with inplace=True.
df = df.reset_index().set_index('first')

In [8]:
# Positional lookup: .iloc[4] returns the fifth row regardless of the index labels
# (the index now holds the float values of 'first', shown as the Series name in the output).
df.iloc[4]

index               4
second       -1.14258
third       -0.456884
alphabets           e
Name: 1.17289158796, dtype: object

In [9]:
# Turn the current index back into a regular column, then label the rows with
# the letters stored in 'alphabets' (a str index).
# The calls are chained and the result is rebound to `df` rather than mutating
# the frame with inplace=True.
df = df.reset_index().set_index('alphabets')

In [10]:
# Select several rows at once by passing a list of index labels to .loc
# (the index is the 'alphabets' column here, so the labels are letters)
df.loc[['a', 'j']]

Unnamed: 0_level_0,first,index,second,third
alphabets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,-0.094436,0,2.056755,-2.168884
j,2.837803,9,1.407676,0.844965


# Date Indexing

In [11]:
# Shape of the demo frame: one row per date, one column per letter.
columns = list('ABCD')
rows = 10

# `rows` consecutive dates starting at 2016-01-01, used as the row index
# of a frame filled with standard-normal samples.
dates = pd.date_range(start='1/1/2016', periods=rows)
df_dates = pd.DataFrame(
    data=np.random.randn(rows, len(columns)),
    index=dates,
    columns=columns,
)

In [12]:
# Display the date-indexed frame: one row per day in the range built above
df_dates

Unnamed: 0,A,B,C,D
2016-01-01,-0.001779,0.474214,-0.107257,-0.183005
2016-01-02,-1.407387,-0.359448,-0.93042,0.204374
2016-01-03,0.646236,0.904199,-0.40064,0.119176
2016-01-04,0.677593,1.008918,1.047107,-1.951302
2016-01-05,0.266513,1.575322,-0.58087,-0.912605
2016-01-06,-1.709773,1.997328,-0.516335,-0.372171
2016-01-07,-0.030324,-1.123634,0.818244,-0.087174
2016-01-08,-0.094829,1.557835,-1.161468,-0.554727
2016-01-09,0.272999,0.965043,0.92641,-1.397614
2016-01-10,-0.440957,0.612414,-1.514343,-0.937468


In [13]:
# Inspect the index class: a frame indexed by a date_range gets a DatetimeIndex
type(df_dates.index)

pandas.tseries.index.DatetimeIndex

# Multi-indexing Techniques

In [14]:
# Two parallel lists: position i of each list gives the (outer, inner) label of row i.
arrays = [
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
]

# Pair the lists element-wise into (outer, inner) tuples.
# Both lists should have the same length; zip stops at the shorter one.
tuples = list(zip(arrays[0], arrays[1]))
# Build the two-level index from the tuples and name its levels
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
# A Series with one random value per index entry
pd.Series(np.random.randn(len(tuples)), index=index)

first  second
bar    one       0.785469
       two      -0.126358
baz    one      -0.360774
       two       1.163495
foo    one      -0.899611
       two      -1.712077
qux    one      -0.674545
       two       1.030811
dtype: float64

In [15]:
# To hold several columns of data, ask np.random.randn for a 2-D array (rows x columns)
##  and build a DataFrame instead of a Series.
# Passing the list of lists as `index` gives the rows a two-level index;
# the order of the sub-lists inside `arrays` decides the order of the index levels.
pd.DataFrame(data=np.random.randn(len(arrays[0]), 4), index=arrays)

Unnamed: 0,Unnamed: 1,0,1,2,3
bar,one,1.090998,0.440035,0.210136,0.237818
bar,two,0.531247,-0.120463,1.010341,0.052884
baz,one,0.086243,-0.433254,0.390547,-0.534293
baz,two,-0.617314,1.147554,-1.071946,-0.852245
foo,one,-0.172432,1.378473,-1.836638,0.17081
foo,two,-2.050112,-0.711727,0.552812,2.05844
qux,one,0.954183,1.728802,0.81122,0.22397
qux,two,0.139435,0.825081,-0.260215,1.581042


In [16]:
# Same two-level labels, this time on the columns: 3 rows (A, B, C) x 8 labelled columns
complex_df = pd.DataFrame(
    data=np.random.randn(3, 8),
    index=list('ABC'),
    columns=arrays,
)
complex_df

Unnamed: 0_level_0,bar,bar,baz,baz,foo,foo,qux,qux
Unnamed: 0_level_1,one,two,one,two,one,two,one,two
A,-0.461102,1.309161,-1.562988,-0.877002,1.637179,-1.138206,0.394045,0.238285
B,0.727188,-0.234886,-0.487778,1.665635,1.25826,-0.630219,-0.985442,0.832004
C,1.961256,-0.854996,-1.225799,-0.689505,-1.187653,-0.641681,0.012988,0.210172


In [17]:
# List the unique labels of each level of the column MultiIndex
complex_df.columns.levels

FrozenList([['bar', 'baz', 'foo', 'qux'], ['one', 'two']])

In [18]:
# Display the (outer, inner) label pairs that were zipped from `arrays`
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [19]:
# Use the list of tuples directly as the row index:
# one row per tuple, one column per element inside a tuple.
df_using_tuples = pd.DataFrame(
    data=np.random.randn(len(tuples), len(tuples[0])),
    index=tuples,
)
df_using_tuples

Unnamed: 0,0,1
"(bar, one)",0.630339,0.258295
"(bar, two)",0.199821,-1.614464
"(baz, one)",-0.816744,-1.279449
"(baz, two)",0.279139,-0.175089
"(foo, one)",-0.637521,-0.628924
"(foo, two)",0.04452,1.762567
"(qux, one)",-0.723407,-1.985872
"(qux, two)",-0.836941,-0.784032


In [20]:
# Similarly, a MultiIndex can be built from the cartesian product of several lists;
# unlike the tuple approach, the lists do not need to have the same length.
iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']]
prod = pd.MultiIndex.from_product(iterables, names=['first', 'second'])

# The product has len(iterables[0]) * len(iterables[1]) entries; draw one value for each
pd.Series(np.random.randn(len(prod)), index=prod)

first  second
bar    one       0.285908
       two      -0.064803
baz    one      -0.321029
       two       0.481149
foo    one       0.634045
       two      -0.452175
qux    one      -0.392703
       two      -0.609260
dtype: float64

In [21]:
# Rebuild the two-level index from `tuples` and display its repr
# (the unique labels per level plus, for each entry, the integer position of its label in each level)
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

In [22]:
# A Series with one standard-normal value per entry of the two-level index
s = pd.Series(data=np.random.randn(len(index)), index=index)
s

first  second
bar    one      -0.051610
       two       0.180586
baz    one      -1.005825
       two      -0.413576
foo    one       0.875953
       two       1.110857
qux    one      -0.450947
       two       0.592531
dtype: float64

## You can use the .loc indexer with a callable to select rows and columns conditionally

In [8]:
# 6 x 4 frame of uniform samples in [0, 1): rows labelled a..f, columns labelled A..D
df1 = pd.DataFrame(
    data=np.random.rand(6, 4),
    index=list('abcdef'),
    columns=list('ABCD'),
)

In [9]:
# Display the full frame so the filtered results below can be compared against it
df1

Unnamed: 0,A,B,C,D
a,0.846315,0.919258,0.772414,0.905809
b,0.510285,0.281321,0.292912,0.445649
c,0.345967,0.664791,0.093477,0.999455
d,0.962213,0.993047,0.60314,0.692149
e,0.840637,0.715874,0.443643,0.724201
f,0.57674,0.266072,0.696912,0.031046


In [13]:
## Conditional selection with a callable
# This is the pandas equivalent of
##    SELECT df1.A, df1.B FROM df1 WHERE df1.A > 0.7
# The callable is handed the frame being indexed and returns the boolean row mask;
# the list names the columns to keep.
df1.loc[lambda frame: frame['A'] > 0.7, ['A', 'B']]

Unnamed: 0,A,B
a,0.846315,0.919258
d,0.962213,0.993047
e,0.840637,0.715874


In [17]:
# With only a row selector, every column is kept:
##    SELECT * FROM df1 WHERE df1.A <= 0.5
df1.loc[lambda frame: frame['A'] <= 0.5]

Unnamed: 0,A,B,C,D
c,0.345967,0.664791,0.093477,0.999455


In [19]:
# The column selector can be a callable too: keep every row (:) and let the
# callable return the column labels to select.
## SELECT df1.A, df1.B FROM df1
df1.loc[:, lambda _frame: ['A', 'B']]

Unnamed: 0,A,B
a,0.846315,0.919258
b,0.510285,0.281321
c,0.345967,0.664791
d,0.962213,0.993047
e,0.840637,0.715874
f,0.57674,0.266072


In [27]:
# Draw 2 rows at random; replace=False means the same row cannot be drawn twice
df1.sample(n=2, replace=False)

Unnamed: 0,A,B,C,D
a,0.846315,0.919258,0.772414,0.905809
b,0.510285,0.281321,0.292912,0.445649


In [28]:
# Display df1 again: it still has all six rows, so .sample returned a new frame and left df1 untouched
df1

Unnamed: 0,A,B,C,D
a,0.846315,0.919258,0.772414,0.905809
b,0.510285,0.281321,0.292912,0.445649
c,0.345967,0.664791,0.093477,0.999455
d,0.962213,0.993047,0.60314,0.692149
e,0.840637,0.715874,0.443643,0.724201
f,0.57674,0.266072,0.696912,0.031046
