# DataFrames

In [2]:
import pandas as pd
import numpy as np

from numpy.random import randn

In [3]:
# Seed NumPy's global random generator so the random frames built below
# come out the same on every top-to-bottom run.
np.random.seed(100)

In [4]:
# 5x4 frame of standard-normal draws: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index='A B C D E'.split(),
    columns='W X Y Z'.split(),
)

In [5]:
# A bare expression on the last line of a cell is rendered as the cell's output.
df

Unnamed: 0,W,X,Y,Z
A,-1.749765,0.34268,1.153036,-0.252436
B,0.981321,0.514219,0.22118,-1.070043
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411
E,-0.53128,1.029733,-0.438136,-1.118318


## Selection and Indexing

In [6]:
# A single column label in brackets returns that column as a Series.
df["W"]

A   -1.749765
B    0.981321
C   -0.189496
D   -0.583595
E   -0.531280
Name: W, dtype: float64

In [7]:
# Confirm that single-column selection yields a pandas Series, not a DataFrame.
type(df["W"])

pandas.core.series.Series

In [8]:
# A list of column labels returns a DataFrame holding just those columns.
df[['W','Z']]

Unnamed: 0,W,Z
A,-1.749765,-0.252436
B,0.981321,-1.070043
C,-0.189496,0.435163
D,-0.583595,-0.104411
E,-0.53128,-1.118318


## Creating a new column:

In [9]:
# Derive column 'new' as the element-wise sum of W and Y, then show the frame.
df['new'] = df['W'].add(df['Y'])
df

Unnamed: 0,W,X,Y,Z,new
A,-1.749765,0.34268,1.153036,-0.252436,-0.59673
B,0.981321,0.514219,0.22118,-1.070043,1.2025
C,-0.189496,0.255001,-0.458027,0.435163,-0.647523
D,-0.583595,0.816847,0.672721,-0.104411,0.089126
E,-0.53128,1.029733,-0.438136,-1.118318,-0.969416


## Drop

In [10]:
# Remove the 'new' column; inplace=True modifies df itself instead of
# returning a modified copy.
df.drop(columns='new', inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,-1.749765,0.34268,1.153036,-0.252436
B,0.981321,0.514219,0.22118,-1.070043
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411
E,-0.53128,1.029733,-0.438136,-1.118318


### Drop rows

In [11]:
# Without inplace=True this returns a new frame lacking row 'E';
# df itself is not modified.
df.drop(index='E')

Unnamed: 0,W,X,Y,Z
A,-1.749765,0.34268,1.153036,-0.252436
B,0.981321,0.514219,0.22118,-1.070043
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411


In [12]:
# Row 'E' is still present: the drop() above returned a new frame and left df unchanged.
df

Unnamed: 0,W,X,Y,Z
A,-1.749765,0.34268,1.153036,-0.252436
B,0.981321,0.514219,0.22118,-1.070043
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411
E,-0.53128,1.029733,-0.438136,-1.118318


In [13]:
# This time the row is removed from df itself.
df.drop(index='E', inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,-1.749765,0.34268,1.153036,-0.252436
B,0.981321,0.514219,0.22118,-1.070043
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411


## Shape

In [14]:
# (rows, columns) tuple; four rows remain now that 'E' has been dropped.
df.shape

(4, 4)

## Select rows

In [15]:
# Label-based lookup: the row labelled 'A' comes back as a Series indexed by column name.
df.loc["A"]

W   -1.749765
X    0.342680
Y    1.153036
Z   -0.252436
Name: A, dtype: float64

In [16]:
# Position-based lookup: position 2 is the third row, which is labelled 'C'.
df.iloc[2]

W   -0.189496
X    0.255001
Y   -0.458027
Z    0.435163
Name: C, dtype: float64

In [17]:
# A row label plus a column label returns the single value at that position.
df.loc['B','Y']

0.22117966922140045

In [18]:
# Lists of row labels and column labels select the sub-frame at their intersection.
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,-1.749765,1.153036
B,0.981321,0.22118


### Conditional Selection

In [22]:
# Current frame, shown again as the reference for the comparisons below.
df

Unnamed: 0,W,X,Y,Z
A,-1.749765,0.34268,1.153036,-0.252436
B,0.981321,0.514219,0.22118,-1.070043
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411


In [19]:
# Element-wise comparison: yields a boolean DataFrame with the same shape and labels.
df > 0

Unnamed: 0,W,X,Y,Z
A,False,True,True,False
B,True,True,True,False
C,False,True,False,True
D,False,True,True,False


In [20]:
# Indexing with a boolean frame keeps the values where the mask is True
# and shows NaN everywhere else.
df[df>0]

Unnamed: 0,W,X,Y,Z
A,,0.34268,1.153036,
B,0.981321,0.514219,0.22118,
C,,0.255001,,0.435163
D,,0.816847,0.672721,


In [26]:
# Keep only the rows whose W value is negative.
res = df.loc[df['W'] < 0]

In [27]:
# Rows A, C and D are the ones with a negative W.
res

Unnamed: 0,W,X,Y,Z
A,-1.749765,0.34268,1.153036,-0.252436
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411


In [28]:
# Pick column X from the already-filtered rows.
res["X"]

A    0.342680
C    0.255001
D    0.816847
Name: X, dtype: float64

Or, without the intermediate variable:

In [29]:
# Filter the rows (W negative) and pick column X in a single .loc call
# instead of chaining two separate [] lookups.
df.loc[df['W'] < 0, 'X']

A    0.342680
C    0.255001
D    0.816847
Name: X, dtype: float64

In [30]:
# Same row filter, selecting columns Y and X (in that order) in one .loc call
# rather than chaining two [] lookups.
df.loc[df['W'] < 0, ['Y', 'X']]

Unnamed: 0,Y,X
A,1.153036,0.34268
C,-0.458027,0.255001
D,0.672721,0.816847


To filter on two conditions at once, combine the masks with `&`:

In [31]:
# Each comparison needs its own parentheses because & binds more tightly
# than < and >. Use & (not `and`) to combine boolean Series element-wise.
df[(df['W']<0) & (df['Y'] > 1)]

Unnamed: 0,W,X,Y,Z
A,-1.749765,0.34268,1.153036,-0.252436


## Index

In [32]:
# Starting point for the index examples: rows are labelled A-D.
df

Unnamed: 0,W,X,Y,Z
A,-1.749765,0.34268,1.153036,-0.252436
B,0.981321,0.514219,0.22118,-1.070043
C,-0.189496,0.255001,-0.458027,0.435163
D,-0.583595,0.816847,0.672721,-0.104411


In [33]:
# Moves the current index into a regular column named 'index' and numbers the
# rows from 0. The result is a new frame; df itself is not modified.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,-1.749765,0.34268,1.153036,-0.252436
1,B,0.981321,0.514219,0.22118,-1.070043
2,C,-0.189496,0.255001,-0.458027,0.435163
3,D,-0.583595,0.816847,0.672721,-0.104411


In [35]:
# One state code per remaining row (A-D), in row order.
newind = 'CA NY WY OR'.split()
# Assigning a list creates the column positionally, one value per row.
df['States'] = newind

In [36]:
# The frame now includes the 'States' column alongside W-Z.
df

Unnamed: 0,W,X,Y,Z,States
A,-1.749765,0.34268,1.153036,-0.252436,CA
B,0.981321,0.514219,0.22118,-1.070043,NY
C,-0.189496,0.255001,-0.458027,0.435163,WY
D,-0.583595,0.816847,0.672721,-0.104411,OR


In [37]:
# Returns a new frame that uses the 'States' column as its index;
# df is left as it was (see the next cell).
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,-1.749765,0.34268,1.153036,-0.252436
NY,0.981321,0.514219,0.22118,-1.070043
WY,-0.189496,0.255001,-0.458027,0.435163
OR,-0.583595,0.816847,0.672721,-0.104411


In [38]:
# Still indexed by A-D: the set_index() call above was not applied in place.
df

Unnamed: 0,W,X,Y,Z,States
A,-1.749765,0.34268,1.153036,-0.252436,CA
B,0.981321,0.514219,0.22118,-1.070043,NY
C,-0.189496,0.255001,-0.458027,0.435163,WY
D,-0.583595,0.816847,0.672721,-0.104411,OR


In [39]:
# Make 'States' the index of df itself; the column moves out of the data
# and becomes the row labels.
df.set_index(keys='States', inplace=True)
df

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,-1.749765,0.34268,1.153036,-0.252436
NY,0.981321,0.514219,0.22118,-1.070043
WY,-0.189496,0.255001,-0.458027,0.435163
OR,-0.583595,0.816847,0.672721,-0.104411


## Multi-Index

In [40]:
# Outer level: group label for each row; inner level: number within the group.
outside = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']
inside = [1, 2, 3, 1, 2, 3]
# Pair the two levels row by row: ('G1', 1), ('G1', 2), ...
index_tuples = list(zip(outside, inside))
# Build the two-level index under its own name, so hier_index is only ever
# a MultiIndex and never the intermediate list of tuples.
hier_index = pd.MultiIndex.from_tuples(index_tuples)

In [41]:
# Inspect the two-level index: one (group, number) tuple per row.
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [42]:
# Rebind df to a new 6x2 frame of standard-normal draws, with one row per
# (group, number) pair in hier_index and columns A and B.
df = pd.DataFrame(
    data=np.random.randn(6, 2),
    index=hier_index,
    columns=['A', 'B'],
)
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,1.618982,1.541605
G1,2,-0.251879,-0.842436
G1,3,0.184519,0.937082
G2,1,0.731,1.361556
G2,2,-0.326238,0.055676
G2,3,0.2224,-1.443217


In [43]:
# Selecting an outer-level label returns the rows under it, indexed by the inner level.
df.loc['G1']

Unnamed: 0,A,B
1,1.618982,1.541605
2,-0.251879,-0.842436
3,0.184519,0.937082


In [45]:
# Drill down one level at a time: outer label 'G1', then inner label 1.
# The result is that row's values as a Series indexed by column name.
df.loc['G1'].loc[1]

A    1.618982
B    1.541605
Name: 1, dtype: float64

In [48]:
# One .loc call with a (outer, inner) tuple for the row and a column label
# returns the single value directly, instead of three chained lookups.
df.loc[('G1', 1), 'B']

1.5416051745134067

In [44]:
# The index levels have no names yet, hence [None, None].
df.index.names

FrozenList([None, None])

In [46]:
# Name the outer and inner index levels so they can be referred to by name
# (for example in xs(..., level='Num')).
df.index.names = ['Group','Num']

In [47]:
# The level names Group and Num now appear in the index header.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,1.618982,1.541605
G1,2,-0.251879,-0.842436
G1,3,0.184519,0.937082
G2,1,0.731,1.361556
G2,2,-0.326238,0.055676
G2,3,0.2224,-1.443217


In [49]:
# Cross-section on the outer level: the same rows that df.loc['G1'] returns.
df.xs('G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.618982,1.541605
2,-0.251879,-0.842436
3,0.184519,0.937082


In [51]:
# A tuple gives one label per level and returns that single row as a Series.
df.xs(('G1',1))

A    1.618982
B    1.541605
Name: (G1, 1), dtype: float64

In [52]:
# Cross-section on the inner level 'Num': every group's row where Num is 1.
# The result is indexed by Group only.
df.xs(1,level='Num')

Unnamed: 0_level_0,A,B
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,1.618982,1.541605
G2,0.731,1.361556
