# DataFrames

In [2]:
import numpy as np
import pandas as pd

In [3]:
from numpy.random import rand

In [4]:
# Fix the seed of NumPy's global random generator so the rand() draws are repeatable.
SEED = 101
np.random.seed(SEED)

In [16]:
# Build a 5x4 frame of rand() values with rows labelled A-E and columns labelled a-d.
row_labels = 'A,B,C,D,E'.split(',')
col_labels = "a b c d".split(" ")
df = pd.DataFrame(rand(5, 4), index=row_labels, columns=col_labels)

In [17]:
# Display the frame built in the previous cell.
df

Unnamed: 0,a,b,c,d
A,0.625773,0.10085,0.710853,0.88133
B,0.69529,0.266173,0.909964,0.579618
C,0.359687,0.852634,0.246179,0.780444
D,0.04551,0.885041,0.276421,0.580924
E,0.910662,0.992591,0.062224,0.928481


## Selection and Indexing

In [18]:
# Bracket selection with a single column label returns that column as a Series.
df['a']

A    0.625773
B    0.695290
C    0.359687
D    0.045510
E    0.910662
Name: a, dtype: float64

In [27]:
# Row 'A' is also the first row, so the label-based and position-based
# lookups below print the same Series.
first_label, first_position = 'A', 0
print(df.loc[first_label])
# Equivalent lookup by integer position.
print(df.iloc[first_position])


a    0.625773
b    0.100850
c    0.710853
d    0.881330
Name: A, dtype: float64
a    0.625773
b    0.100850
c    0.710853
d    0.881330
Name: A, dtype: float64


In [28]:
# Derive a new column as the element-wise sum of columns 'a' and 'b'.
a_plus_b = df['a'] + df['b']
df['new'] = a_plus_b

In [29]:
# Display the frame; it now carries the derived 'new' column.
df

Unnamed: 0,a,b,c,d,new
A,0.625773,0.10085,0.710853,0.88133,0.726623
B,0.69529,0.266173,0.909964,0.579618,0.961463
C,0.359687,0.852634,0.246179,0.780444,1.212321
D,0.04551,0.885041,0.276421,0.580924,0.930551
E,0.910662,0.992591,0.062224,0.928481,1.903253


In [38]:
# Remove the derived 'new' column. The result is rebound to `df` instead of
# mutating with inplace=True, and columns= names the axis explicitly rather
# than relying on the positional meaning of axis=1.
df = df.drop(columns='new')

In [39]:
# Display the frame again to confirm the 'new' column is gone.
df

Unnamed: 0,a,b,c,d
A,0.625773,0.10085,0.710853,0.88133
B,0.69529,0.266173,0.909964,0.579618
C,0.359687,0.852634,0.246179,0.780444
D,0.04551,0.885041,0.276421,0.580924
E,0.910662,0.992591,0.062224,0.928481


## MultiIndex and Index Hierarchy

In [42]:
# Outer level: one group label per row. Inner level: a number within each group.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2

# Pair the two levels element-wise into (group, number) tuples.
index_tuples = [(group, number) for group, number in zip(outside, inside)]
print(index_tuples)

# Turn the tuples into a two-level MultiIndex usable as a DataFrame index.
hier_index = pd.MultiIndex.from_tuples(index_tuples)
print(hier_index)

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]
MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )


In [45]:
# Six rows of rand() values, one per (group, number) pair in hier_index;
# the five columns get the default integer labels.
df = pd.DataFrame(data=rand(6, 5), index=hier_index)

In [56]:
# Display the MultiIndex-ed frame.
# NOTE(review): the saved output shows the index levels labelled 'Group'/'Num', but those
# names are only assigned in a later cell (In [55], which was executed before this In [56]).
# On a fresh top-to-bottom run this display would show unnamed index levels.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4
Group,Num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
G1,1,0.603498,0.45442,0.313479,0.380982,0.642812
G1,2,0.806533,0.130528,0.730565,0.005438,0.650661
G1,3,0.453049,0.200645,0.631129,0.017791,0.838551
G2,1,0.525546,0.370938,0.42575,0.265095,0.446408
G2,2,0.982653,0.752054,0.520926,0.462383,0.067457
G2,3,0.62284,0.719619,0.450736,0.684777,0.732571


In [52]:
# Passing only the outer label to .loc returns the sub-frame of all inner rows under 'G1'.
df.loc['G1']

Unnamed: 0,0,1,2,3,4
1,0.603498,0.45442,0.313479,0.380982,0.642812
2,0.806533,0.130528,0.730565,0.005438,0.650661
3,0.453049,0.200645,0.631129,0.017791,0.838551


In [55]:
# Name the two index levels (outer, inner) so they are labelled in the display.
level_names = ['Group', 'Num']
df.index.names = level_names

In [58]:
# Cross-section on the outer level: returns the rows under 'G1', indexed by the inner level.
df.xs("G1")

Unnamed: 0_level_0,0,1,2,3,4
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.603498,0.45442,0.313479,0.380982,0.642812
2,0.806533,0.130528,0.730565,0.005438,0.650661
3,0.453049,0.200645,0.631129,0.017791,0.838551


In [59]:
# Cross-section through both index levels: the single row for outer label 'G1'
# and inner label 1, returned as a Series.
# The key must be a tuple: list keys for .xs were deprecated in pandas 1.x and
# raise a TypeError from pandas 2.0 onwards.
df.xs(("G1", 1))

0    0.603498
1    0.454420
2    0.313479
3    0.380982
4    0.642812
Name: (G1, 1), dtype: float64

## Handling Missing Values

In [65]:
# Small frame with deliberately missing entries: column A is complete,
# B is missing one value and C is missing two.
missing = np.nan
columns_with_gaps = {
    'A': [1, 2, 3],
    'B': [2, missing, 3],
    'C': [1, missing, missing],
}
df = pd.DataFrame(columns_with_gaps)

In [66]:
# Display the frame; the missing entries in B and C show up as NaN.
df

Unnamed: 0,A,B,C
0,1,2.0,1.0
1,2,,
2,3,3.0,


In [68]:
# Drop every row that contains at least one missing value (the defaults, spelled out).
df.dropna(axis=0, how="any")

Unnamed: 0,A,B,C
0,1,2.0,1.0


In [69]:
# Drop every column that contains at least one missing value.
df.dropna(axis="columns")


Unnamed: 0,A
0,1
1,2
2,3


In [70]:
# Keep only the rows that have at least two non-missing values.
df.dropna(axis="index", thresh=2)


Unnamed: 0,A,B,C
0,1,2.0,1.0
2,3,3.0,


In [71]:
# Replace every missing entry with a placeholder value.
# NOTE(review): "Thresh" looks like a leftover from the thresh= example above; filling
# numeric columns with a string presumably turns B and C into object columns - confirm
# this placeholder is intended.
df.fillna("Thresh")

Unnamed: 0,A,B,C
0,1,2,1
1,2,Thresh,Thresh
2,3,3,Thresh


## Groupby