In [1]:
import numpy as np
import pandas as pd

# Using Series in Pandas

In [2]:
# Sample inputs shared by the Series examples: index labels, a plain list of
# values, and the same values wrapped in a NumPy array.
labels = list('abc')
data = [10 * step for step in (1, 2, 3)]
arr = np.array(data)

In [4]:
# Pair each label with the value at the same position.
d = {label: value for label, value in zip(labels, data)}
d

{'a': 10, 'b': 20, 'c': 30}

In [9]:
# With no index supplied, the entries are numbered 0, 1, 2.
pd.Series(data)

0    10
1    20
2    30
dtype: int64

In [11]:
# Same values, but addressed by the custom labels instead of 0..2.
pd.Series(data, index=labels)

a    10
b    20
c    30
dtype: int64

In [13]:
# A NumPy array is accepted as the data argument just like a list.
pd.Series(arr, index=labels)

a    10
b    20
c    30
dtype: int32

In [16]:
# Series values need not be numeric: here the strings are the data and the
# integers serve as the index.
pd.Series(data=labels, index=data)

10    a
20    b
30    c
dtype: object

In [27]:
# First series, indexed by country code.
ser1 = pd.Series(data=[1, 2, 3, 4], index=['US', 'GH', 'NG', 'UK'])
ser1

US    1
GH    2
NG    3
UK    4
dtype: int64

In [34]:
# Second series: shares US, GH and UK with ser1 but has SA where ser1 has NG.
ser2 = pd.Series(data=[1, 2, 5, 4], index=['US', 'GH', 'SA', 'UK'])
ser2

US    1
GH    2
SA    5
UK    4
dtype: int64

In [31]:
# Addition aligns on index labels: labels found in only one series (NG, SA)
# come out as NaN, and the result is float64.
ser1.add(ser2)

GH    4.0
NG    NaN
SA    NaN
UK    8.0
US    2.0
dtype: float64

In [33]:
# Look a single value up by its index label.
ser1.loc['GH']

2

# Next Step: Using DataFrames in Pandas

In [37]:
import numpy as np
import pandas as pd

from numpy.random import randn

In [36]:
# Seed NumPy's global random generator so the randn() draws that fill the
# frames are repeatable.
np.random.seed(seed=101)

In [39]:
# 5x4 frame of randn() draws: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)

In [48]:
# Display the freshly built frame.
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [49]:
# Pull out one column; the result is a Series named after the column.
df.loc[:, 'W']

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [47]:
# Confirm that the table as a whole is a DataFrame.
type(df)

pandas.core.frame.DataFrame

In [66]:
# A single column selected with [] is a Series, not a DataFrame.
type(df['W'])

pandas.core.series.Series

In [None]:
# Columns

In [52]:
# Passing a list of column names keeps the result a DataFrame.
df.loc[:, ['X', 'Y']]

Unnamed: 0,X,Y
A,1.693723,-1.706086
B,0.390528,0.166905
C,0.07296,0.638787
D,-0.75407,-0.943406
E,1.901755,0.238127


In [55]:
# Add a derived column: the row-wise sum of W and Z.
df['new'] = df['W'].add(df['Z'])

In [57]:
# The frame now carries the extra `new` column.
df

Unnamed: 0,W,X,Y,Z,new
A,0.302665,1.693723,-1.706086,-1.159119,-0.856454
B,-0.134841,0.390528,0.166905,0.184502,0.049661
C,0.807706,0.07296,0.638787,0.329646,1.137352
D,-0.497104,-0.75407,-0.943406,0.484752,-0.012352
E,-0.116773,1.901755,0.238127,1.996652,1.879879


In [59]:
# Returns a frame without `new`; df itself is not modified because the
# result is not assigned back.
df.drop(columns='new')

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [60]:
# `new` is still present: the drop in the previous cell did not alter df.
df

Unnamed: 0,W,X,Y,Z,new
A,0.302665,1.693723,-1.706086,-1.159119,-0.856454
B,-0.134841,0.390528,0.166905,0.184502,0.049661
C,0.807706,0.07296,0.638787,0.329646,1.137352
D,-0.497104,-0.75407,-0.943406,0.484752,-0.012352
E,-0.116773,1.901755,0.238127,1.996652,1.879879


In [61]:
# Remove `new` from df itself (in-place mutation, nothing is returned).
# NOTE: re-running this cell after it has succeeded raises KeyError, because
# the column is already gone.
df.drop(columns='new', inplace=True)

In [63]:
# After the in-place drop, df is back to the W, X, Y, Z columns.
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [64]:
# (rows, columns): 5 rows by 4 columns.
df.shape

(5, 4)

In [67]:
# Rows

In [68]:
# Label-based row selection: .loc looks the row up by its index label and
# returns it as a Series.
df.loc['A', :]

W    0.302665
X    1.693723
Y   -1.706086
Z   -1.159119
Name: A, dtype: float64

In [69]:
# Position-based row selection: .iloc takes the integer position, so 0 is
# the first row (labelled 'A').
df.iloc[0, :]

W    0.302665
X    1.693723
Y   -1.706086
Z   -1.159119
Name: A, dtype: float64

In [70]:
# Several rows at once: pass a list of index labels.
df.loc[['A', 'B'], :]

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502


In [71]:
# Row labels first, column labels second: a 2x2 sub-frame.
df.loc[list('AB'), list('WY')]

Unnamed: 0,W,Y
A,0.302665,-1.706086
B,-0.134841,0.166905


In [72]:
# Fetch a single cell by row label and column label.
df.at['B', 'Y']

0.16690463609281317

## More on Pandas DataFrames

In [73]:
# Element-wise comparison: a boolean frame of the same shape.
df.gt(0)

Unnamed: 0,W,X,Y,Z
A,True,True,False,False
B,False,True,True,True
C,True,True,True,True
D,False,False,False,True
E,False,True,True,True


In [74]:
# Keep values where the mask is True; everything else becomes NaN.
df.where(df > 0)

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,,
B,,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,,,,0.484752
E,,1.901755,0.238127,1.996652


In [75]:
# A boolean Series filters rows: only rows with a positive W survive.
df.loc[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
C,0.807706,0.07296,0.638787,0.329646


In [76]:
# Rows whose Z value is negative.
df.loc[df['Z'].lt(0)]

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119


In [87]:
# Column X for the rows where W is positive, selected in one .loc call
# (row mask, column label) instead of filtering and then indexing the
# intermediate frame. The result is a Series indexed by row label.
resultdf = df.loc[df['W'] > 0, 'X']

In [81]:
# resultdf is already the X column (a Series indexed by row label), so show
# it directly; indexing it with 'X' would look for a row label 'X' that does
# not exist and raise KeyError.
resultdf

A    1.693723
C    0.072960
Name: X, dtype: float64

In [89]:
# Columns X and Y for the rows where W is positive, selected with a single
# .loc call (row mask, list of column labels) rather than chained indexing.
resultdf1 = df.loc[df['W'] > 0, ['X', 'Y']]
resultdf1

Unnamed: 0,X,Y
A,1.693723,-1.706086
C,0.07296,0.638787


In [90]:
# Querying with multiple conditions. NB: Python's plain `and` / `or` do not
# work on boolean Series; combine the parenthesised conditions with `&`.
df.loc[(df['W'] > 0) & (df['Y'] < 0)]

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119


In [92]:
# Move the current index into an `index` column and number the rows 0..n-1.
# This returns a new frame; to make it permanent: df.reset_index(inplace=True)
df.reset_index(drop=False)

Unnamed: 0,index,W,X,Y,Z
0,A,0.302665,1.693723,-1.706086,-1.159119
1,B,-0.134841,0.390528,0.166905,0.184502
2,C,0.807706,0.07296,0.638787,0.329646
3,D,-0.497104,-0.75407,-0.943406,0.484752
4,E,-0.116773,1.901755,0.238127,1.996652


In [94]:
# Five country codes, one per row of df.
newind = ['CA', 'GH', 'NG', 'UK', 'CH']
newind

['CA', 'GH', 'NG', 'UK', 'CH']

In [96]:
# Attach the codes as a regular column; the list is matched to the rows by
# position (A gets CA, B gets GH, ...).
df['Country'] = newind

In [98]:
# df now has the extra Country column alongside W, X, Y, Z.
df

Unnamed: 0,W,X,Y,Z,Country
A,0.302665,1.693723,-1.706086,-1.159119,CA
B,-0.134841,0.390528,0.166905,0.184502,GH
C,0.807706,0.07296,0.638787,0.329646,NG
D,-0.497104,-0.75407,-0.943406,0.484752,UK
E,-0.116773,1.901755,0.238127,1.996652,CH


In [100]:
# Use the Country column as the row index. This returns a new frame; to make
# it permanent: df.set_index('Country', inplace=True)
df.set_index(keys='Country')

Unnamed: 0_level_0,W,X,Y,Z
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.302665,1.693723,-1.706086,-1.159119
GH,-0.134841,0.390528,0.166905,0.184502
NG,0.807706,0.07296,0.638787,0.329646
UK,-0.497104,-0.75407,-0.943406,0.484752
CH,-0.116773,1.901755,0.238127,1.996652


In [101]:
# df still has its A-E index and the Country column: set_index in the
# previous cell did not modify it.
df

Unnamed: 0,W,X,Y,Z,Country
A,0.302665,1.693723,-1.706086,-1.159119,CA
B,-0.134841,0.390528,0.166905,0.184502,GH
C,0.807706,0.07296,0.638787,0.329646,NG
D,-0.497104,-0.75407,-0.943406,0.484752,UK
E,-0.116773,1.901755,0.238127,1.996652,CH


### Let's Kill More DataFrames in Pandas
This is part 3 of the DataFrame videos.

In [121]:
# Index levels: build a two-level (hierarchical) index from an outer group
# label and an inner number, paired position by position.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [104]:
# Inspect the MultiIndex: two levels, (G1, G2) and (1, 2, 3).
hier_index 

MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [106]:
# 6x2 frame of randn() draws whose rows carry the two-level index.
dfx = pd.DataFrame(data=randn(6, 2), index=hier_index, columns=['A', 'B'])
dfx

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.147027,-0.479448
G1,2,0.558769,1.02481
G1,3,-0.925874,1.862864
G2,1,-1.133817,0.610478
G2,2,0.38603,2.084019
G2,3,-0.376519,0.230336


In [109]:
# Selecting an outer-level label returns that group's rows, indexed by the
# inner level only.
dfx.loc['G1']

Unnamed: 0,A,B
1,0.147027,-0.479448
2,0.558769,1.02481
3,-0.925874,1.862864


In [114]:
# Chain a second .loc to pick inner label 2 within group G1; the row comes
# back as a Series.
dfx.loc['G1'].loc[2]

A    0.558769
B    1.024810
Name: 2, dtype: float64

In [115]:
# Fetch one cell in a single .loc call: the tuple gives the (outer, inner)
# index labels and 'B' the column, replacing three chained lookups.
dfx.loc[('G1', 2), 'B']

1.0248102783372157

In [112]:
# Name the two index levels (outer, inner) so they can be referred to by
# name, e.g. level='Num'.
dfx.index.names = ['Groups', 'Num']

In [113]:
# The index levels now display under their names, Groups and Num.
dfx

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.147027,-0.479448
G1,2,0.558769,1.02481
G1,3,-0.925874,1.862864
G2,1,-1.133817,0.610478
G2,2,0.38603,2.084019
G2,3,-0.376519,0.230336


In [118]:
# The cross-section method .xs(): here it returns the rows of group G1,
# indexed by the remaining Num level.
dfx.xs(key='G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.147027,-0.479448
2,0.558769,1.02481
3,-0.925874,1.862864


In [119]:
# Show the full frame again for reference before slicing on the inner level.
dfx

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.147027,-0.479448
G1,2,0.558769,1.02481
G1,3,-0.925874,1.862864
G2,1,-1.133817,0.610478
G2,2,0.38603,2.084019
G2,3,-0.376519,0.230336


In [117]:
# Cross-section on the inner level: the row with Num == 2 from every group;
# the result is indexed by Groups only.
dfx.xs(key=2, level='Num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.558769,1.02481
G2,0.38603,2.084019
