In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [9]:
###### Reindex
# reindex() returns a new object whose data is conformed to the requested index.
obj = Series(np.random.randint(100, size=4), index=list('dbac'))
print(obj, end='\n\n')

# 'e' does not exist in obj, so it is introduced as a missing value (NaN),
# which also turns the integer data into floats.
obj2 = obj.reindex(list('abcde'))
print("after reindexing", obj2, sep='\n', end='\n\n')

# fill_value supplies the value used for labels that were not already present.
obj3 = obj.reindex(list('abcde'), fill_value=0)
print("reindexing, and fill the NaN value", obj3, sep='\n')

d    27
b    34
a    56
c    40
dtype: int64

after reindexing
a    56.0
b    34.0
c    40.0
d    27.0
e     NaN
dtype: float64

reindexing, and fill the NaN value
a    56
b    34
c    40
d    27
e     0
dtype: int64


In [11]:
# Reindexing with interpolation: method='ffill' carries the last known value
# forward into the labels that the reindex introduces.
obj4 = Series(data=['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj4, end='\n\n')

obj5 = obj4.reindex(index=range(6), method='ffill')
print("after reindexing and interpolation with 'ffill' method")
print(obj5)

0      blue
2    purple
4    yellow
dtype: object

after reindexing and interpolation with 'ffill' method
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object


In [19]:
# Reindexing a DataFrame: rows, columns, or both at once.

frame = DataFrame(np.arange(9).reshape((3,3)),index=['a','c','d'],columns=['Ohio','Texas','Califonia'])
print(frame)
print()

frame2 = frame.reindex(['a','b','c','d']) # a bare sequence reindexes the rows
print(frame2)
print()

states = ['Texas','Utah','Califonia']
frame3 = frame.reindex(columns=states) # reindex the columns
print(frame3)
print()

# method='ffill' is applied to every axis being reindexed and requires the
# existing index on that axis to be monotonic. The column labels here are not
# sorted, so passing index=, columns= and method='ffill' in one call raises a
# ValueError on current pandas. Forward-fill along the rows first, then
# reindex the columns without any fill method.
frame4 = frame.reindex(index=['a','b','c','d'], method='ffill').reindex(columns=states)
print(frame4)
print()

# Both axes can be reindexed in one call. The old `.ix` indexer was removed in
# pandas 1.0, and `.loc` raises KeyError for labels that do not exist ('b',
# 'Utah'), so reindex() is the way to introduce missing labels.
frame.reindex(index=['a','b','c','d'], columns=states)

   Ohio  Texas  Califonia
a     0      1          2
c     3      4          5
d     6      7          8

   Ohio  Texas  Califonia
a   0.0    1.0        2.0
b   NaN    NaN        NaN
c   3.0    4.0        5.0
d   6.0    7.0        8.0

   Texas  Utah  Califonia
a      1   NaN          2
c      4   NaN          5
d      7   NaN          8

   Texas  Utah  Califonia
a      1   NaN          2
b      1   NaN          2
c      4   NaN          5
d      7   NaN          8



Unnamed: 0,Texas,Utah,Califonia
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [35]:
#### Dropping entries from an axis
print("For Series:  ")
obj = Series(np.arange(5.0), index=list('abcde'))
print(obj, end='\n\n')

# drop() hands back a new object without the listed labels; obj is untouched.
obj2 = obj.drop(['b', 'd'])
print("after dropping 'b' and 'd'", obj2, "-------------------------------", sep='\n')

print("With DataFrame")
frame = DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
print(frame, end='\n\n')

# Without an axis argument the labels are looked up in the row index.
frame1 = frame.drop(['Ohio', 'Utah'])
print(frame1, end='\n\n')

# Name the column axis explicitly to remove columns instead.
frame2 = frame.drop(['two', 'three'], axis='columns')
print(frame2, end='\n\n')

For Series:  
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

after dropping 'b' and 'd'
a    0.0
c    2.0
e    4.0
dtype: float64
-------------------------------
With DataFrame
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15

          one  two  three  four
Colorado    4    5      6     7
New York   12   13     14    15

          one  four
Ohio        0     3
Colorado    4     7
Utah        8    11
New York   12    15



In [59]:
#### Indexing, Selection, and Filtering

# Series indexing works analogously to NumPy array indexing, except that you can use the Series's index values
# instead of integers. This cell concentrates on DataFrame indexing.

frame = DataFrame(np.arange(16).reshape(4,4),index=['Ohio','Colorado','Utah','New York'],
                 columns=['one','two','three','four'])
print("original data:")
print(frame)
print("-----------------------")

print("get 'two' column:")
print(frame['two'])
print("-----------------------")

print("get 'two' and 'three' colum : ")
print(frame[['two','three']])
print("-----------------------")

print("select rows by slicing")
print(frame[:2])
print("select rows by boolean array")
print(frame[frame['three']>5])
print("-----------------------")

print("Indexing with a boolean DataFrame")
print( frame < 5 )
# NOTE: this assignment modifies frame in place; everything below sees the zeroed values.
frame[frame < 5] = 0
print(frame)
print("-----------------------")
print()

# The combined label/position indexer `.ix` was removed in pandas 1.0. Use `.loc` for label-based selection and
# `.iloc` for position-based selection; chain the two when rows are chosen by label and columns by position.
# .loc[row_labels, column_labels] / .iloc[row_positions, column_positions]
print(frame.loc['Colorado',['two','three']])
print()
print(frame.loc[['Colorado','Utah'],['two','three']])
print(frame.loc[['Colorado','Utah']].iloc[:,[3,0,1]]) # rows by label, columns by position
print(frame.iloc[2]) # the row at position 2, i.e. the third row ('Utah')
print(frame.loc[:'Utah','two']) # label slices include the end label
print(frame.loc[frame['three']>3].iloc[:,:3]) # boolean row mask, first three columns by position

original data:
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
-----------------------
get 'two' column:
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64
-----------------------
get 'two' and 'three' colum : 
          two  three
Ohio        1      2
Colorado    5      6
Utah        9     10
New York   13     14
-----------------------
select rows by slicing
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
select rows by boolean array
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
-----------------------
Indexing with a boolean DataFrame
            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
New York  False  False  False  False

In [65]:
#### Arithmetic and data alignment
# Arithmetic between two Series or DataFrame operands is aligned on the row index and on the columns.
# Where the labels of the two operands differ, the result is indexed by the union of both label sets.
# Label matching is exact: 'Nanning'/'Nanjing' and 'Guangzhou'/'GuangZhou' are different labels,
# so 'Beijing' is the only row the two frames share.

df1 = DataFrame(
    np.arange(9, dtype=float).reshape(3, 3),
    index=['Beijing', 'Nanning', 'Guangzhou'],
    columns=['b', 'c', 'd'],
)
df2 = DataFrame(
    np.arange(12, dtype=float).reshape(4, 3),
    index=['Beijing', 'Nanjing', 'GuangZhou', 'Tianjin'],
    columns=['b', 'd', 'e'],
)
print(df1, df2, sep='\n', end='\n\n')

# Plain `+` yields NaN wherever a cell exists in only one of the operands.
print('df1 + df2', df1 + df2, sep='\n', end='\n\n')
print('----------------------------')

# add() with fill_value substitutes 0 for a cell missing from one operand;
# cells missing from both operands remain NaN.
print("using the add method and an argument to fill_value")
print(df1.add(df2, fill_value=0))

             b    c    d
Beijing    0.0  1.0  2.0
Nanning    3.0  4.0  5.0
Guangzhou  6.0  7.0  8.0
             b     d     e
Beijing    0.0   1.0   2.0
Nanjing    3.0   4.0   5.0
GuangZhou  6.0   7.0   8.0
Tianjin    9.0  10.0  11.0

df1 + df2
             b   c    d   e
Beijing    0.0 NaN  3.0 NaN
GuangZhou  NaN NaN  NaN NaN
Guangzhou  NaN NaN  NaN NaN
Nanjing    NaN NaN  NaN NaN
Nanning    NaN NaN  NaN NaN
Tianjin    NaN NaN  NaN NaN

----------------------------
using the add method and an argument to fill_value
             b    c     d     e
Beijing    0.0  1.0   3.0   2.0
GuangZhou  6.0  NaN   7.0   8.0
Guangzhou  6.0  7.0   8.0   NaN
Nanjing    3.0  NaN   4.0   5.0
Nanning    3.0  4.0   5.0   NaN
Tianjin    9.0  NaN  10.0  11.0
