## Jacaranda Python 入门课程 - 第五节：金融数据处理工具 Pandas入门
                                                                - By 周洪波

### Question 1: Why use Pandas?

## 1. Simple Data Structure

* Element
* 1D-List
* 2D-List
* Multi-Dimensional List

## 2. Installation & Import

In [67]:
pip install pandas




In [68]:
import pandas as pd

## 3. Create

* Series
* DataFrame
* Index

### Series

pandas.Series(data=None, index=None, name=None)
* data: array-like, Iterable, dict, or scalar value
* index: array-like or Index object
* name: str

In [69]:
# Three ways to feed a Series: a list of strings, a list of ints, and a dict
# (whose keys become the index).
ls_1 = ['a', 'b', 'c', 'd', 'e']
ls_2 = [1, 2, 3, 4, 5]
dic_1 = dict(zip(ls_1, ls_2))

s1, s2, s3 = (pd.Series(source) for source in (ls_1, ls_2, dic_1))

for series in (s1, s2, s3):
    print(series)

0    a
1    b
2    c
3    d
4    e
dtype: object
0    1
1    2
2    3
3    4
4    5
dtype: int64
a    1
b    2
c    3
d    4
e    5
dtype: int64


### Question 2: 
### Why does the first column have different values? 

### DataFrame
pandas.DataFrame(data=None, index=None, columns=None)
* data: ndarray, Iterable, dict or DataFrame
* index: array-like or Index object
* columns: array-like or Index object

In [70]:
# Wrapping a flat list yields a one-column frame whose column is labelled 0.
df1, df2 = pd.DataFrame(ls_1), pd.DataFrame(ls_2)

# Kept for discussion: building a frame from a dict of scalars needs an
# explicit index (second variant).
# df3 = pd.DataFrame(data = dic_1)
# df3 = pd.DataFrame(data = dic_1, index = [0])

for frame in (df1, df2):
    print(frame)
    print("\n")
# print(df3)


   0
0  a
1  b
2  c
3  d
4  e


   0
0  1
1  2
2  3
3  4
4  5




In [71]:
# A dict of lists maps each key to a column; one-element lists therefore
# give a single-row frame.
dic_2 = {label: [number] for number, label in enumerate('abcde', start=1)}
df4 = pd.DataFrame(dic_2)

print(df4)

   a  b  c  d  e
0  1  2  3  4  5


In [72]:
# Each Series becomes one row; the Series index values (0..4) become the
# column labels.
df5 = pd.DataFrame(data = [s1, s2])
print(df5)

   0  1  2  3  4
0  a  b  c  d  e
1  1  2  3  4  5


In [73]:
# Swap rows and columns so that each original Series becomes a column.
df6 = df5.T
print(df6)

   0  1
0  a  1
1  b  2
2  c  3
3  d  4
4  e  5


### Index
* reindex()
* reset_index()

In [74]:
# Print every Series together with its index object. s1 and s2 were built
# from lists; s3 was built from a dict.
for series in (s1, s2, s3):
    print(series)
    print(series.index)
    print("\n")

0    a
1    b
2    c
3    d
4    e
dtype: object
RangeIndex(start=0, stop=5, step=1)


0    1
1    2
2    3
3    4
4    5
dtype: int64
RangeIndex(start=0, stop=5, step=1)


a    1
b    2
c    3
d    4
e    5
dtype: int64
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')




In [75]:
# Show each frame followed by its row index. df3 is left out because it is
# never created (its constructor call is commented out earlier).
for frame in (df1, df2, df4):
    print(frame)
    print(frame.index)
    print('\n')

# The last frame is printed without the trailing blank lines.
print(df5)
print(df5.index)

   0
0  a
1  b
2  c
3  d
4  e
RangeIndex(start=0, stop=5, step=1)


   0
0  1
1  2
2  3
3  4
4  5
RangeIndex(start=0, stop=5, step=1)


   a  b  c  d  e
0  1  2  3  4  5
RangeIndex(start=0, stop=1, step=1)


   0  1  2  3  4
0  a  b  c  d  e
1  1  2  3  4  5
RangeIndex(start=0, stop=2, step=1)


In [76]:
index_1 = ['very good','good','medium','bad','very bad']
# Assigning to .index relabels the existing rows; the list supplies one
# label per row.
df1.index = index_1
print(df1)

           0
very good  a
good       b
medium     c
bad        d
very bad   e


In [77]:
# reindex() conforms the frame to the given labels: existing rows are kept
# and the label that has no row yet ('extreme bad') is added with NaN.
index_1.append('extreme bad')
print(index_1)
df1 = df1.reindex(index_1)
print(df1)

['very good', 'good', 'medium', 'bad', 'very bad', 'extreme bad']
               0
very good      a
good           b
medium         c
bad            d
very bad       e
extreme bad  NaN


In [78]:
# reset_index() returns a new frame, so this bare call has no lasting
# effect: df1 still prints with its label index below.
df1.reset_index()
# Keep the result instead: the old index becomes a regular column named
# 'index' and the rows get a fresh 0..n-1 index.
df1_new = df1.reset_index()
print(df1)
print(df1_new)

               0
very good      a
good           b
medium         c
bad            d
very bad       e
extreme bad  NaN
         index    0
0    very good    a
1         good    b
2       medium    c
3          bad    d
4     very bad    e
5  extreme bad  NaN


In [79]:
# Rename both columns at once ('index' -> 'col1', 0 -> 'col2').
df1_new.columns = ['col1','col2']
print(df1_new)

          col1 col2
0    very good    a
1         good    b
2       medium    c
3          bad    d
4     very bad    e
5  extreme bad  NaN


## 4. Delete
df.drop(axis=0, index=None, columns=None, inplace=False)
* axis: {0 or ‘index’, 1 or ‘columns’}, default 0
* index: single label or list-like
* columns: single label or list-like
* inplace: bool, default False

In [83]:
print(df1_new)

          col1 col2
0    very good    a
1         good    b
2       medium    c
3          bad    d
4     very bad    e
5  extreme bad  NaN


In [84]:
# Drop the rows labelled 1 and 2 (index[1:3]). The result is a new frame;
# df1_new itself is unchanged because inplace defaults to False.
df1_new.drop(df1_new.index[1:3])

Unnamed: 0,col1,col2
0,very good,a
3,bad,d
4,very bad,e
5,extreme bad,


In [85]:
# Drop a column by name; again df1_new itself is not modified.
df1_new.drop(columns = ['col1'])

Unnamed: 0,col2
0,a
1,b
2,c
3,d
4,e
5,


## 5. Read

### 1-D

In [87]:
df1_new.col2

0      a
1      b
2      c
3      d
4      e
5    NaN
Name: col2, dtype: object

In [88]:
df1_new[0:3]

Unnamed: 0,col1,col2
0,very good,a
1,good,b
2,medium,c


In [89]:
# iat: scalar access by integer position (row 0, column 1).
df1_new.iat[0,1]

'a'

In [90]:
# at: scalar access by label (row label 1, column 'col1').
df1_new.at[1,'col1']

'good'

### 2-D

In [91]:
df1.head()

Unnamed: 0,0
very good,a
good,b
medium,c
bad,d
very bad,e


In [92]:
df1.tail()

Unnamed: 0,0
good,b
medium,c
bad,d
very bad,e
extreme bad,


In [94]:
# loc with a single row label returns that row as a Series.
df1_new.loc[1]

col1    good
col2       b
Name: 1, dtype: object

In [95]:
# All rows, one column selected by label.
df1_new.loc[:,'col2']

0      a
1      b
2      c
3      d
4      e
5    NaN
Name: col2, dtype: object

In [96]:
# iloc slices by position and excludes the end: rows 0-3 of the column at
# position 1.
df1_new.iloc[0:4,1]

0    a
1    b
2    c
3    d
Name: col2, dtype: object

In [97]:
# Boolean mask: keep only the rows where col1 equals 'bad'.
df1_new[df1_new.col1 == 'bad']

Unnamed: 0,col1,col2
3,bad,d


## 6. Update

### Merge
* Concat
* Join
* Append
* Insert

In [None]:
print(df1)
print('\n')
print(df2)


In [None]:
# Rename df2's only column, then stack df1 and df2 row-wise.
# ignore_index=True discards both original indexes in favour of a fresh
# 0..n-1 index; a column present in only one frame is filled with NaN for
# the other frame's rows.
df2.columns = ['col3']
df1_2 = pd.concat([df1,df2],ignore_index = True)
print(df1_2)

In [None]:
# join() aligns the two frames on their index.
# NOTE(review): df1 is labelled 'very good', 'good', ... while df2 still has
# the default 0..4 index, so col3 is expected to come out all NaN - confirm
# that this mismatch is the point being demonstrated.
df1_2 = df1.join(df2)
print(df1_2)

In [None]:
df4 = df4.transpose()
print(df4)

In [None]:
# df1 and the transposed df4 both carry a column labelled 0, so
# df1.join(df4) raises "columns overlap but no suffix specified".
# Join onto df1_new instead: its columns are col1/col2, which is also what
# the later iloc[5, 1] assignment and groupby('col2') cells rely on.
df1_4 = df1_new.join(df4)
print(df1_4)

In [None]:
print(df2)
print('\n')
print(df4)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames row-wise. df2 and df4 have different column labels at this
# point, so the result keeps both columns and fills the gaps with NaN.
df2_4 = pd.concat([df2, df4], ignore_index=True)
print(df2_4)

In [None]:
# Give df4 the same column label as df2 so the rows line up in one column.
df4.columns = ['col3']
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df2_4 = pd.concat([df2, df4], ignore_index=True)
print(df2_4)

In [None]:
df4.insert(1,'1',[6,7,8,9,10])
print(df4)

## Grouping
* groupby

In [1]:
print(df1_4)

NameError: name 'df1_4' is not defined

In [None]:
df1_4.iloc[5,1] = 'b'
print(df1_4)

In [None]:
# Iterating a GroupBy yields (key, sub-frame) pairs; list() materializes
# them so the groups can be displayed.
list(df1_4.groupby('col2'))

### NaN Value
* df.dropna(axis, how, inplace)
* df.fillna(value, method, axis, inplace)

In [None]:
print(df1)

In [None]:
# axis=1 drops columns (not rows) that contain at least one NaN. The result
# is not assigned, so df1 itself is unchanged.
df1.dropna(axis = 1, how ='any')

In [None]:
print(df1)

In [None]:
# fillna(method='pad') is deprecated in recent pandas; ffill() is the direct
# equivalent (forward fill, here along axis 1, i.e. across columns).
df1.ffill(axis=1)

## TimeSeries
* pd.to_datetime()
* df.resample()

### pd.to_datetime()

In [None]:
# NOTE: this rebinds s1 (previously a Series) to a plain string.
s1 = '3/11/2000'
# Without an explicit format the string is read month-first (11 March 2000).
pd.to_datetime(s1)

In [None]:
# An explicit day/month/year format removes the ambiguity: 3 November 2000.
pd.to_datetime(s1, format = '%d/%m/%Y')

In [None]:
ls4 = ['2000/1/20','2000/1/21','2000/1/22','2000/1/23','2000/1/24','2000/1/25']


In [None]:
s4 = pd.Series(ls4)
print(s4)

In [None]:
# s4 is indexed 0..5, so merge it onto df1_new, which has the same 0..5
# index. df1 is indexed by the rating labels and shares no index values with
# s4, so an index-on-index merge against it cannot match any rows.
df1_5 = df1_new.merge(s4.rename('col3'), left_index=True, right_index=True)

In [None]:
# %m is the month directive. The previous %M means minutes, which would put
# the "1" of '2000/1/20' into the minutes field instead of the month.
df1_5['col3'] = pd.to_datetime(df1_5['col3'], format="%Y/%m/%d")
print(df1_5)

In [None]:
# .dt.date strips the time part, leaving plain datetime.date objects in the
# column.
df1_5['col3'] = df1_5['col3'].dt.date

In [None]:
print(df1_5)

### df.resample()


In [None]:
date_rng = pd.date_range("1/20/2000",freq = "D", periods = 100)

In [None]:
ts = pd.Series(range(0,100),index = date_rng)
print(ts)

In [None]:
# Downsample the daily series to quarters and average each quarter.
# NOTE(review): pandas 2.2+ prefers the 'QE' (quarter end) alias over 'Q' -
# confirm against the pandas version used for the course.
ts.resample('Q').mean()

In [None]:
# Upsample the daily series to hourly frequency; bfill() fills each new
# hourly slot with the next available daily value.
# 'h' is used because the uppercase 'H' alias is deprecated in pandas 2.2+.
ts.resample('h').bfill()