<a href="https://colab.research.google.com/github/jackqk/pandas-note/blob/master/DataFrame_CRUD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **DataFrame CRUD**

# **一、Creation**

## Row 创建

In [0]:
# list+dict
import pandas as pd
df = pd.DataFrame([{'growth':.5, 'Name':'Paul'}, {'growth':.7, 'Name':'George'}, {'growth':1.2, 'Name':'Ringo'}])
df

Unnamed: 0,growth,Name
0,0.5,Paul
1,0.7,George
2,1.2,Ringo


In [0]:
# list+tuple+index
df = pd.DataFrame([(5, 'Paul'), (7, 'George'), (1.2, 'Ringo')], columns=['growth', 'Name'])
df

Unnamed: 0,growth,Name
0,5.0,Paul
1,7.0,George
2,1.2,Ringo


In [0]:
s1 = pd.Series({'growth':.5, 'Name':'Paul'})
s2 = pd.Series({'growth':.7, 'Name':'George'})
s3 = pd.Series({'growth':1.2, 'Name':'Ringo'})
df = pd.DataFrame([s1, s2, s3])
df

Unnamed: 0,growth,Name
0,0.5,Paul
1,0.7,George
2,1.2,Ringo


## Columns 创建

In [0]:
# dict+list
import pandas as pd
df = pd.DataFrame({'growth':[.5, .7, 1.2], 'Name':['Paul', 'George', 'Ringo']})
df

Unnamed: 0,growth,Name
0,0.5,Paul
1,0.7,George
2,1.2,Ringo


## Numpy 创建

In [0]:
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c'])
df

Unnamed: 0,a,b,c
0,-0.495711,0.843352,-0.124422
1,0.680442,-1.068099,0.058544
2,0.898556,0.885288,-0.003347
3,-0.63726,-0.645002,0.005564
4,-2.139233,0.794628,-1.214997
5,-0.313396,0.128956,-0.105759
6,0.590068,0.173256,-1.797408
7,0.677359,1.169891,0.928835
8,-0.056904,-0.170867,2.05672
9,0.378893,0.445405,-1.657161


## CSV创建

In [0]:
import pandas as pd
url = 'https://raw.githubusercontent.com/irJERAD/Intro-to-Data-Science-in-Python/master/MyNotebooks/olympics.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,,№ Summer,01 !,02 !,03 !,Total,№ Winter,01 !,02 !,03 !,Total,№ Games,01 !,02 !,03 !,Combined total
1,Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
2,Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
3,Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
4,Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12


In [0]:
#针对上面index和column name的值做一下改进
df = pd.read_csv(url, index_col=0, skiprows=1)
for col in df.columns:
  if col[:2] == '01':
    df.rename(columns={col:'Gold'+col[4:]}, inplace=True)
  if col[:2] == '02':
    df.rename(columns={col : 'Silver'+col[4:]}, inplace=True)
  if col[:2] == '03':
    df.rename(columns={col : 'Bronze' + col[4:]}, inplace=True)
  if col[:1] == '№':
    df.rename(columns={col: '#' + col[2:]}, inplace=True)
df.head()

Unnamed: 0,#Summer,Gold,Silver,Bronze,Total,#Winter,Gold.1,Silver.1,Bronze.1,Total.1,#Games,Gold.2,Silver.2,Bronze.2,Combined total
Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12
Australasia (ANZ) [ANZ],2,3,4,5,12,0,0,0,0,0,2,3,4,5,12


# **二、Insert**

## 插入行

In [0]:
import pandas as pd
url = 'https://raw.githubusercontent.com/jackqk/pandas-note/master/data/sample1.csv'
df = pd.read_csv(url)
df

Unnamed: 0,LOCATION,MILES,ELEVATION,CUMUL,%CUMUL GAIN
0,Big Mountain Pass AidStation,39.07,7432,11579.0,43.8%
1,Mules Ear Meadow,40.75,7478,12008.0,45.4%
2,Bald Mountain,42.46,7869,12593.0,47.6%
3,Pence Point,43.99,7521,12813.0,48.4%
4,Alexander Ridge Aid Station,46.9,6160,13169.0,49.8%
5,Alexander Springs,47.97,5956,13319.0,50.3%
6,Rogers Trail junction,49.52,6698,13967.0,52.8%
7,Rogers Saddle,49.77,6790,14073.0,53.2%
8,Railroad Bed,50.15,6520,,
9,Lambs Canyon Underpass Aid Station,52.48,6111,14329.0,54.2%


In [0]:
df2 = pd.DataFrame([('Lambs Trail', 54.14, 6628, 14805, '56.0%')], columns=['LOCATION', 'MILES', 'ELEVATION', 'CUMUL', '% CUMUL GAIN'])
df2

Unnamed: 0,LOCATION,MILES,ELEVATION,CUMUL,% CUMUL GAIN
0,Lambs Trail,54.14,6628,14805,56.0%


In [0]:
# concat
# 原始数据不会修改
# ignore_index=False，看最后数字索引
pd.concat([df, df2], ignore_index=True, sort=False)

Unnamed: 0,LOCATION,MILES,ELEVATION,CUMUL,%CUMUL GAIN,% CUMUL GAIN
0,Big Mountain Pass AidStation,39.07,7432,11579.0,43.8%,
1,Mules Ear Meadow,40.75,7478,12008.0,45.4%,
2,Bald Mountain,42.46,7869,12593.0,47.6%,
3,Pence Point,43.99,7521,12813.0,48.4%,
4,Alexander Ridge Aid Station,46.9,6160,13169.0,49.8%,
5,Alexander Springs,47.97,5956,13319.0,50.3%,
6,Rogers Trail junction,49.52,6698,13967.0,52.8%,
7,Rogers Saddle,49.77,6790,14073.0,53.2%,
8,Railroad Bed,50.15,6520,,,
9,Lambs Canyon Underpass Aid Station,52.48,6111,14329.0,54.2%,


In [0]:
#用法与上面的差不多
df.append(df2, sort=False, ignore_index=True)

Unnamed: 0,LOCATION,MILES,ELEVATION,CUMUL,%CUMUL GAIN,% CUMUL GAIN
0,Big Mountain Pass AidStation,39.07,7432,11579.0,43.8%,
1,Mules Ear Meadow,40.75,7478,12008.0,45.4%,
2,Bald Mountain,42.46,7869,12593.0,47.6%,
3,Pence Point,43.99,7521,12813.0,48.4%,
4,Alexander Ridge Aid Station,46.9,6160,13169.0,49.8%,
5,Alexander Springs,47.97,5956,13319.0,50.3%,
6,Rogers Trail junction,49.52,6698,13967.0,52.8%,
7,Rogers Saddle,49.77,6790,14073.0,53.2%,
8,Railroad Bed,50.15,6520,,,
9,Lambs Canyon Underpass Aid Station,52.48,6111,14329.0,54.2%,


## 插入列

In [0]:
# 直接加
df['bogus'] = pd.Series(range(11))
df

Unnamed: 0,LOCATION,MILES,ELEVATION,CUMUL,%CUMUL GAIN,bogus
0,Big Mountain Pass AidStation,39.07,7432,11579.0,43.8%,0
1,Mules Ear Meadow,40.75,7478,12008.0,45.4%,1
2,Bald Mountain,42.46,7869,12593.0,47.6%,2
3,Pence Point,43.99,7521,12813.0,48.4%,3
4,Alexander Ridge Aid Station,46.9,6160,13169.0,49.8%,4
5,Alexander Springs,47.97,5956,13319.0,50.3%,5
6,Rogers Trail junction,49.52,6698,13967.0,52.8%,6
7,Rogers Saddle,49.77,6790,14073.0,53.2%,7
8,Railroad Bed,50.15,6520,,,8
9,Lambs Canyon Underpass Aid Station,52.48,6111,14329.0,54.2%,9


In [0]:
# 以函数形式加
def aid_station(val):
  return 'Station' in val

df['STATION']=df['LOCATION'].apply(aid_station)
df

Unnamed: 0,LOCATION,MILES,ELEVATION,CUMUL,%CUMUL GAIN,bogus,STATION
0,Big Mountain Pass AidStation,39.07,7432,11579.0,43.8%,0,True
1,Mules Ear Meadow,40.75,7478,12008.0,45.4%,1,False
2,Bald Mountain,42.46,7869,12593.0,47.6%,2,False
3,Pence Point,43.99,7521,12813.0,48.4%,3,False
4,Alexander Ridge Aid Station,46.9,6160,13169.0,49.8%,4,True
5,Alexander Springs,47.97,5956,13319.0,50.3%,5,False
6,Rogers Trail junction,49.52,6698,13967.0,52.8%,6,False
7,Rogers Saddle,49.77,6790,14073.0,53.2%,7,False
8,Railroad Bed,50.15,6520,,,8,False
9,Lambs Canyon Underpass Aid Station,52.48,6111,14329.0,54.2%,9,True


# **三、update**

# **四、 Delete**

In [0]:
import pandas as pd
url = 'https://raw.githubusercontent.com/jackqk/pandas-note/master/data/sample1.csv'
df = pd.read_csv(url)
df

Unnamed: 0,LOCATION,MILES,ELEVATION,CUMUL,%CUMUL GAIN
0,Big Mountain Pass AidStation,39.07,7432,11579.0,43.8%
1,Mules Ear Meadow,40.75,7478,12008.0,45.4%
2,Bald Mountain,42.46,7869,12593.0,47.6%
3,Pence Point,43.99,7521,12813.0,48.4%
4,Alexander Ridge Aid Station,46.9,6160,13169.0,49.8%
5,Alexander Springs,47.97,5956,13319.0,50.3%
6,Rogers Trail junction,49.52,6698,13967.0,52.8%
7,Rogers Saddle,49.77,6790,14073.0,53.2%
8,Railroad Bed,50.15,6520,,
9,Lambs Canyon Underpass Aid Station,52.48,6111,14329.0,54.2%


## delete rows（删除行）

In [0]:
# drop传入index
df.drop([5,9])

Unnamed: 0,LOCATION,MILES,ELEVATION,CUMUL,%CUMUL GAIN
0,Big Mountain Pass AidStation,39.07,7432,11579.0,43.8%
1,Mules Ear Meadow,40.75,7478,12008.0,45.4%
2,Bald Mountain,42.46,7869,12593.0,47.6%
3,Pence Point,43.99,7521,12813.0,48.4%
4,Alexander Ridge Aid Station,46.9,6160,13169.0,49.8%
6,Rogers Trail junction,49.52,6698,13967.0,52.8%
7,Rogers Saddle,49.77,6790,14073.0,53.2%
8,Railroad Bed,50.15,6520,,


In [0]:
# print(df.index)
# df.drop(df.index[0:4:2])
df.drop(df.index[0:10:2])

Unnamed: 0,LOCATION,MILES,ELEVATION,CUMUL,%CUMUL GAIN
1,Mules Ear Meadow,40.75,7478,12008.0,45.4%
3,Pence Point,43.99,7521,12813.0,48.4%
5,Alexander Springs,47.97,5956,13319.0,50.3%
7,Rogers Saddle,49.77,6790,14073.0,53.2%
9,Lambs Canyon Underpass Aid Station,52.48,6111,14329.0,54.2%


## delete column（删除列）

In [0]:
# 默认inplase=False
display(df.drop('MILES', axis=1))
print
df  #MILE在原始数据不变

Unnamed: 0,LOCATION,ELEVATION,CUMUL,%CUMUL GAIN
0,Big Mountain Pass AidStation,7432,11579.0,43.8%
1,Mules Ear Meadow,7478,12008.0,45.4%
2,Bald Mountain,7869,12593.0,47.6%
3,Pence Point,7521,12813.0,48.4%
4,Alexander Ridge Aid Station,6160,13169.0,49.8%
5,Alexander Springs,5956,13319.0,50.3%
6,Rogers Trail junction,6698,13967.0,52.8%
7,Rogers Saddle,6790,14073.0,53.2%
8,Railroad Bed,6520,,
9,Lambs Canyon Underpass Aid Station,6111,14329.0,54.2%


Unnamed: 0,LOCATION,MILES,ELEVATION,CUMUL,%CUMUL GAIN
0,Big Mountain Pass AidStation,39.07,7432,11579.0,43.8%
1,Mules Ear Meadow,40.75,7478,12008.0,45.4%
2,Bald Mountain,42.46,7869,12593.0,47.6%
3,Pence Point,43.99,7521,12813.0,48.4%
4,Alexander Ridge Aid Station,46.9,6160,13169.0,49.8%
5,Alexander Springs,47.97,5956,13319.0,50.3%
6,Rogers Trail junction,49.52,6698,13967.0,52.8%
7,Rogers Saddle,49.77,6790,14073.0,53.2%
8,Railroad Bed,50.15,6520,,
9,Lambs Canyon Underpass Aid Station,52.48,6111,14329.0,54.2%


In [0]:
# 原始数据直接干掉了
df.pop('LOCATION')
df

Unnamed: 0,MILES,ELEVATION,CUMUL,%CUMUL GAIN
0,39.07,7432,11579.0,43.8%
1,40.75,7478,12008.0,45.4%
2,42.46,7869,12593.0,47.6%
3,43.99,7521,12813.0,48.4%
4,46.9,6160,13169.0,49.8%
5,47.97,5956,13319.0,50.3%
6,49.52,6698,13967.0,52.8%
7,49.77,6790,14073.0,53.2%
8,50.15,6520,,
9,52.48,6111,14329.0,54.2%


In [0]:
# 原始数据直接干掉了
del df['CUMUL']
df

Unnamed: 0,MILES,ELEVATION,%CUMUL GAIN
0,39.07,7432,43.8%
1,40.75,7478,45.4%
2,42.46,7869,47.6%
3,43.99,7521,48.4%
4,46.9,6160,49.8%
5,47.97,5956,50.3%
6,49.52,6698,52.8%
7,49.77,6790,53.2%
8,50.15,6520,
9,52.48,6111,54.2%


# **Dealing With Missing Data**

In [1]:
import pandas as pd
import numpy as np
df = pd.DataFrame({'Name':['Fred', 'Sally', 'George', 'Fido'],
           'Age':[22, 29, 24, None],
           'Color':['Red', 'Blue', np.nan, 'Black']})
df

Unnamed: 0,Name,Age,Color
0,Fred,22.0,Red
1,Sally,29.0,Blue
2,George,24.0,
3,Fido,,Black


## Find Valid Data

In [6]:
valid = df.notnull()
#寻找某列不为空的所有值
df[valid['Age']]

Unnamed: 0,Name,Age,Color
0,Fred,22.0,Red
1,Sally,29.0,Blue
2,George,24.0,


In [7]:
df[valid['Color']]

Unnamed: 0,Name,Age,Color
0,Fred,22.0,Red
1,Sally,29.0,Blue
3,Fido,,Black


In [8]:
#这种情况与dropna结果一样。
df[valid['Age'] & valid['Color']]

Unnamed: 0,Name,Age,Color
0,Fred,22.0,Red
1,Sally,29.0,Blue


## Finding Missing Data

In [2]:
df.isnull()

Unnamed: 0,Name,Age,Color
0,False,False,False
1,False,False,False
2,False,False,True
3,False,True,False


In [3]:
# is there any column null.
# if answer is True，说明哪一列上有Missing Data
df.isnull().any()

Name     False
Age       True
Color     True
dtype: bool

## Dropping Missing Data

In [4]:
# drop row
df.dropna()

Unnamed: 0,Name,Age,Color
0,Fred,22.0,Red
1,Sally,29.0,Blue


## Inserting Data for Missing Data

In [9]:
df.fillna('missing')

Unnamed: 0,Name,Age,Color
0,Fred,22,Red
1,Sally,29,Blue
2,George,24,missing
3,Fido,missing,Black


In [10]:
df.fillna({'Age':df['Age'].median(),
      'Color':'Pink'})

Unnamed: 0,Name,Age,Color
0,Fred,22.0,Red
1,Sally,29.0,Blue
2,George,24.0,Pink
3,Fido,24.0,Black


In [16]:
# 跟上一行一样, axis = 0
# 跟上一列一样， axis = 1
display( df.fillna(method='ffill') )
print()
display( df.fillna(method='ffill', axis=1) )

Unnamed: 0,Name,Age,Color
0,Fred,22.0,Red
1,Sally,29.0,Blue
2,George,24.0,Blue
3,Fido,24.0,Black





Unnamed: 0,Name,Age,Color
0,Fred,22,Red
1,Sally,29,Blue
2,George,24,24
3,Fido,Fido,Black


In [19]:
# 跟下一行一样， axis = 0
# 跟下一列一样， axis = 1
display( df.fillna(method='bfill') )
print()
display( df.fillna(method='bfill', axis = 1) )

Unnamed: 0,Name,Age,Color
0,Fred,22.0,Red
1,Sally,29.0,Blue
2,George,24.0,Black
3,Fido,,Black





Unnamed: 0,Name,Age,Color
0,Fred,22,Red
1,Sally,29,Blue
2,George,24,
3,Fido,Black,Black


In [21]:
# 这玩意不是很懂
df.interpolate()

Unnamed: 0,Name,Age,Color
0,Fred,22.0,Red
1,Sally,29.0,Blue
2,George,24.0,
3,Fido,24.0,Black


In [22]:
# 替换，我会直接用fillna，可以指定规则
df.replace(np.nan, value=-1)

Unnamed: 0,Name,Age,Color
0,Fred,22.0,Red
1,Sally,29.0,Blue
2,George,24.0,-1
3,Fido,-1.0,Black


In [34]:
df.fillna({'Age':-100,
      'Color':'Pink'})

Unnamed: 0,Name,Age,Color
0,Fred,22.0,Red
1,Sally,29.0,Blue
2,George,24.0,Pink
3,Fido,-100.0,Black
