# 基本数据对象及操作

## 1. Series

* 创建Series

In [1]:
import pandas as pd

countries = ['中国', '美国', '澳大利亚']
countries_s = pd.Series(countries)
print(type(countries_s))
print(countries_s)
print(countries_s.index)
print(type(countries_s.values))
print(countries_s.values)

<class 'pandas.core.series.Series'>
0      中国
1      美国
2    澳大利亚
dtype: object
RangeIndex(start=0, stop=3, step=1)
<class 'numpy.ndarray'>
['中国' '美国' '澳大利亚']


In [2]:
countries_s

0      中国
1      美国
2    澳大利亚
dtype: object

In [3]:
countries_s.head()

0      中国
1      美国
2    澳大利亚
dtype: object

In [4]:
countries_s.index

RangeIndex(start=0, stop=3, step=1)

In [5]:
countries = ['中国', '美国', '澳大利亚']
countries_s = pd.Series(countries, index=[1,2,3])
countries_s

1      中国
2      美国
3    澳大利亚
dtype: object

In [6]:
range_numbers_s = pd.Series(range(10))
range_numbers_s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [7]:
print(range_numbers_s[range_numbers_s > 6])

7    7
8    8
9    9
dtype: int64


In [8]:
range_numbers_s.values

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64)

In [9]:
range_numbers_s.index

RangeIndex(start=0, stop=10, step=1)

In [10]:
numbers = [4, 5, 6]
numbers_s = pd.Series(numbers)
print(numbers_s)
print(numbers_s.index)
print(numbers_s.values)

0    4
1    5
2    6
dtype: int64
RangeIndex(start=0, stop=3, step=1)
[4 5 6]


In [11]:
country_dicts = {'CH': '中国',
                'US': '美国',
                'AU': '澳大利亚'}
# dict会自动把key作为索引, 和

country_dict_s = pd.Series(country_dicts)
# 给索引命名
country_dict_s.index.name = 'Code'
# 给数据命名
country_dict_s.name = 'Country'

print(country_dict_s)
print(country_dict_s.values)
print(country_dict_s.index)

Code
CH      中国
US      美国
AU    澳大利亚
Name: Country, dtype: object
['中国' '美国' '澳大利亚']
Index(['CH', 'US', 'AU'], dtype='object', name='Code')


* 处理缺失数据

In [12]:
countries = ['中国', '美国', '澳大利亚', None]
print(pd.Series(countries))

0      中国
1      美国
2    澳大利亚
3    None
dtype: object


In [13]:
numbers = [4, 5, 6, None]
print(pd.Series(numbers))

0    4.0
1    5.0
2    6.0
3    NaN
dtype: float64


* Series 索引

In [14]:
country_dicts = {'CH': '中国',
                'US': '美国',
                'AU': '澳大利亚'}

country_dict_s = pd.Series(country_dicts)
print(country_dict_s)

CH      中国
US      美国
AU    澳大利亚
dtype: object


In [15]:
print(country_dict_s.index)
print(country_dict_s['US'])
print(country_dict_s.loc['US'])
print(country_dict_s.iloc[1])

Index(['CH', 'US', 'AU'], dtype='object')
美国
美国
美国


In [16]:
print('US' in country_dict_s)

True


In [17]:
country_dict_s = country_dict_s + 'a'
country_dict_s

CH      中国a
US      美国a
AU    澳大利亚a
dtype: object

In [18]:
len(country_dict_s)

3

In [19]:
# 通过索引判断数据是存在
# Series也可看作定长、有序的字典
print('CH' in country_dict_s)
print('NZ' in country_dict_s)

True
False


In [20]:
print('iloc:', country_dict_s.iloc[1])
print('loc:', country_dict_s.loc['US'])
print('[]:', country_dict_s['US'])

iloc: 美国a
loc: 美国a
[]: 美国a


In [21]:
print(country_dict_s.iloc[0:2])
print(country_dict_s.loc[ ['CH', 'US'] ])

CH    中国a
US    美国a
dtype: object
CH    中国a
US    美国a
dtype: object


In [22]:
print('iloc:\n', country_dict_s.iloc[ [0, 2] ])
print()
print('loc:\n', country_dict_s.loc[['US', 'AU']])

iloc:
 CH      中国a
AU    澳大利亚a
dtype: object

loc:
 US      美国a
AU    澳大利亚a
dtype: object


*  向量化操作

In [23]:
import numpy as np

s = pd.Series(np.random.randint(0, 1000, 10000))
print(s.head())
print(len(s))

0    137
1    624
2    773
3    333
4     56
dtype: int32
10000


In [24]:
%%timeit -n 100
total = 0
for item in s:
    total += item

490 µs ± 33.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [25]:
%%timeit -n 100
total = np.sum(s)

82 µs ± 12.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [26]:
%%timeit -n 10
s = pd.Series(np.random.randint(0, 1000, 1000))
for label, value in s.iteritems():
    s.loc[label] = value + 2

105 ms ± 10.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [27]:
%%timeit -n 10
s = pd.Series(np.random.randint(0, 1000, 10000))
s += 2

208 µs ± 81.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## 2. DataFrame

* 创建Dataframe

In [28]:
import pandas as pd

country1 = pd.Series({'Name': '中国',
                    'Language': 'Chinese',
                    'Area': '9.597M km2',
                     'Happiness Rank': 79})

country2 = pd.Series({'Name': '美国',
                    'Language': 'English (US)',
                    'Area': '9.834M km2',
                     'Happiness Rank': 14})

country3 = pd.Series({'Name': '澳大利亚',
                    'Language': 'English (AU)',
                    'Area': '7.692M km2',
                     'Happiness Rank': 9})

country4 = pd.Series({'Name': '澳大利亚',
                    'Language': 'English (AU)',
                    'Area': '7.692M km2',
                     'Happiness Rank': 9})

df = pd.DataFrame([country1, country2, country3], index=['CH', 'US', 'AU'])

In [29]:
# 注意在jupyter中使用print和不使用print的区别
print(df)
df

    Name      Language        Area  Happiness Rank
CH    中国       Chinese  9.597M km2              79
US    美国  English (US)  9.834M km2              14
AU  澳大利亚  English (AU)  7.692M km2               9


Unnamed: 0,Name,Language,Area,Happiness Rank
CH,中国,Chinese,9.597M km2,79
US,美国,English (US),9.834M km2,14
AU,澳大利亚,English (AU),7.692M km2,9


In [30]:
n = np.arange(0, 30, 2).reshape(5, 3)
n

array([[ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16],
       [18, 20, 22],
       [24, 26, 28]])

In [31]:
n_df = pd.DataFrame(n, columns=['one', 'two', 'three'])
n_df

Unnamed: 0,one,two,three
0,0,2,4
1,6,8,10
2,12,14,16
3,18,20,22
4,24,26,28


In [32]:
#n_df.head()
#n_df.tail()
n_df.describe()

Unnamed: 0,one,two,three
count,5.0,5.0,5.0
mean,12.0,14.0,16.0
std,9.486833,9.486833,9.486833
min,0.0,2.0,4.0
25%,6.0,8.0,10.0
50%,12.0,14.0,16.0
75%,18.0,20.0,22.0
max,24.0,26.0,28.0


In [33]:
import pandas as pd
dt = {0: [9, 8, 7, 6], 1: [3, 2, 1, 0]}
a = pd.DataFrame(dt)
a

Unnamed: 0,0,1
0,9,3
1,8,2
2,7,1
3,6,0


In [34]:
list(a[1])

[3, 2, 1, 0]

In [35]:
print(type(df.loc['CH']))

<class 'pandas.core.series.Series'>


In [36]:
print(type(df['Name']))

<class 'pandas.core.series.Series'>


In [37]:
# 添加数据
# 如果个数小于要求的个数，会自动进行“广播”操作
# 如果大于要求的个数，会报错
df['Location'] = '地球'
print(df)

df['Region'] = ['亚洲', '北美洲', '大洋洲']
df['raw_index'] = [i for i in range(len(df))]
# print(df)
df

    Name      Language        Area  Happiness Rank Location
CH    中国       Chinese  9.597M km2              79       地球
US    美国  English (US)  9.834M km2              14       地球
AU  澳大利亚  English (AU)  7.692M km2               9       地球


Unnamed: 0,Name,Language,Area,Happiness Rank,Location,Region,raw_index
CH,中国,Chinese,9.597M km2,79,地球,亚洲,0
US,美国,English (US),9.834M km2,14,地球,北美洲,1
AU,澳大利亚,English (AU),7.692M km2,9,地球,大洋洲,2


* Dataframe索引

In [38]:
df.loc['CH']

Name                      中国
Language             Chinese
Area              9.597M km2
Happiness Rank            79
Location                  地球
Region                    亚洲
raw_index                  0
Name: CH, dtype: object

In [39]:
df.iloc[0]

Name                      中国
Language             Chinese
Area              9.597M km2
Happiness Rank            79
Location                  地球
Region                    亚洲
raw_index                  0
Name: CH, dtype: object

In [40]:
print(df['Happiness Rank'] < 10)
print(df[df['Happiness Rank'] < 10])

CH    False
US    False
AU     True
Name: Happiness Rank, dtype: bool
    Name      Language        Area  Happiness Rank Location Region  raw_index
AU  澳大利亚  English (AU)  7.692M km2               9       地球    大洋洲          2


In [41]:
print(df.iloc[1:3])
print(df[['Name', 'Area', 'Happiness Rank']])

    Name      Language        Area  Happiness Rank Location Region  raw_index
US    美国  English (US)  9.834M km2              14       地球    北美洲          1
AU  澳大利亚  English (AU)  7.692M km2               9       地球    大洋洲          2
    Name        Area  Happiness Rank
CH    中国  9.597M km2              79
US    美国  9.834M km2              14
AU  澳大利亚  7.692M km2               9


In [42]:
df['Location'] = 'Earth'
df

Unnamed: 0,Name,Language,Area,Happiness Rank,Location,Region,raw_index
CH,中国,Chinese,9.597M km2,79,Earth,亚洲,0
US,美国,English (US),9.834M km2,14,Earth,北美洲,1
AU,澳大利亚,English (AU),7.692M km2,9,Earth,大洋洲,2


In [43]:
df['Region'] = ['Asia', 'North America', 'Ocean']
df

Unnamed: 0,Name,Language,Area,Happiness Rank,Location,Region,raw_index
CH,中国,Chinese,9.597M km2,79,Earth,Asia,0
US,美国,English (US),9.834M km2,14,Earth,North America,1
AU,澳大利亚,English (AU),7.692M km2,9,Earth,Ocean,2


In [44]:
df.loc[df['Happiness Rank'] > 70, 'Happiness Rank'] = 70
df

Unnamed: 0,Name,Language,Area,Happiness Rank,Location,Region,raw_index
CH,中国,Chinese,9.597M km2,70,Earth,Asia,0
US,美国,English (US),9.834M km2,14,Earth,North America,1
AU,澳大利亚,English (AU),7.692M km2,9,Earth,Ocean,2


In [45]:
# 行索引
print('loc:')
print(df.loc['CH'])
print(type(df.loc['CH']))

print('iloc:')
print(df.iloc[1])

loc:
Name                      中国
Language             Chinese
Area              9.597M km2
Happiness Rank            70
Location               Earth
Region                  Asia
raw_index                  0
Name: CH, dtype: object
<class 'pandas.core.series.Series'>
iloc:
Name                         美国
Language           English (US)
Area                 9.834M km2
Happiness Rank               14
Location                  Earth
Region            North America
raw_index                     1
Name: US, dtype: object


In [46]:
# 列索引
print(df['Area'])
print(type(df['Area']))

CH    9.597M km2
US    9.834M km2
AU    7.692M km2
Name: Area, dtype: object
<class 'pandas.core.series.Series'>


In [47]:
# 获取不连续的列数据
print(df[['Name', 'Area']])

    Name        Area
CH    中国  9.597M km2
US    美国  9.834M km2
AU  澳大利亚  7.692M km2


In [48]:
# 混合索引
# 注意写法上的区别
# 不管是先行还是先列, 拿出之后都是一个Series, 之后的访问就可以多种多样了
print('先取出列，再取行：')
print(df['Area']['CH'])
print(df['Area'].loc['CH'])
print(df['Area'].iloc[0])

print('先取出行，再取列：')
print(df.loc['CH']['Area'])
print(df.iloc[0]['Area'])
print(df.loc['CH'].loc['Area'])
print(df.loc['CH'].iloc[0])

先取出列，再取行：
9.597M km2
9.597M km2
9.597M km2
先取出行，再取列：
9.597M km2
9.597M km2
9.597M km2
中国


In [49]:
# 转换行和列
print(df.T)

                        CH             US            AU
Name                    中国             美国          澳大利亚
Language           Chinese   English (US)  English (AU)
Area            9.597M km2     9.834M km2    7.692M km2
Happiness Rank          70             14             9
Location             Earth          Earth         Earth
Region                Asia  North America         Ocean
raw_index                0              1             2


* 删除数据

In [52]:
print(df.drop(['CH']))
# 注意drop操作只是将修改后的数据copy一份，而不会对原始数据进行修改
print(df)
df = df.drop(['CH'])

KeyError: "labels ['CH'] not contained in axis"

In [54]:
print(df.drop(['CH'], inplace=True))
# 如果使用了inplace=True，会在原始数据上进行修改，同时不会返回一个copy
print(df)

KeyError: "labels ['CH'] not contained in axis"

In [None]:
#  如果需要删除列，需要指定axis=1
print(df.drop(['Area'], axis=1))
print(df)

In [None]:
# 也可直接使用del关键字
del df['Name']
print(df)

* DataFrame的操作与加载

In [56]:
df['Happiness Rank']

US    14
AU     9
Name: Happiness Rank, dtype: int64

In [57]:
# 注意从DataFrame中取出的数据进行操作后，会对原始数据产生影响
ranks = df['Happiness Rank']
ranks += 2
print(ranks)
print(df)

US    16
AU    11
Name: Happiness Rank, dtype: int64
    Name      Language        Area  Happiness Rank Location         Region  \
US    美国  English (US)  9.834M km2              16    Earth  North America   
AU  澳大利亚  English (AU)  7.692M km2              11    Earth          Ocean   

    raw_index  
US          1  
AU          2  


In [58]:
# 注意从DataFrame中取出的数据进行操作后，会对原始数据产生影响
# 安全的操作是使用copy()
ranks = df['Happiness Rank'].copy()
ranks += 2
print(ranks)
print(df)

US    18
AU    13
Name: Happiness Rank, dtype: int64
    Name      Language        Area  Happiness Rank Location         Region  \
US    美国  English (US)  9.834M km2              16    Earth  North America   
AU  澳大利亚  English (AU)  7.692M km2              11    Earth          Ocean   

    raw_index  
US          1  
AU          2  


In [59]:
# 加载csv文件数据
reprot_2015_df = pd.read_csv('./2015.csv')
print('2015年数据预览：')
#print(reprot_2015_df.head())
reprot_2015_df.head()

2015年数据预览：


Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [60]:
reprot_2015_df = pd.read_csv('./2015.csv', index_col='Happiness Rank')
print('2015年数据预览：')
reprot_2015_df.head(10)

2015年数据预览：


Unnamed: 0_level_0,Country,Region,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
Happiness Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Switzerland,Western Europe,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
2,Iceland,Western Europe,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
3,Denmark,Western Europe,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
4,Norway,Western Europe,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
5,Canada,North America,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176
6,Finland,Western Europe,7.406,0.0314,1.29025,1.31826,0.88911,0.64169,0.41372,0.23351,2.61955
7,Netherlands,Western Europe,7.378,0.02799,1.32944,1.28017,0.89284,0.61576,0.31814,0.4761,2.4657
8,Sweden,Western Europe,7.364,0.03157,1.33171,1.28907,0.91087,0.6598,0.43844,0.36262,2.37119
9,New Zealand,Australia and New Zealand,7.286,0.03371,1.25018,1.31967,0.90837,0.63938,0.42922,0.47501,2.26425
10,Australia,Australia and New Zealand,7.284,0.04083,1.33358,1.30923,0.93156,0.65124,0.35637,0.43562,2.26646


In [61]:
print(reprot_2015_df.info())
reprot_2015_df.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158 entries, 1 to 158
Data columns (total 11 columns):
Country                          158 non-null object
Region                           158 non-null object
Happiness Score                  158 non-null float64
Standard Error                   158 non-null float64
Economy (GDP per Capita)         158 non-null float64
Family                           158 non-null float64
Health (Life Expectancy)         158 non-null float64
Freedom                          158 non-null float64
Trust (Government Corruption)    158 non-null float64
Generosity                       158 non-null float64
Dystopia Residual                158 non-null float64
dtypes: float64(9), object(2)
memory usage: 14.8+ KB
None


Unnamed: 0,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,5.375734,0.047885,0.846137,0.991046,0.630259,0.428615,0.143422,0.237296,2.098977
std,1.14501,0.017146,0.403121,0.272369,0.247078,0.150693,0.120034,0.126685,0.55355
min,2.839,0.01848,0.0,0.0,0.0,0.0,0.0,0.0,0.32858
25%,4.526,0.037268,0.545808,0.856823,0.439185,0.32833,0.061675,0.150553,1.75941
50%,5.2325,0.04394,0.910245,1.02951,0.696705,0.435515,0.10722,0.21613,2.095415
75%,6.24375,0.0523,1.158448,1.214405,0.811013,0.549092,0.180255,0.309883,2.462415
max,7.587,0.13693,1.69042,1.40223,1.02525,0.66973,0.55191,0.79588,3.60214


## 3. 索引

In [67]:
# 使用index_col指定索引列
# 使用usecols指定需要读取的列
report_2016_df = pd.read_csv('./2016.csv', 
                             index_col='Country',
                             usecols=['Country', 'Happiness Rank', 'Happiness Score', 'Region'])
reprot_2016_df = pd.read_csv('./2016.csv', 
                             index_col='Country',
                             usecols=['Country', 'Happiness Rank', 'Happiness Score', 'Region'])
# 数据预览
report_2016_df.head()

Unnamed: 0_level_0,Region,Happiness Rank,Happiness Score
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Denmark,Western Europe,1,7.526
Switzerland,Western Europe,2,7.509
Iceland,Western Europe,3,7.501
Norway,Western Europe,4,7.498
Finland,Western Europe,5,7.413


In [68]:
print('列名(column)：', report_2016_df.columns)
print('行名(index)：', report_2016_df.index)

列名(column)： Index(['Region', 'Happiness Rank', 'Happiness Score'], dtype='object')
行名(index)： Index(['Denmark', 'Switzerland', 'Iceland', 'Norway', 'Finland', 'Canada',
       'Netherlands', 'New Zealand', 'Australia', 'Sweden',
       ...
       'Madagascar', 'Tanzania', 'Liberia', 'Guinea', 'Rwanda', 'Benin',
       'Afghanistan', 'Togo', 'Syria', 'Burundi'],
      dtype='object', name='Country', length=157)


In [69]:
# 注意index是不可变的
#report_2016_df.index[0] = '丹麦'

In [70]:
# 重置index
# 注意inplace加与不加的区别
reprot_2016_df.reset_index(inplace=True)

In [71]:
report_2016_df.head()

Unnamed: 0_level_0,Region,Happiness Rank,Happiness Score
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Denmark,Western Europe,1,7.526
Switzerland,Western Europe,2,7.509
Iceland,Western Europe,3,7.501
Norway,Western Europe,4,7.498
Finland,Western Europe,5,7.413


In [72]:
print(report_2016_df.index)

Index(['Denmark', 'Switzerland', 'Iceland', 'Norway', 'Finland', 'Canada',
       'Netherlands', 'New Zealand', 'Australia', 'Sweden',
       ...
       'Madagascar', 'Tanzania', 'Liberia', 'Guinea', 'Rwanda', 'Benin',
       'Afghanistan', 'Togo', 'Syria', 'Burundi'],
      dtype='object', name='Country', length=157)


In [73]:
# 重命名列名
report_2016_df = report_2016_df.rename(columns={'Region': '地区', 'Hapiness Rank': '排名', 'Hapiness Score': '幸福指数'})
report_2016_df.head()

Unnamed: 0_level_0,地区,Happiness Rank,Happiness Score
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Denmark,Western Europe,1,7.526
Switzerland,Western Europe,2,7.509
Iceland,Western Europe,3,7.501
Norway,Western Europe,4,7.498
Finland,Western Europe,5,7.413


In [74]:
# 重命名列名，注意inplace的使用
report_2016_df.rename(columns={'Region': '地区', 'Happiness Rank': '排名', 'Happiness Score': '幸福指数'},
                     inplace=True)
report_2016_df.head()

Unnamed: 0_level_0,地区,排名,幸福指数
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Denmark,Western Europe,1,7.526
Switzerland,Western Europe,2,7.509
Iceland,Western Europe,3,7.501
Norway,Western Europe,4,7.498
Finland,Western Europe,5,7.413


## 4. Boolean Mask

In [75]:
report_2016_df.head(20)

Unnamed: 0_level_0,地区,排名,幸福指数
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Denmark,Western Europe,1,7.526
Switzerland,Western Europe,2,7.509
Iceland,Western Europe,3,7.501
Norway,Western Europe,4,7.498
Finland,Western Europe,5,7.413
Canada,North America,6,7.404
Netherlands,Western Europe,7,7.339
New Zealand,Australia and New Zealand,8,7.334
Australia,Australia and New Zealand,9,7.313
Sweden,Western Europe,10,7.291


In [76]:
report_2016_df.tail(20)

Unnamed: 0_level_0,地区,排名,幸福指数
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Comoros,Sub-Saharan Africa,138,3.956
Ivory Coast,Sub-Saharan Africa,139,3.916
Cambodia,Southeastern Asia,140,3.907
Angola,Sub-Saharan Africa,141,3.866
Niger,Sub-Saharan Africa,142,3.856
South Sudan,Sub-Saharan Africa,143,3.832
Chad,Sub-Saharan Africa,144,3.763
Burkina Faso,Sub-Saharan Africa,145,3.739
Uganda,Sub-Saharan Africa,145,3.739
Yemen,Middle East and Northern Africa,147,3.724


In [77]:
# 过滤 Western Europe 地区的国家
# only_western_europe = reprot_2016_df['地区'] == 'Western Europe'
reprot_2016_df[reprot_2016_df['地区'] == 'Western Europe']

KeyError: '地区'

In [78]:
# 过滤 Western Europe 地区的国家
# 并且排名在10之外
only_western_europe_10 = (reprot_2016_df['地区'] == 'Western Europe') & (reprot_2016_df['排名'] > 10)
only_western_europe_10

KeyError: '地区'

In [None]:
# 叠加 boolean mask 得到最终结果
reprot_2016_df[only_western_europe_10]

In [79]:
# 熟练以后可以写在一行中
np.mean(reprot_2016_df[(reprot_2016_df['地区'] == 'Western Europe') & (reprot_2016_df['排名'] > 10)]['幸福指数'])

KeyError: '地区'

## 5. 层级索引

In [81]:
reprot_2015_df_mi = reprot_2015_df.set_index(['Region', 'Country'])
reprot_2015_df_mi

Unnamed: 0_level_0,Unnamed: 1_level_0,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
Region,Country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Western Europe,Switzerland,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
Western Europe,Iceland,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.43630,2.70201
Western Europe,Denmark,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
Western Europe,Norway,7.522,0.03880,1.45900,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
North America,Canada,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176
Western Europe,Finland,7.406,0.03140,1.29025,1.31826,0.88911,0.64169,0.41372,0.23351,2.61955
Western Europe,Netherlands,7.378,0.02799,1.32944,1.28017,0.89284,0.61576,0.31814,0.47610,2.46570
Western Europe,Sweden,7.364,0.03157,1.33171,1.28907,0.91087,0.65980,0.43844,0.36262,2.37119
Australia and New Zealand,New Zealand,7.286,0.03371,1.25018,1.31967,0.90837,0.63938,0.42922,0.47501,2.26425
Australia and New Zealand,Australia,7.284,0.04083,1.33358,1.30923,0.93156,0.65124,0.35637,0.43562,2.26646


In [82]:
reprot_2015_df_mi.index

MultiIndex(levels=[['Australia and New Zealand', 'Central and Eastern Europe', 'Eastern Asia', 'Latin America and Caribbean', 'Middle East and Northern Africa', 'North America', 'Southeastern Asia', 'Southern Asia', 'Sub-Saharan Africa', 'Western Europe'], ['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Belarus', 'Belgium', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo (Brazzaville)', 'Congo (Kinshasa)', 'Costa Rica', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Estonia', 'Ethiopia', 'Finland', 'France', 'Gabon', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Haiti', 'Honduras', 'Hong Kong', 'Hungary', 'Iceland', 'India', 

In [83]:
reprot_2015_df_mi = reprot_2015_df_mi.sort_index(level=0)

In [84]:
reprot_2015_df_mi.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
Region,Country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Australia and New Zealand,Australia,7.284,0.04083,1.33358,1.30923,0.93156,0.65124,0.35637,0.43562,2.26646
Australia and New Zealand,New Zealand,7.286,0.03371,1.25018,1.31967,0.90837,0.63938,0.42922,0.47501,2.26425
Central and Eastern Europe,Albania,4.959,0.05013,0.87867,0.80434,0.81325,0.35733,0.06413,0.14272,1.89894
Central and Eastern Europe,Armenia,4.35,0.04763,0.76821,0.77711,0.7299,0.19847,0.039,0.07855,1.75873
Central and Eastern Europe,Azerbaijan,5.212,0.03363,1.02389,0.93793,0.64045,0.3703,0.16065,0.07799,2.00073


In [87]:
reprot_2015_df_mi.index

MultiIndex(levels=[['Australia and New Zealand', 'Central and Eastern Europe', 'Eastern Asia', 'Latin America and Caribbean', 'Middle East and Northern Africa', 'North America', 'Southeastern Asia', 'Southern Asia', 'Sub-Saharan Africa', 'Western Europe'], ['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Belarus', 'Belgium', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo (Brazzaville)', 'Congo (Kinshasa)', 'Costa Rica', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Estonia', 'Ethiopia', 'Finland', 'France', 'Gabon', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Haiti', 'Honduras', 'Hong Kong', 'Hungary', 'Iceland', 'India', 

In [94]:
%%timeit -n 1

print(np.mean(reprot_2015_df_mi.loc['Central and Eastern Europe']['Happiness Score']))
print(np.mean(reprot_2015_df_mi.loc['Eastern Asia']['Happiness Score']))

5.332931034482758
5.626166666666666
5.332931034482758
5.626166666666666
5.332931034482758
5.626166666666666
5.332931034482758
5.626166666666666
5.332931034482758
5.626166666666666
5.332931034482758
5.626166666666666
5.332931034482758
5.626166666666666
1.17 ms ± 101 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [106]:
%%timeit -n 1

reprot_2015_df_2 = pd.read_csv('./2015.csv')
np.mean(reprot_2015_df_2.loc[reprot_2015_df_2['Region'] == 'Central and Eastern Europe', 'Happiness Score'])

2.58 ms ± 292 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [89]:
reprot_2015_df_mi.loc['Eastern Asia', 'China']

Happiness Score                  5.14000
Standard Error                   0.02424
Economy (GDP per Capita)         0.89012
Family                           0.94675
Health (Life Expectancy)         0.81658
Freedom                          0.51697
Trust (Government Corruption)    0.02781
Generosity                       0.08185
Dystopia Residual                1.86040
Name: (Eastern Asia, China), dtype: float64

In [90]:
reprot_2015_df.head()

Unnamed: 0_level_0,Country,Region,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
Happiness Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Switzerland,Western Europe,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
2,Iceland,Western Europe,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
3,Denmark,Western Europe,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
4,Norway,Western Europe,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
5,Canada,North America,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [91]:
# 设置层级索引
report_2015_df2 = report_2015_df.set_index(['Region', 'Country'])
report_2015_df2.head(20)

NameError: name 'report_2015_df' is not defined

In [None]:
# level0 索引
report_2015_df2.loc['Western Europe']

In [None]:
# 两层索引
report_2015_df2.loc['Western Europe', 'Switzerland']

In [None]:
# 交换分层顺序
report_2015_df2.swaplevel()

In [None]:
# 排序分层
report_2015_df2.sort_index(level=0)