In [1]:
import pandas as pd

# Series详解

## 创建Series

### 传入一个Python列表

In [2]:
# Build a Series from a plain Python list. Mixing a string with an integer
# gives dtype object, as the printed output shows.
s = pd.Series(data=['banana', 42])
# Print the Series and then its Python type, one per line.
print(s, type(s), sep='\n')

0    banana
1        42
dtype: object
<class 'pandas.core.series.Series'>


In [6]:
# A list that contains only integers produces an integer-typed Series.
s = pd.Series(data=[50, 42])
# Print the Series and then its Python type, one per line.
print(s, type(s), sep='\n')

0    50
1    42
dtype: int64
<class 'pandas.core.series.Series'>


In [7]:
# A list that contains only strings produces a Series of text values.
s = pd.Series(data=['banana', 'apple'])
# Print the Series and then its Python type, one per line.
print(s, type(s), sep='\n')

0    banana
1     apple
dtype: object
<class 'pandas.core.series.Series'>


### 注意：0，1 是Series的行标签；未指定index时，行标签默认为从0开始的整数（0,1,2,3,4,…）

### 创建Series时，也可以通过index参数来指定行标签

In [9]:
# Pass explicit row labels through `index` instead of the default 0, 1, ...
s = pd.Series(
    data=['banana', 42],
    index=['name', 'price'],
)
# Print the Series and then its Python type, one per line.
print(s, type(s), sep='\n')

name     banana
price        42
dtype: object
<class 'pandas.core.series.Series'>


## Series的常用操作

In [9]:
# Load the scientists dataset from CSV into a DataFrame, then display it.
# No index column is given, so the rows get the default integer labels.
scientists=pd.read_csv('./data(1)/scientists.csv')
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


### 查看形状

In [15]:
# Shape of the scientists DataFrame as a (rows, columns) tuple.
scientists.shape

(8, 5)

### 查看元素总个数

In [17]:
# Total number of elements in the DataFrame (rows x columns).
scientists.size

40

### 获取行标签

In [19]:
# Row labels (index) of the DataFrame.
scientists.index

RangeIndex(start=0, stop=8, step=1)

### 根据列标签获取列的数据

In [18]:
# Select the 'Age' column by its column label; a single column is a Series.
age_series = scientists['Age']
# Print the column and then its Python type, one per line.
print(age_series, type(age_series), sep='\n')

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64
<class 'pandas.core.series.Series'>


### 获取元素值

In [21]:
# Element values of the Series as a NumPy array.
# pandas recommends to_numpy() over the older .values attribute; for this
# integer column both give the same ndarray.
age_series.to_numpy()

array([37, 61, 90, 66, 56, 45, 41, 77], dtype=int64)

### 获取数据行标签


In [22]:
# Row labels of the Series; Series.keys() is an alias for this attribute.
age_series.index

RangeIndex(start=0, stop=8, step=1)

### 根据行标签获取某个元素数据

In [23]:
# Look up a single element by its row label with .loc.
age_series.loc[3]

66

### 根据行位置获取某个元素数据

In [26]:
# Look up a single element by its integer position with .iloc.
age_series.iloc[2]

90

### 查看数据元素的类型

In [28]:
# Data type of the elements stored in the Series.
age_series.dtype

dtype('int64')

## 常用统计方法

In [29]:
# Mean of the ages.
age_series.mean()

59.125

In [30]:
# Largest age.
age_series.max()

90

In [31]:
# Smallest age.
age_series.min()

37

In [33]:
# Standard deviation of the ages (pandas defaults to the sample form, ddof=1).
age_series.std()

18.325918413937288

In [34]:
# Number of times each distinct age occurs.
age_series.value_counts()

77    1
61    1
90    1
41    1
56    1
45    1
37    1
66    1
Name: Age, dtype: int64

In [35]:
# Select the 'Occupation' column as a Series.
occupation_series = scientists['Occupation']
# Print the column and then its Python type, one per line.
print(occupation_series, type(occupation_series), sep='\n')

0               Chemist
1          Statistician
2                 Nurse
3               Chemist
4             Biologist
5             Physician
6    Computer Scientist
7         Mathematician
Name: Occupation, dtype: object
<class 'pandas.core.series.Series'>


In [36]:
# Number of times each occupation occurs.
occupation_series.value_counts()

Chemist               2
Mathematician         1
Computer Scientist    1
Physician             1
Nurse                 1
Biologist             1
Statistician          1
Name: Occupation, dtype: int64

In [40]:
# Number of non-null elements in the Age column.
age_series.count()

8

In [39]:
# age_series holds numeric data, so describe() reports count, mean, std,
# min, the quartiles and max.
age_series.describe()

count     8.000000
mean     59.125000
std      18.325918
min      37.000000
25%      44.000000
50%      58.500000
75%      68.750000
max      90.000000
Name: Age, dtype: float64

In [41]:
# occupation_series holds non-numeric data, so describe() reports count,
# unique, top (most frequent value) and freq instead.
occupation_series.describe()

count           8
unique          7
top       Chemist
freq            2
Name: Occupation, dtype: object

## bool索引

In [42]:
# Keep only the elements whose position in the boolean list holds True.
bool_values = [
    False, True, True, True,
    False, False, False, True,
]
age_series.loc[bool_values]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

In [43]:
# Application: build the boolean mask that marks ages above the mean.
# The mask is used to filter age_series.
age_series.gt(age_series.mean())

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

In [44]:
# Keep only the ages that are greater than the mean age.
age_series.loc[lambda ages: ages > ages.mean()]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

## series 运算

In [45]:
# Add a scalar to every element; method form of `age_series + 100`.
age_series.add(100)

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64

In [46]:
# Multiply every element by a scalar; method form of `age_series * 2`.
age_series.mul(2)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

### Series和另一个Series运算

In [47]:
# Add two Series element by element, matched on row label.
# This is the method form of `age_series + age_series`.
age_series.add(age_series)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

### DataFrame常用创建方法

In [4]:
# Build a DataFrame from a dict of columns. `columns` sets the column order
# and `index` supplies the row labels.
peoples = pd.DataFrame(
    data={
        'Occupation': ['Teacher', 'IT Engineer'],
        'Age': [18, 30],
    },
    index=['Smart', 'David'],
    columns=['Age', 'Occupation'],
)
peoples

Unnamed: 0,Age,Occupation
Smart,18,Teacher
David,30,IT Engineer


In [8]:
# Build a DataFrame from a nested list, one inner list per row. `columns`
# names the columns and `index` supplies the row labels.
peoples = pd.DataFrame(
    data=[['Teacher', 18], ['IT Engineer', 30]],
    index=['Smart', 'David'],
    columns=['Occupation', 'Age'],
)
peoples

Unnamed: 0,Occupation,Age
Smart,Teacher,18
David,IT Engineer,30


In [11]:
# Shape of the DataFrame as a (rows, columns) tuple.
scientists.shape

(8, 5)

In [12]:
# Total number of elements in the DataFrame.
scientists.size

40

In [13]:
# Number of dimensions of the DataFrame.
scientists.ndim

2

In [14]:
# Number of rows in the DataFrame, read from the first entry of its shape.
scientists.shape[0]

8

In [15]:
# Row labels of the DataFrame.
scientists.index

RangeIndex(start=0, stop=8, step=1)

In [16]:
# Column labels of the DataFrame.
scientists.columns

Index(['Name', 'Born', 'Died', 'Age', 'Occupation'], dtype='object')

In [19]:
# Data type of each column in the DataFrame.
scientists.dtypes

Name          object
Born          object
Died          object
Age            int64
Occupation    object
dtype: object

In [18]:
# Structural summary of the DataFrame: per-column non-null counts and dtypes,
# plus memory usage.
scientists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        8 non-null      object
 1   Born        8 non-null      object
 2   Died        8 non-null      object
 3   Age         8 non-null      int64 
 4   Occupation  8 non-null      object
dtypes: int64(1), object(4)
memory usage: 448.0+ bytes


In [21]:
# First n rows of the DataFrame; n defaults to 5.
scientists.head()

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist


In [22]:
# Last n rows of the DataFrame (n defaults to 5); here n is 3.
scientists.tail(3)

Unnamed: 0,Name,Born,Died,Age,Occupation
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


### 常用统计方法

In [23]:
# Maximum value of each column in the DataFrame.
scientists.max()

Name          William Gosset
Born              1920-07-25
Died              1964-04-14
Age                       90
Occupation      Statistician
dtype: object

In [24]:
# Minimum value of each column in the DataFrame.
scientists.min()

Name          Alan Turing
Born           1777-04-30
Died           1855-02-23
Age                    37
Occupation      Biologist
dtype: object

In [25]:
# Number of non-null elements in each column of the DataFrame.
scientists.count()

Name          8
Born          8
Died          8
Age           8
Occupation    8
dtype: int64

In [26]:
# Summary statistics for the columns of the DataFrame.
# By default only numeric columns are summarized;
# pass the `include` parameter to summarize non-numeric columns.
scientists.describe()

Unnamed: 0,Age
count,8.0
mean,59.125
std,18.325918
min,37.0
25%,44.0
50%,58.5
75%,68.75
max,90.0


In [27]:
# Import the NumPy package (a Python library, not a jar).
import numpy as np

In [28]:
# Summarize only the object-dtype (non-numeric) columns:
# count, unique, top and freq.
scientists.describe(include=[np.object_])

Unnamed: 0,Name,Born,Died,Occupation
count,8,8,8,8
unique,8,8,8,7
top,Alan Turing,1820-05-12,1958-04-16,Chemist
freq,1,1,1,2


## bool索引

### DataFrame支持bool索引，可以从DataFrame获取bool索引为True的对应行的数据

In [31]:
# Display the whole scientists DataFrame for reference.
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [33]:
# Keep only the rows whose position in the boolean list holds True.
# NOTE(review): `book_values` looks like a typo for `bool_values`. It is left
# as is because the name lives in the notebook namespace - confirm no other
# cell relies on it before renaming.
book_values=[False,True,True,True,False,False,False,True]
scientists[book_values]

Unnamed: 0,Name,Born,Died,Age,Occupation
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [36]:
# Application: find the scientists whose Age is above the mean.
# This expression is the boolean mask, one flag per row.
scientists['Age'].gt(scientists['Age'].mean())

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

In [37]:
# Keep only the rows whose Age is greater than the mean Age.
scientists.loc[lambda frame: frame['Age'] > frame['Age'].mean()]

Unnamed: 0,Name,Born,Died,Age,Occupation
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


## DataFrame运算

In [38]:
# Arithmetic between a DataFrame and a scalar.
# The numeric Age column is doubled, while the string columns are repeated
# twice (string * int), as the output shows.
scientists * 2

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline FranklinRosaline Franklin,1920-07-251920-07-25,1958-04-161958-04-16,74,ChemistChemist
1,William GossetWilliam Gosset,1876-06-131876-06-13,1937-10-161937-10-16,122,StatisticianStatistician
2,Florence NightingaleFlorence Nightingale,1820-05-121820-05-12,1910-08-131910-08-13,180,NurseNurse
3,Marie CurieMarie Curie,1867-11-071867-11-07,1934-07-041934-07-04,132,ChemistChemist
4,Rachel CarsonRachel Carson,1907-05-271907-05-27,1964-04-141964-04-14,112,BiologistBiologist
5,John SnowJohn Snow,1813-03-151813-03-15,1858-06-161858-06-16,90,PhysicianPhysician
6,Alan TuringAlan Turing,1912-06-231912-06-23,1954-06-071954-06-07,82,Computer ScientistComputer Scientist
7,Johann GaussJohann Gauss,1777-04-301777-04-30,1855-02-231855-02-23,154,MathematicianMathematician


### DataFrame和另一个DataFrame运算

In [39]:
# Add a DataFrame to itself: numbers are summed and strings are
# concatenated, as the output shows.
scientists + scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline FranklinRosaline Franklin,1920-07-251920-07-25,1958-04-161958-04-16,74,ChemistChemist
1,William GossetWilliam Gosset,1876-06-131876-06-13,1937-10-161937-10-16,122,StatisticianStatistician
2,Florence NightingaleFlorence Nightingale,1820-05-121820-05-12,1910-08-131910-08-13,180,NurseNurse
3,Marie CurieMarie Curie,1867-11-071867-11-07,1934-07-041934-07-04,132,ChemistChemist
4,Rachel CarsonRachel Carson,1907-05-271907-05-27,1964-04-141964-04-14,112,BiologistBiologist
5,John SnowJohn Snow,1813-03-151813-03-15,1858-06-161858-06-16,90,PhysicianPhysician
6,Alan TuringAlan Turing,1912-06-231912-06-23,1954-06-071954-06-07,82,Computer ScientistComputer Scientist
7,Johann GaussJohann Gauss,1777-04-301777-04-30,1855-02-231855-02-23,154,MathematicianMathematician


In [40]:
# Add the full frame to its first four rows only. Rows are matched on label,
# so labels 4-7 have no partner and come out as missing values.
scientists + scientists.head(4)

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline FranklinRosaline Franklin,1920-07-251920-07-25,1958-04-161958-04-16,74.0,ChemistChemist
1,William GossetWilliam Gosset,1876-06-131876-06-13,1937-10-161937-10-16,122.0,StatisticianStatistician
2,Florence NightingaleFlorence Nightingale,1820-05-121820-05-12,1910-08-131910-08-13,180.0,NurseNurse
3,Marie CurieMarie Curie,1867-11-071867-11-07,1934-07-041934-07-04,132.0,ChemistChemist
4,,,,,
5,,,,,
6,,,,,
7,,,,,


## 行标签和列标签的操作

### 加载数据后，指定某列数据作为行标签

In [42]:
# When a data file is loaded without row labels being specified, pandas
# adds integer row labels starting from 0.
scientists=pd.read_csv('./data(1)/scientists.csv')
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [44]:
# Use the values of the Name column as the row labels. The result is bound
# to a new name, scientists_df.
scientists_df=scientists.set_index('Name')
scientists_df

Unnamed: 0_level_0,Born,Died,Age,Occupation
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
William Gosset,1876-06-13,1937-10-16,61,Statistician
Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
Marie Curie,1867-11-07,1934-07-04,66,Chemist
Rachel Carson,1907-05-27,1964-04-14,56,Biologist
John Snow,1813-03-15,1858-06-16,45,Physician
Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


### 设置行标签之后，可以通过reset_index方法重置行标签

In [46]:
# Turn the Name index back into a regular column and restore the default
# integer row labels.
scientists_df.reset_index()

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


## 加载数据时，指定某列数据作为行标签

In [49]:
# When loading a data file, the index_col parameter selects a column to use
# as the row labels; it accepts either a column name or a column position.
# Here scientists.csv is loaded with the Name column as the row labels.
pd.read_csv('./data(1)/scientists.csv',index_col='Name')

Unnamed: 0_level_0,Born,Died,Age,Occupation
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
William Gosset,1876-06-13,1937-10-16,61,Statistician
Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
Marie Curie,1867-11-07,1934-07-04,66,Chemist
Rachel Carson,1907-05-27,1964-04-14,56,Biologist
John Snow,1813-03-15,1858-06-16,45,Physician
Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [50]:
# Same load, but the index column is given by position: column 0 is Name in
# this file, as the output shows.
pd.read_csv('./data(1)/scientists.csv',index_col=0)

Unnamed: 0_level_0,Born,Died,Age,Occupation
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
William Gosset,1876-06-13,1937-10-16,61,Statistician
Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
Marie Curie,1867-11-07,1934-07-04,66,Chemist
Rachel Carson,1907-05-27,1964-04-14,56,Biologist
John Snow,1813-03-15,1858-06-16,45,Physician
Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


### 加载数据后，修改行标签和列标签

In [52]:
# Load the scientists.csv dataset with the Name column as the row labels.
# NOTE: this rebinds `scientists`, which earlier cells loaded with the
# default integer index.
scientists=pd.read_csv('./data(1)/scientists.csv',index_col='Name')
scientists

Unnamed: 0_level_0,Born,Died,Age,Occupation
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
William Gosset,1876-06-13,1937-10-16,61,Statistician
Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
Marie Curie,1867-11-07,1934-07-04,66,Chemist
Rachel Carson,1907-05-27,1964-04-14,56,Biologist
John Snow,1813-03-15,1858-06-16,45,Physician
Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [54]:
# Change selected row and column labels with rename().
# Each mapping goes from the existing label to its replacement.
index_name = dict([
    ('Rosaline Franklin', 'rosaline franklin'),
    ('John Snow', 'john snow'),
])
columns_name = dict(Born='born', Age='age')
# Note: rename() returns a new DataFrame with the changed labels.
scientists.rename(columns=columns_name, index=index_name)

Unnamed: 0_level_0,born,Died,age,Occupation
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rosaline franklin,1920-07-25,1958-04-16,37,Chemist
William Gosset,1876-06-13,1937-10-16,61,Statistician
Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
Marie Curie,1867-11-07,1934-07-04,66,Chemist
Rachel Carson,1907-05-27,1964-04-14,56,Biologist
john snow,1813-03-15,1858-06-16,45,Physician
Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


### 使用df.index 和 df.columns 分别修改行标签和列标签

In [59]:
# Replace every row label by assigning a full list to .index.
scientists.index = [
    'rosaline franklin',
    'William Gosset',
    'Florence Nightingale',
    'Marie Curie',
    'Rachel Carson',
    'john snow',
    'Alan Turing',
    'Johann Gauss',
]
# Replace every column label by assigning a full list to .columns.
scientists.columns = [
    'born',
    'Died',
    'age',
    'Occupation',
]
scientists

Unnamed: 0,born,Died,age,Occupation
rosaline franklin,1920-07-25,1958-04-16,37,Chemist
William Gosset,1876-06-13,1937-10-16,61,Statistician
Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
Marie Curie,1867-11-07,1934-07-04,66,Chemist
Rachel Carson,1907-05-27,1964-04-14,56,Biologist
john snow,1813-03-15,1858-06-16,45,Physician
Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
Johann Gauss,1777-04-30,1855-02-23,77,Mathematician
