# Pandas安装

In [1]:
# pandas安装
# pip show pandas

In [2]:
# 查看版本
import pandas as pd

pd.__version__

'1.5.3'

In [3]:
# 实例
mydataset = {"sites": ["a", "b", "c"], "nums": [1, 2, 3]}
myvar = pd.DataFrame(mydataset)
print(myvar)

  sites  nums
0     a     1
1     b     2
2     c     3


# Pandas数据结构-Series

Pandas Series类似变革中的一个列（column），类似于一维数组可以保存任何数据类型。

Series由索引（index)和列组成

参数说明：

+ data: 一组数据（ndarray）类型
+ index: 数据索引标签，如果不指定，默认从0开始
+ dtype: 数据类型，默认会自己判断
+ name: 设置名称
+ copy: 拷贝数据，默认为False


In [4]:
# 下面的代码可以看到第一列为索引，默认为从0开始；第二列为数据；下面是数据类型
a = [1, 2, 3]
myvar = pd.Series(a)
print(myvar)
print(myvar[1])

0    1
1    2
2    3
dtype: int64
2


In [5]:
# 指定索引
a = [1, 2, 3]
b = ["a", "b", "c"]
myvar = pd.Series(a, index=b)
print(myvar)
print(myvar["c"])

a    1
b    2
c    3
dtype: int64
3


In [6]:
# 使用key/value，类似字典创建

dict1 = {1: "Q", 2: "W", 3: "E"}
myvar = pd.Series(dict1)
print(myvar)

# 输出指定数据
myvar = pd.Series(dict1, index=[1, 3])
print(myvar)

1    Q
2    W
3    E
dtype: object
1    Q
3    E
dtype: object


# Pandas数据结构-DataFrame

DataFrame是一个表格型的数据结构，它含有一组有序的数据，每列可以是不同的值类型（数值，字符串，布尔型值）。DateFrame既有行索引也有列索引，它可以被看作由Series组成的序列（共用同一个索引）

[![image.png](https://i.postimg.cc/jdMkDmyv/image.png)](https://postimg.cc/QHWqP64W)

DateFrame构造方法如下：

pandas.DateFrame(data,index,colums,dtype,copy)

参数说明：

+ data: 一组数据（ndarray）类型
+ index: 索引值，可以称为行标签
+ columns: 列标签，默认为RangeIndex(0,1,2...)
+ dtype: 数据类型，默认会自己判断
+ copy: 拷贝数据，默认为False

PandasDataFrame是一个二维的数组结构，类似二维数组。


In [7]:
# 使用列表创建
import pandas as pd

data = [["Google", 10], ["Runoob", 12], ["Wiki", 30]]
df = pd.DataFrame(data, columns=["Site", "Age"], dtype=float)
print(df)

     Site   Age
0  Google  10.0
1  Runoob  12.0
2    Wiki  30.0


  df = pd.DataFrame(data, columns=["Site", "Age"], dtype=float)


In [8]:
# 使用ndarrays创建
data = {"site": ["Google", "Runboo", "Wiki"], "Age": [12.3, 23.1, 34.2]}
df = pd.DataFrame(data)
print(df)

     site   Age
0  Google  12.3
1  Runboo  23.1
2    Wiki  34.2


In [9]:
# 使用字典创建(没有对应的数据位Nan)
data = [{"a": 1, "b": 2}, {"a": 11, "b": 12, "c": 13}]
df = pd.DataFrame(data)
print(df)

    a   b     c
0   1   2   NaN
1  11  12  13.0


In [10]:
# 返回指定行的数据
data = {"calories": [12, 23, 45, 67], "duration": ["q", "w", "e", "r"]}

df = pd.DataFrame(data)
print(df)

   calories duration
0        12        q
1        23        w
2        45        e
3        67        r


In [11]:
# 返回结果其实就是一个Pandas Series数据
print(df.loc[0])

calories    12
duration     q
Name: 0, dtype: object


In [12]:
print(df.loc[2])

calories    45
duration     e
Name: 2, dtype: object


In [13]:
# 也可以返回多行数据
print(df.loc[[0, 2]])

   calories duration
0        12        q
2        45        e


In [14]:
# 指定索引指
df = pd.DataFrame(data, index=["day1", "day2", "day3", "day4"])
print(df)

      calories duration
day1        12        q
day2        23        w
day3        45        e
day4        67        r


In [15]:
# 返回指定索引对应的某一行
print(df.loc["day3"])

calories    45
duration     e
Name: day3, dtype: object


# Pandas CSV文件

CSV(Comma-Separated Values，逗号分割值，有时也称字符分割值，因为分割字符也可以不是逗号)，其文件以纯文本形式存储表格数据（数字和文本）。

CSV是一种通用的、相对简单地文件格式、被用户、商业和科学广泛应用

In [16]:
import pandas as pd

df = pd.read_csv("./file/nba.csv")
print(df.to_string())

                         Name                    Team  Number Position   Age Height  Weight                College      Salary
0               Avery Bradley          Boston Celtics     0.0       PG  25.0    6-2   180.0                  Texas   7730337.0
1                 Jae Crowder          Boston Celtics    99.0       SF  25.0    6-6   235.0              Marquette   6796117.0
2                John Holland          Boston Celtics    30.0       SG  27.0    6-5   205.0      Boston University         NaN
3                 R.J. Hunter          Boston Celtics    28.0       SG  22.0    6-5   185.0          Georgia State   1148640.0
4               Jonas Jerebko          Boston Celtics     8.0       PF  29.0   6-10   231.0                    NaN   5000000.0
5                Amir Johnson          Boston Celtics    90.0       PF  29.0    6-9   240.0                    NaN  12000000.0
6               Jordan Mickey          Boston Celtics    55.0       PF  21.0    6-8   235.0                    

## to_csv()方法将DataFrame存储为csv文件

In [17]:
name = ["Google", "Baidu", "Bing"]
site = ["www.google.com", "www.baidu.com", "www.bing.com"]
age = [12, 34, 56]

# 字典
dict1 = {"name": name, "site": site, "age": age}

df = pd.DataFrame(dict1)
print(df)

# 保存dataframe
df.to_csv("./file/site.csv")

     name            site  age
0  Google  www.google.com   12
1   Baidu   www.baidu.com   34
2    Bing    www.bing.com   56


## 数据处理

### head()

head(n)方法用于读取前面的n行，如果不填参数n，默认返回5行

In [18]:
import pandas as pd

df = pd.read_csv("./file/nba.csv")
print(df.head())

            Name            Team  Number Position   Age Height  Weight  \
0  Avery Bradley  Boston Celtics     0.0       PG  25.0    6-2   180.0   
1    Jae Crowder  Boston Celtics    99.0       SF  25.0    6-6   235.0   
2   John Holland  Boston Celtics    30.0       SG  27.0    6-5   205.0   
3    R.J. Hunter  Boston Celtics    28.0       SG  22.0    6-5   185.0   
4  Jonas Jerebko  Boston Celtics     8.0       PF  29.0   6-10   231.0   

             College     Salary  
0              Texas  7730337.0  
1          Marquette  6796117.0  
2  Boston University        NaN  
3      Georgia State  1148640.0  
4                NaN  5000000.0  


In [19]:
print(df.head(10))

            Name            Team  Number Position   Age Height  Weight  \
0  Avery Bradley  Boston Celtics     0.0       PG  25.0    6-2   180.0   
1    Jae Crowder  Boston Celtics    99.0       SF  25.0    6-6   235.0   
2   John Holland  Boston Celtics    30.0       SG  27.0    6-5   205.0   
3    R.J. Hunter  Boston Celtics    28.0       SG  22.0    6-5   185.0   
4  Jonas Jerebko  Boston Celtics     8.0       PF  29.0   6-10   231.0   
5   Amir Johnson  Boston Celtics    90.0       PF  29.0    6-9   240.0   
6  Jordan Mickey  Boston Celtics    55.0       PF  21.0    6-8   235.0   
7   Kelly Olynyk  Boston Celtics    41.0        C  25.0    7-0   238.0   
8   Terry Rozier  Boston Celtics    12.0       PG  22.0    6-2   190.0   
9   Marcus Smart  Boston Celtics    36.0       PG  22.0    6-4   220.0   

             College      Salary  
0              Texas   7730337.0  
1          Marquette   6796117.0  
2  Boston University         NaN  
3      Georgia State   1148640.0  
4         

### tail()

tail(n) 读取尾部n行，如果不填，默认读取尾部5行，空行各个字段的值默认返回NaN

In [20]:
print(df.tail())

             Name       Team  Number Position   Age Height  Weight College  \
453  Shelvin Mack  Utah Jazz     8.0       PG  26.0    6-3   203.0  Butler   
454     Raul Neto  Utah Jazz    25.0       PG  24.0    6-1   179.0     NaN   
455  Tibor Pleiss  Utah Jazz    21.0        C  26.0    7-3   256.0     NaN   
456   Jeff Withey  Utah Jazz    24.0        C  26.0    7-0   231.0  Kansas   
457           NaN        NaN     NaN      NaN   NaN    NaN     NaN     NaN   

        Salary  
453  2433333.0  
454   900000.0  
455  2900000.0  
456   947276.0  
457        NaN  


In [21]:
print(df.tail(7))

              Name       Team  Number Position   Age Height  Weight   College  \
451  Chris Johnson  Utah Jazz    23.0       SF  26.0    6-6   206.0    Dayton   
452     Trey Lyles  Utah Jazz    41.0       PF  20.0   6-10   234.0  Kentucky   
453   Shelvin Mack  Utah Jazz     8.0       PG  26.0    6-3   203.0    Butler   
454      Raul Neto  Utah Jazz    25.0       PG  24.0    6-1   179.0       NaN   
455   Tibor Pleiss  Utah Jazz    21.0        C  26.0    7-3   256.0       NaN   
456    Jeff Withey  Utah Jazz    24.0        C  26.0    7-0   231.0    Kansas   
457            NaN        NaN     NaN      NaN   NaN    NaN     NaN       NaN   

        Salary  
451   981348.0  
452  2239800.0  
453  2433333.0  
454   900000.0  
455  2900000.0  
456   947276.0  
457        NaN  


### info()

info()返回表格的一些基本信息

In [22]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB
None


# Pandas Json

JSON(JavaScript Object Notation,JavaScript对象表示法)，是存储和交换文本信息的语法，类型XML。

In [23]:
import pandas as pd

df = pd.read_json("./file/sites.json")

# to_string()用于返回DataFrame类型的数据
print(df.to_string())

     id    name             url  likes
0  A001    菜鸟教程  www.runoob.com     61
1  A002  Google  www.google.com    124
2  A003      淘宝  www.taobao.com     45


In [24]:
# to_string()用于返回DataFrame类型的数据，也可以直接处理json类型的数据

data = [
    {"id": "A001", "name": "菜鸟教程", "url": "www.runoob.com", "likes": 61},
    {"id": "A002", "name": "Google", "url": "www.google.com", "likes": 124},
    {"id": "A003", "name": "淘宝", "url": "www.taobao.com", "likes": 45},
]
df = pd.DataFrame(data)
print(df)

     id    name             url  likes
0  A001    菜鸟教程  www.runoob.com     61
1  A002  Google  www.google.com    124
2  A003      淘宝  www.taobao.com     45


In [25]:
# JSON对象与Python字典具有相同的格式，可以直接将Python字典转化为DataFrame数据
s = {
    "col1": {"row1": 1, "row2": 2, "row3": 3},
    "col2": {"row1": "x", "row2": "y", "row33": "z"},
}

# 读取JSON转化为DataFrame
print(pd.DataFrame(s))

       col1 col2
row1    1.0    x
row2    2.0    y
row3    3.0  NaN
row33   NaN    z


In [26]:
# 从URL中读取JSON数据
url = "https://static.runoob.com/download/sites.json"
print(pd.read_json(url))

     id    name             url  likes
0  A001    菜鸟教程  www.runoob.com     61
1  A002  Google  www.google.com    124
2  A003      淘宝  www.taobao.com     45


## 内嵌的JSON数据

In [27]:
df = pd.read_json("./file/nested_list.json")
print(df)

          school_name   class  \
0  ABC primary school  Year 1   
1  ABC primary school  Year 1   
2  ABC primary school  Year 1   

                                            students  
0  {'id': 'A001', 'name': 'Tom', 'math': 60, 'phy...  
1  {'id': 'A002', 'name': 'James', 'math': 89, 'p...  
2  {'id': 'A003', 'name': 'Jenny', 'math': 79, 'p...  


In [28]:
# 使用json_normalize()方法将内嵌的数据完整的解析处理
import json

with open("./file/nested_list.json") as f:
    data = json.loads(f.read())

# 展平数据
df = pd.json_normalize(data, record_path=["students"])
print(df)

# data = json.loads(f.read())使用Python JSON模块载入数据
# json_normalize()使用了参数recoard_path并设置为['students']用于展开内嵌的JSON数据students,

     id   name  math  physics  chemistry
0  A001    Tom    60       66         61
1  A002  James    89       76         51
2  A003  Jenny    79       90         78


In [29]:
# # json_normalize()使用了参数recoard_path并设置为['students']用于展开内嵌的JSON数据students，显示结果还没有包含school_name和class元素，如果需要展示出来，可以使用meta参数来显示这些元数据

df = pd.json_normalize(data, record_path=["students"], meta=["school_name", "class"])
print(df)

     id   name  math  physics  chemistry         school_name   class
0  A001    Tom    60       66         61  ABC primary school  Year 1
1  A002  James    89       76         51  ABC primary school  Year 1
2  A003  Jenny    79       90         78  ABC primary school  Year 1


## 复杂JSON数据

In [30]:
import json

import pandas as pd

# 使用Python Json模块载入数据
with open("./file/nested_mix.json", "r") as f:
    data = json.loads(f.read())

df = pd.json_normalize(
    data,
    record_path=["students"],
    meta=["class", ["info", "president"], ["info", "contacts", "tel"]],
)

print(df)

     id   name  math  physics  chemistry   class info.president  \
0  A001    Tom    60       66         61  Year 1    John Kasich   
1  A002  James    89       76         51  Year 1    John Kasich   
2  A003  Jenny    79       90         78  Year 1    John Kasich   

  info.contacts.tel  
0         123456789  
1         123456789  
2         123456789  


In [31]:
# 读取内嵌数据中的一组数据
# 使用glom模块来处理数据嵌套，glom模块允许我们使用 . 来访问内嵌对象的属性（使用pip install glom安装）

In [32]:
import pandas as pd
from glom import glom

df = pd.read_json("./file/nested_deep.json")
data = df["students"].apply(lambda row: glom(row, "grade.math"))
print(data)

0    60
1    89
2    79
Name: students, dtype: int64


In [33]:
with open("./file/nested_deep.json") as f:
    data = json.loads(f.read())
print(pd.json_normalize(data, record_path=["students"], meta=["school_name", "class"]))

     id   name  grade.math  grade.physics  grade.chemistry  \
0  A001    Tom          60             66               61   
1  A002  James          89             76               51   
2  A003  Jenny          79             90               78   

            school_name   class  
0  local primary school  Year 1  
1  local primary school  Year 1  
2  local primary school  Year 1  


# Pandas数据清洗

数据清洗是针对一些没用的数据进行处理的过程，很多数据集存在数据缺失、格式错误、错误数据或重复数据的情况，如果要使数据分析更加准确，就需要处理

常见的空数据有：

+ n/a
+ NA
+ -
+ na

## Pandas清洗空值

如果要删除包含空字段的行，可以使用dropna()方法

DataFrame.dropna(axix=0,how='any',thresh=None,subset=None,inplace=False)

+ axis：默认为0，表示逢空值剔除整行，如果axis=1表示逢空值去掉整列
+ how：默认为'any'，如果一行（或一列）里任何一个数据有出现NA就去掉整行，如果设置how='all'一行（或一列）都是NA才去掉这整行（整列）
+ thresh：设置需要多少非空值的数据才可以保留下来
+ subset：设置想要检查的列，如果是多个列，可以使用列名的list作为参数
+ inplace：如果设置为True，将计算得到的值直接覆盖之前的值并返回为None，修改的是源数据

In [34]:
# 通过isnull()判断各个单元格是否为空
import pandas as pd

df = pd.read_csv("./file/property-data.csv")

print(df.to_string())

           PID  ST_NUM     ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0  100001000.0   104.0      PUTNAM            Y            3        1  1000
1  100002000.0   197.0   LEXINGTON            N            3      1.5    --
2  100003000.0     NaN   LEXINGTON            N          NaN        1   850
3  100004000.0   201.0    BERKELEY           12            1      NaN   700
4          NaN   203.0    BERKELEY            Y            3        2  1600
5  100006000.0   207.0    BERKELEY            Y          NaN        1   800
6  100007000.0     NaN  WASHINGTON          NaN            2   HURLEY   950
7  100008000.0   213.0     TREMONT            Y            1        1   NaN
8  100009000.0   215.0     TREMONT            Y           na        2  1800


In [35]:
print(df["NUM_BEDROOMS"])

0      3
1      3
2    NaN
3      1
4      3
5    NaN
6      2
7      1
8     na
Name: NUM_BEDROOMS, dtype: object


In [36]:
print(df["NUM_BEDROOMS"].isnull())

0    False
1    False
2     True
3    False
4    False
5     True
6    False
7    False
8    False
Name: NUM_BEDROOMS, dtype: bool


In [37]:
# 以上例子中我们看到Pandas把n/a和NA当做空数据，na不是空数据，不符合我们的要求，可以自定义指定空数据类型：

missing_values = ["n/a", "na", "--"]
df = pd.read_csv("./file/property-data.csv", na_values=missing_values)
print(df["NUM_BEDROOMS"])

0    3.0
1    3.0
2    NaN
3    1.0
4    3.0
5    NaN
6    2.0
7    1.0
8    NaN
Name: NUM_BEDROOMS, dtype: float64


In [38]:
print(df["NUM_BEDROOMS"].isnull())

0    False
1    False
2     True
3    False
4    False
5     True
6    False
7    False
8     True
Name: NUM_BEDROOMS, dtype: bool


In [39]:
# 删除包含了空数据的行(默认情况下，dropna()方法返回一个新的DateFrame，不会修改源数据；如果要修改源数据DataFrame，可以使用inplace=True参数)
import pandas as pd

df = pd.read_csv("./file/property-data.csv")

df1 = df.dropna()
# 修改源数据
# df1 = df.dropna(inplace=True)

print(df1.to_string())

           PID  ST_NUM    ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0  100001000.0   104.0     PUTNAM            Y            3        1  1000
1  100002000.0   197.0  LEXINGTON            N            3      1.5    --
8  100009000.0   215.0    TREMONT            Y           na        2  1800


In [40]:
# 移除ST_NUM列中字段值为空的行：
df = pd.read_csv("./file/property-data.csv")
df2 = df.dropna(subset="ST_NUM")
print(df2.to_string())

           PID  ST_NUM    ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0  100001000.0   104.0     PUTNAM            Y            3        1  1000
1  100002000.0   197.0  LEXINGTON            N            3      1.5    --
3  100004000.0   201.0   BERKELEY           12            1      NaN   700
4          NaN   203.0   BERKELEY            Y            3        2  1600
5  100006000.0   207.0   BERKELEY            Y          NaN        1   800
7  100008000.0   213.0    TREMONT            Y            1        1   NaN
8  100009000.0   215.0    TREMONT            Y           na        2  1800


In [41]:
# 使用fillna()方法来替换空字段

df = pd.read_csv("./file/property-data.csv")
print(df.to_string())
df3 = df.fillna(12345)
print(df3.to_string())

           PID  ST_NUM     ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0  100001000.0   104.0      PUTNAM            Y            3        1  1000
1  100002000.0   197.0   LEXINGTON            N            3      1.5    --
2  100003000.0     NaN   LEXINGTON            N          NaN        1   850
3  100004000.0   201.0    BERKELEY           12            1      NaN   700
4          NaN   203.0    BERKELEY            Y            3        2  1600
5  100006000.0   207.0    BERKELEY            Y          NaN        1   800
6  100007000.0     NaN  WASHINGTON          NaN            2   HURLEY   950
7  100008000.0   213.0     TREMONT            Y            1        1   NaN
8  100009000.0   215.0     TREMONT            Y           na        2  1800
           PID   ST_NUM     ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH  SQ_FT
0  100001000.0    104.0      PUTNAM            Y            3        1   1000
1  100002000.0    197.0   LEXINGTON            N            3      1.5     --
2  100

In [42]:
# 指定某一列来替换数据
df = pd.read_csv("./file/property-data.csv")
df4 = df["PID"].fillna(654321)
print(df4.to_string())

0    100001000.0
1    100002000.0
2    100003000.0
3    100004000.0
4       654321.0
5    100006000.0
6    100007000.0
7    100008000.0
8    100009000.0


## 数据替换

替换空单元格的常用方法是计算列的均值、中位数或众数

Pandas使用mean()、median()和mode()方法计算列的均值（所有值加起来的平均值）、中位数（排序后排在中间的数）和众数（出现频率最高的数）

### mean()方法计算列的均值并替换空单元格

In [43]:
import pandas as pd

df = pd.read_csv("./file/property-data.csv")
print(df.to_string())

           PID  ST_NUM     ST_NAME OWN_OCCUPIED NUM_BEDROOMS NUM_BATH SQ_FT
0  100001000.0   104.0      PUTNAM            Y            3        1  1000
1  100002000.0   197.0   LEXINGTON            N            3      1.5    --
2  100003000.0     NaN   LEXINGTON            N          NaN        1   850
3  100004000.0   201.0    BERKELEY           12            1      NaN   700
4          NaN   203.0    BERKELEY            Y            3        2  1600
5  100006000.0   207.0    BERKELEY            Y          NaN        1   800
6  100007000.0     NaN  WASHINGTON          NaN            2   HURLEY   950
7  100008000.0   213.0     TREMONT            Y            1        1   NaN
8  100009000.0   215.0     TREMONT            Y           na        2  1800


In [44]:
x = df["ST_NUM"].mean()
df1 = df["ST_NUM"].fillna(x)
print(df1.to_string())

0    104.000000
1    197.000000
2    191.428571
3    201.000000
4    203.000000
5    207.000000
6    191.428571
7    213.000000
8    215.000000


### mode()方法计算列的众数并替换空单元格

In [45]:
x = df["ST_NUM"].mode()
df1 = df["ST_NUM"].fillna(x)
print(df1.to_string())

0    104.0
1    197.0
2    201.0
3    201.0
4    203.0
5    207.0
6    215.0
7    213.0
8    215.0


## Pandas清洗格式错误数据

In [46]:
# 格式化日期
import pandas as pd

# 第三个日期格式错误
data = {"Date": ["2020/12/01", "2021/11/12", "20311026"], "duration": [12, 34, 56]}

df = pd.DataFrame(data, index=["day1", "day2", "day3"])
print(df.to_string())
df["Date"] = pd.to_datetime(df["Date"])
print(df.to_string())

            Date  duration
day1  2020/12/01        12
day2  2021/11/12        34
day3    20311026        56
           Date  duration
day1 2020-12-01        12
day2 2021-11-12        34
day3 2031-10-26        56


## Pandas清洗错误数据

In [47]:
# 对错误的数据进行替换和移除，以下会替换错误年龄的数据
import pandas as pd

person = {
    "name": ["Google", "Baidu", "Amzon", "QQ", "WeChat"],
    "age": [122, 34, 23, 123, 567],
}

df = pd.DataFrame(person)
# 修改数据
df.loc[2, "age"] = 30
print(df)

mean = df["age"].mean()
for i in df.index:
    if df.loc[i, "age"] > 100:
        df.loc[i, "age"] = mean
print(df)

     name  age
0  Google  122
1   Baidu   34
2   Amzon   30
3      QQ  123
4  WeChat  567
     name    age
0  Google  175.2
1   Baidu   34.0
2   Amzon   30.0
3      QQ  175.2
4  WeChat  175.2


In [48]:
# 将错误数据行删除
import pandas as pd

person = {
    "name": ["Google", "Baidu", "Amzon", "QQ", "WeChat"],
    "age": [122, 34, 23, 123, 567],
}

df = pd.DataFrame(person)

for i in df.index:
    if df.loc[i, "age"] > 100:
        df.drop(i, inplace=True)
print(df)

    name  age
1  Baidu   34
2  Amzon   23


## Pandas清洗重复数据

如果要清洗重复数据，可以使用duplicated()和drop_duplicates()方法

如果对应的数据是重复的，duplicated()会返回True,否则会返回false

In [49]:
import pandas as pd

person = {
    "name": ["Google", "Baidu", "Amzon", "QQ", "WeChat"],
    "age": [122, 34, 23, 123, 567],
}

df = pd.DataFrame(person)

print(df.to_string())
print()
# 查看是否有重复数据
print(df.duplicated())

     name  age
0  Google  122
1   Baidu   34
2   Amzon   23
3      QQ  123
4  WeChat  567

0    False
1    False
2    False
3    False
4    False
dtype: bool


In [50]:
person = {
    "name": ["Google", "QQ", "Amzon", "QQ", "WeChat"],
    "age": [122, 34, 23, 34, 567],
}

df = pd.DataFrame(person)

print(df.to_string())
print()
# 删除重复数据
print(df.drop_duplicates())

     name  age
0  Google  122
1      QQ   34
2   Amzon   23
3      QQ   34
4  WeChat  567

     name  age
0  Google  122
1      QQ   34
2   Amzon   23
4  WeChat  567
