In [1]:
import pandas as pd  # 导入pandas库
import numpy as np  # 导入numpy库
from sklearn.impute import SimpleImputer  # 导入sklearn中SimpleImputer库

In [2]:
# 生成一份数据
df = pd.DataFrame(np.random.randn(6,4),columns=['col1','col2','col3','col4'])
df.iloc[1:2,1] = np.nan
df.iloc[4,3] = np.nan
print(df)

       col1      col2      col3      col4
0 -0.475836 -0.579811 -0.157547 -0.050251
1 -0.582252       NaN -0.661201 -0.428627
2  0.073883 -0.661053  0.407365 -0.894520
3  0.490865 -0.227238 -1.190658 -0.272819
4 -0.954068 -0.783638 -0.860093       NaN
5  1.251227  1.378393 -0.287999 -0.134500


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
col1    6 non-null float64
col2    5 non-null float64
col3    6 non-null float64
col4    5 non-null float64
dtypes: float64(4)
memory usage: 272.0 bytes


In [4]:
nan_all = df.isnull()
print(nan_all)

    col1   col2   col3   col4
0  False  False  False  False
1  False   True  False  False
2  False  False  False  False
3  False  False  False  False
4  False  False  False   True
5  False  False  False  False


In [5]:
nan_col1 = df.isnull().any() #获得含有NA的列
nan_col2 = df.isnull().all() #获得所有的NA列
print(nan_col1)
print(nan_col2)

col1    False
col2     True
col3    False
col4     True
dtype: bool
col1    False
col2    False
col3    False
col4    False
dtype: bool


In [6]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
col1    6 non-null float64
col2    5 non-null float64
col3    6 non-null float64
col4    5 non-null float64
dtypes: float64(4)
memory usage: 272.0 bytes


In [7]:
df2 = df.dropna()  #丢弃所有的NA行记录
print(df2)
# 使用sklearn填充缺失值
nan_model = SimpleImputer(missing_values=np.nan,strategy='mean')
# 建立替换规则：将值NAN得缺失值以均值做替换`
nan_result = nan_model.fit_transform(df)  #应用模型规则
print(nan_result) #打印输出


       col1      col2      col3      col4
0 -0.475836 -0.579811 -0.157547 -0.050251
2  0.073883 -0.661053  0.407365 -0.894520
3  0.490865 -0.227238 -1.190658 -0.272819
5  1.251227  1.378393 -0.287999 -0.134500
[[-0.47583576 -0.57981079 -0.15754686 -0.05025126]
 [-0.58225184 -0.17466926 -0.6612015  -0.42862676]
 [ 0.07388316 -0.66105297  0.40736517 -0.89451951]
 [ 0.49086549 -0.22723756 -1.19065848 -0.27281945]
 [-0.95406803 -0.78363822 -0.86009265 -0.35614336]
 [ 1.25122744  1.37839322 -0.28799942 -0.13449981]]


In [10]:
nan_result_pd1 = df.fillna(method='backfill')  # 用后面的值替换缺失值
nan_result_pd2 = df.fillna(method='bfill', limit=1)  # 用后面的值替代缺失值,限制每列只能替代一个缺失值
nan_result_pd3 = df.fillna(method='pad')  # 用前面的值替换缺失值
nan_result_pd4 = df.fillna(0)  # 用0替换缺失值
nan_result_pd5 = df.fillna({'col2': 1.1, 'col4': 1.2})  # 用不同值替换不同列的缺失值
nan_result_pd6 = df.fillna(df.mean()['col2':'col4'])  # 用平均数代替,选择各自列的均值替换缺失值
# 打印输出

In [11]:
print(nan_result_pd1)  # 打印输出
print(nan_result_pd2)  # 打印输出
print(nan_result_pd3)  # 打印输出
print(nan_result_pd4)  # 打印输出
print(nan_result_pd5)  # 打印输出
print(nan_result_pd6)  # 打印输出

       col1      col2      col3      col4
0  1.160602  0.953626 -0.650106  0.499808
1 -0.537069 -0.149276 -0.008106  0.306799
2  0.515179 -0.149276  0.965432  0.281302
3 -0.670811 -0.920188  0.340141  1.147068
4  0.149484  0.321595  0.654945  1.725640
5 -1.201555 -0.322892 -0.645427  1.725640
       col1      col2      col3      col4
0  1.160602  0.953626 -0.650106  0.499808
1 -0.537069 -0.149276 -0.008106  0.306799
2  0.515179 -0.149276  0.965432  0.281302
3 -0.670811 -0.920188  0.340141  1.147068
4  0.149484  0.321595  0.654945  1.725640
5 -1.201555 -0.322892 -0.645427  1.725640
       col1      col2      col3      col4
0  1.160602  0.953626 -0.650106  0.499808
1 -0.537069  0.953626 -0.008106  0.306799
2  0.515179 -0.149276  0.965432  0.281302
3 -0.670811 -0.920188  0.340141  1.147068
4  0.149484  0.321595  0.654945  1.147068
5 -1.201555 -0.322892 -0.645427  1.725640
       col1      col2      col3      col4
0  1.160602  0.953626 -0.650106  0.499808
1 -0.537069  0.000000 -0.008106  0

In [13]:
df2 = pd.DataFrame({'col1':[1, 120, 3, 5, 2, 12, 13],'col2':[12, 17, 31, 53, 22, 32, 43]})
print(df2)


   col1  col2
0     1    12
1   120    17
2     3    31
3     5    53
4     2    22
5    12    32
6    13    43


In [14]:
# 通过Z-SCORE  方法判断异常值
df_zscore = df2.copy()  #复制一个来储存得分数据框
# 获得数据框的列名
cols = df2.columns
# 开始循环读取每一列
for col in cols:
    df_col = df2[col]
    z_score = (df_col - df_col.mean()) / df_col.std()  #计算每一列Z -score的得分
    df_zscore[col] = z_score.abs()>2.2  #是否大于2.2 是 true  否 False
print(df_zscore)

    col1   col2
0  False  False
1   True  False
2  False  False
3  False  False
4  False  False
5  False  False
6  False  False


In [15]:

# 删除异常的值
df_drop  = df2[df_zscore['col1'] == False]
print(df_drop)



   col1  col2
0     1    12
2     3    31
3     5    53
4     2    22
5    12    32
6    13    43


In [16]:

# 有这么一份数据
data1,data2,data3 ,data4 =  ['a', 3], ['b', 2], ['a', 3], ['c', 2]
df3 = pd.DataFrame([data1,data2,data3 ,data4],columns=['col1','col2'])
print(df3)



  col1  col2
0    a     3
1    b     2
2    a     3
3    c     2


In [17]:
chongfu = df3.duplicated()
print(chongfu)
print(df3)

0    False
1    False
2     True
3    False
dtype: bool
  col1  col2
0    a     3
1    b     2
2    a     3
3    c     2


In [19]:
# 删除重复值
print(df.drop_duplicates())
print(df.drop_duplicates(['col11']))
print(df.drop_duplicates(['col22']))
print(df.drop_duplicates(['col11','col22']))

       col1      col2      col3      col4
0  1.160602  0.953626 -0.650106  0.499808
1 -0.537069       NaN -0.008106  0.306799
2  0.515179 -0.149276  0.965432  0.281302
3 -0.670811 -0.920188  0.340141  1.147068
4  0.149484  0.321595  0.654945       NaN
5 -1.201555 -0.322892 -0.645427  1.725640


KeyError: Index(['col11'], dtype='object')

  col11  col22
0     a      3
1     b      2
2     a      3
3     c      2
0    False
1    False
2     True
3    False
dtype: bool
       col1      col2      col3      col4
0  1.160602  0.953626 -0.650106  0.499808
1 -0.537069       NaN -0.008106  0.306799
2  0.515179 -0.149276  0.965432  0.281302
3 -0.670811 -0.920188  0.340141  1.147068
4  0.149484  0.321595  0.654945       NaN
5 -1.201555 -0.322892 -0.645427  1.725640


KeyError: Index(['col11'], dtype='object')