In [1]:
import numpy as np
import pandas as pd

# 一 问题重现

In [2]:
df = pd.DataFrame([
        [1, 'Tom', 32.1, np.nan],
        [2, 'Mary', 65, np.nan],
        [3, 'Jane', 89, 48],
        [4, 'Jim', 63, np.nan],
        [5, 'Hem', 89, 43],
        [6, 'Bob', 89, 43],
    ], columns=['index', 'name', 'chn', 'math'])
df

Unnamed: 0,index,name,chn,math
0,1,Tom,32.1,
1,2,Mary,65.0,
2,3,Jane,89.0,48.0
3,4,Jim,63.0,
4,5,Hem,89.0,43.0
5,6,Bob,89.0,43.0


In [3]:
set(df['chn'])

{32.1, 63.0, 65.0, 89.0}

In [4]:
set(df['math'])

{nan, nan, nan, 43.0, 48.0}

In [5]:
df['math'].unique()

array([nan, 48., 43.])

# 二 问题定位

## 2.1 pd.Series改变了np.nan值

In [6]:
df.iloc[0, -1], df.iloc[1, -1], df.iloc[3, -1]

(nan, nan, nan)

In [7]:
np.isnan(df.iloc[0, -1]), np.isnan(df.iloc[1, -1]), np.isnan(df.iloc[3, -1])

(True, True, True)

## 2.2 np.nan是个坏家伙

In [8]:
np.nan == np.nan

False

In [9]:
print(np.nan == 0)
print(np.nan < 0)
print(np.nan != 0)

False
False
True


In [10]:
# 扩展

In [11]:
print(np.nan is np.nan)
print(np.isnan(np.nan))

True
True


## 2.3 set计算错误

In [12]:
set([np.nan, np.nan, np.nan])

{nan}

## 2.4 问题是什么

In [13]:
set(np.array([np.nan, np.nan, np.nan]))

{nan, nan, nan}

In [14]:
set(np.array([1, 2, 1, np.nan, np.nan, np.nan]))

{nan, 1.0, 2.0, nan, nan}

In [15]:
np.array([np.nan, np.nan])[0] is np.nan

False

In [16]:
type(np.nan)

float

In [17]:
type(np.array([np.nan, np.nan])[0])

numpy.float64

In [18]:
float(np.array([np.nan, np.nan])[0]) is np.nan

False

# 三 问题解析

In [19]:
tmp_arr = np.array([1, 2.0, 5.0, 1.0])
tmp_arr[1] == 2, type(tmp_arr[1]), set(tmp_arr)

(True, numpy.float64, {1.0, 2.0, 5.0})

In [20]:
tmp_arr = np.array(["tell me about", "tell me about", "tell me about"])
tmp_arr[1] == 2, type(tmp_arr[1]), set(tmp_arr)

(False, numpy.str_, {'tell me about'})

In [21]:
a = 123
a_in_arr = np.array([a, a])
id(123), id(a), id(a_in_arr[0]), id(a_in_arr[1])

(94333536607360, 94333536607360, 139976147981616, 139976147981616)

In [43]:
a = np.nan
a_in_arr = np.array([a, a])
id(123), id(a), id(a_in_arr[0]), id(a_in_arr[1])

(94333536607360, 139976742622960, 139976147784464, 139976147784464)

In [25]:
import math
# 同理：math.nan

In [26]:
# 提前说明：math.nan 和 np.nan不是一个东西
math.nan == np.nan, math.nan is np.nan

(False, False)

In [27]:
set([math.nan, math.nan]), set(np.array([math.nan, math.nan]))

({nan}, {nan, nan})

In [28]:
id(math.nan), id(np.array([math.nan, math.nan])[0])

(139976807434256, 139976147981392)

In [29]:
#问题根源： is 要地址相等

In [30]:
id(np.nan), id(np.array([np.nan])[0])

(139976742622960, 139976147983248)

In [31]:
# what's more

In [32]:
tmp_arr = np.array([None, None, 1])
type(tmp_arr[0]), set(tmp_arr)

(NoneType, {1, None})

In [33]:
tmp_arr

array([None, None, 1], dtype=object)

In [34]:
set(np.array([np.nan, None, np.nan, 3]))

{3, None, nan}