# 第4章　例外値への対応

## 4.1 欠損値の表現とその確認方法

In [1]:
import pandas as pd
import numpy as np

# Load the sample data with pandas' default parsing options.
# Empty fields become NaN, but the strings 'NAN', 'na' and 'Null' in column E
# are kept as ordinary text (see the displayed frame below).
df = pd.read_csv('sample_4_1.csv')

df

Unnamed: 0,A,B,C,D,E
0,11,12.0,13.0,14.0,15
1,21,,,,NAN
2,31,32.0,,,na
3,41,42.0,,44.0,Null
4,51,52.0,,54.0,55
5,61,62.0,,64.0,65
6,71,72.0,73.0,74.0,75


In [2]:
# Show each column's non-null count and dtype; column E is reported as
# object with 7 non-null entries because its placeholder strings are not NaN.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       7 non-null      int64  
 1   B       6 non-null      float64
 2   C       2 non-null      float64
 3   D       5 non-null      float64
 4   E       7 non-null      object 
dtypes: float64(3), int64(1), object(1)
memory usage: 408.0+ bytes


In [3]:
# Boolean mask of missing cells (True where the value is NaN).
df.isna()   # df.isnull() gives the same result

Unnamed: 0,A,B,C,D,E
0,False,False,False,False,False
1,False,True,True,True,False
2,False,False,True,True,False
3,False,False,True,False,False
4,False,False,True,False,False
5,False,False,True,False,False
6,False,False,False,False,False


In [4]:
# Count missing values per column (axis=0 sums down the rows).
df.isna().sum(axis=0)

A    0
B    1
C    5
D    2
E    0
dtype: int64

In [5]:
# Count missing values per row (axis=1 sums across the columns).
df.isna().sum(axis=1)

0    0
1    3
2    2
3    1
4    1
5    1
6    0
dtype: int64

In [6]:
# Turn the single placeholder string 'NAN' into a real NaN.
# replace() returns a new frame; df itself is left unchanged.
df_replace = df.replace(to_replace='NAN', value=np.nan)

df_replace

Unnamed: 0,A,B,C,D,E
0,11,12.0,13.0,14.0,15
1,21,,,,
2,31,32.0,,,na
3,41,42.0,,44.0,Null
4,51,52.0,,54.0,55
5,61,62.0,,64.0,65
6,71,72.0,73.0,74.0,75


In [7]:
# Replace several placeholder strings at once by passing a
# {old_value: new_value} mapping to replace().
df_replace = df.replace(to_replace={'NAN': np.nan, 'na': np.nan})

df_replace

Unnamed: 0,A,B,C,D,E
0,11,12.0,13.0,14.0,15
1,21,,,,
2,31,32.0,,,
3,41,42.0,,44.0,Null
4,51,52.0,,54.0,55
5,61,62.0,,64.0,65
6,71,72.0,73.0,74.0,75


In [8]:
# Same replacement as the dict form, expressed as two parallel lists:
# the i-th entry of the first list is replaced by the i-th entry of the second.
# replace() returns a new frame, so the result must be assigned; otherwise
# the df_replace displayed below would be whatever an earlier cell left behind.
df_replace = df.replace(
    ['NAN', 'na'],
    [np.nan, np.nan]
)

df_replace

Unnamed: 0,A,B,C,D,E
0,11,12.0,13.0,14.0,15
1,21,,,,
2,31,32.0,,,
3,41,42.0,,44.0,Null
4,51,52.0,,54.0,55
5,61,62.0,,64.0,65
6,71,72.0,73.0,74.0,75


In [9]:
# Replace every placeholder string that stands for a missing value in one call.
# The pattern is anchored (^...$) so that only cells consisting solely of
# 'NAN', 'na' or 'Null' are replaced; with regex=True an unanchored pattern
# would also hit any string cell that merely contains one of the tokens
# (for example 'banana' contains 'na').
df_replace = df.replace(r'^(?:NAN|na|Null)$', np.nan, regex=True)

df_replace

Unnamed: 0,A,B,C,D,E
0,11,12.0,13.0,14.0,15.0
1,21,,,,
2,31,32.0,,,
3,41,42.0,,44.0,
4,51,52.0,,54.0,55.0
5,61,62.0,,64.0,65.0
6,71,72.0,73.0,74.0,75.0


In [10]:
# Build a small all-integer frame: A holds 1..6 and B holds 7..12.
df = pd.DataFrame({
    'A': list(range(1, 7)),
    'B': list(range(7, 13)),
})

df

Unnamed: 0,A,B
0,1,7
1,2,8
2,3,9
3,4,10
4,5,11
5,6,12


In [11]:
# Baseline before any missing values are inserted: both columns are int64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       6 non-null      int64
 1   B       6 non-null      int64
dtypes: int64(2)
memory usage: 224.0 bytes


In [12]:
# Overwrite four cells of column B with four different "missing" sentinels.
# In the displayed result all four cells show up as missing, and B's values
# are rendered as floats (7.0, 12.0) instead of integers.
df.loc[1, 'B'] = None
df.loc[2, 'B'] = np.nan
df.loc[3, 'B'] = pd.NA
df.loc[4, 'B'] = float('nan')

df

Unnamed: 0,A,B
0,1,7.0
1,2,
2,3,
3,4,
4,5,
5,6,12.0


In [13]:
# Re-check the dtypes after inserting missing values: column B is now
# reported as float64 with only 2 non-null entries.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      int64  
 1   B       2 non-null      float64
dtypes: float64(1), int64(1)
memory usage: 224.0 bytes


In [14]:
# Inspect the cell where None was assigned (row 1, column B):
# np.isnan() prints True and the scalar's dtype prints as float64.
print(np.isnan(df.loc[1, 'B']))
print(df.loc[1, 'B'].dtype)

True
float64


In [15]:
# Cast every column to 32-bit floats; astype() returns a new frame,
# so df keeps its original dtypes.
df_float32 = df.astype(np.float32)

df_float32.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float32
 1   B       2 non-null      float32
dtypes: float32(2)
memory usage: 176.0 bytes


In [16]:
# Same check on the float32 copy: the missing cell is still NaN,
# and the scalar's dtype now prints as float32.
print(np.isnan(df_float32.loc[1, 'B']))
print(df_float32.loc[1, 'B'].dtype)

True
float32
