In [8]:
import pandas as pd
import numpy as np
print(pd.__version__, np.__version__)

2.0.3 1.24.3


In [9]:
df = pd.read_csv('https://github.com/prasertcbs/basic-dataset/raw/master/msleep.csv')
df = df.sample(20, random_state=123)
df

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
69,Arctic ground squirrel,Spermophilus,herbi,Rodentia,lc,16.6,,,7.4,0.0057,0.92
75,Eastern american chipmunk,Tamias,herbi,Rodentia,,15.8,,,8.2,,0.112
4,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667,20.0,0.423,600.0
78,Tree shrew,Tupaia,omni,Scandentia,,8.9,2.6,0.233333,15.1,0.0025,0.104
29,Giraffe,Giraffa,herbi,Artiodactyla,cd,1.9,0.4,,22.1,,899.995
62,Rock hyrax,Procavia,,Hyracoidea,lc,5.4,0.5,,18.6,0.021,3.6
24,European hedgehog,Erinaceus,omni,Erinaceomorpha,lc,10.1,3.5,0.283333,13.9,0.0035,0.77
37,Macaque,Macaca,omni,Primates,,10.1,1.2,0.75,13.9,0.179,6.8
76,Brazilian tapir,Tapirus,herbi,Perissodactyla,vu,4.4,1.0,0.9,19.6,0.169,207.501
31,Gray seal,Haliochoerus,carni,Carnivora,lc,6.2,1.5,,17.8,0.325,85.0


In [10]:
df.info() # any column that is not 20 is the column that have missing value

<class 'pandas.core.frame.DataFrame'>
Index: 20 entries, 69 to 81
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          20 non-null     object 
 1   genus         20 non-null     object 
 2   vore          18 non-null     object 
 3   order         20 non-null     object 
 4   conservation  14 non-null     object 
 5   sleep_total   20 non-null     float64
 6   sleep_rem     14 non-null     float64
 7   sleep_cycle   9 non-null      float64
 8   awake         20 non-null     float64
 9   brainwt       16 non-null     float64
 10  bodywt        20 non-null     float64
dtypes: float64(6), object(5)
memory usage: 1.9+ KB


## check NA in 'vore' column

In [11]:
df[df.vore.isna()]

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt
62,Rock hyrax,Procavia,,Hyracoidea,lc,5.4,0.5,,18.6,0.021,3.6
72,Musk shrew,Suncus,,Soricomorpha,,12.8,2.0,0.183333,11.2,0.00033,0.048


# sklearn: SimpleImputer

In [12]:
# we can also use pd.fillna but imputer will provide wider range of data type not only df
from sklearn.impute import SimpleImputer

In [13]:
imp = SimpleImputer(strategy='most_frequent') # แทนที่ vore ที่หายไปด้วย vore ที่มีอยู่มากที่สุดแทน
imp.fit_transform(df[['vore']]) # enter df [[]] as the trasform could manage the vore

array([['herbi'],
       ['herbi'],
       ['herbi'],
       ['omni'],
       ['herbi'],
       ['herbi'],
       ['omni'],
       ['omni'],
       ['herbi'],
       ['carni'],
       ['carni'],
       ['carni'],
       ['herbi'],
       ['herbi'],
       ['carni'],
       ['herbi'],
       ['herbi'],
       ['omni'],
       ['herbi'],
       ['carni']], dtype=object)

In [14]:
#  instead of just type another array we concat it as a column
imp = SimpleImputer(strategy='most_frequent')
df['vore2'] = imp.fit_transform(df[['vore']]).tolist() # sometime it couldn't work so put .tolist() to transform 
# np.array to list then assign to df
df[df.vore.isna()][['name','vore','vore2']]

Unnamed: 0,name,vore,vore2
62,Rock hyrax,,[herbi]
72,Musk shrew,,[herbi]


In [15]:
imp2 = SimpleImputer(strategy='constant', fill_value='omni')
df['vore3']=imp2.fit_transform(df[['vore']]).tolist()
df

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt,vore2,vore3
69,Arctic ground squirrel,Spermophilus,herbi,Rodentia,lc,16.6,,,7.4,0.0057,0.92,[herbi],[herbi]
75,Eastern american chipmunk,Tamias,herbi,Rodentia,,15.8,,,8.2,,0.112,[herbi],[herbi]
4,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667,20.0,0.423,600.0,[herbi],[herbi]
78,Tree shrew,Tupaia,omni,Scandentia,,8.9,2.6,0.233333,15.1,0.0025,0.104,[omni],[omni]
29,Giraffe,Giraffa,herbi,Artiodactyla,cd,1.9,0.4,,22.1,,899.995,[herbi],[herbi]
62,Rock hyrax,Procavia,,Hyracoidea,lc,5.4,0.5,,18.6,0.021,3.6,[herbi],[omni]
24,European hedgehog,Erinaceus,omni,Erinaceomorpha,lc,10.1,3.5,0.283333,13.9,0.0035,0.77,[omni],[omni]
37,Macaque,Macaca,omni,Primates,,10.1,1.2,0.75,13.9,0.179,6.8,[omni],[omni]
76,Brazilian tapir,Tapirus,herbi,Perissodactyla,vu,4.4,1.0,0.9,19.6,0.169,207.501,[herbi],[herbi]
31,Gray seal,Haliochoerus,carni,Carnivora,lc,6.2,1.5,,17.8,0.325,85.0,[carni],[carni]


In [17]:
df

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt,vore2,vore3
69,Arctic ground squirrel,Spermophilus,herbi,Rodentia,lc,16.6,,,7.4,0.0057,0.92,[herbi],[herbi]
75,Eastern american chipmunk,Tamias,herbi,Rodentia,,15.8,,,8.2,,0.112,[herbi],[herbi]
4,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667,20.0,0.423,600.0,[herbi],[herbi]
78,Tree shrew,Tupaia,omni,Scandentia,,8.9,2.6,0.233333,15.1,0.0025,0.104,[omni],[omni]
29,Giraffe,Giraffa,herbi,Artiodactyla,cd,1.9,0.4,,22.1,,899.995,[herbi],[herbi]
62,Rock hyrax,Procavia,,Hyracoidea,lc,5.4,0.5,,18.6,0.021,3.6,[herbi],[omni]
24,European hedgehog,Erinaceus,omni,Erinaceomorpha,lc,10.1,3.5,0.283333,13.9,0.0035,0.77,[omni],[omni]
37,Macaque,Macaca,omni,Primates,,10.1,1.2,0.75,13.9,0.179,6.8,[omni],[omni]
76,Brazilian tapir,Tapirus,herbi,Perissodactyla,vu,4.4,1.0,0.9,19.6,0.169,207.501,[herbi],[herbi]
31,Gray seal,Haliochoerus,carni,Carnivora,lc,6.2,1.5,,17.8,0.325,85.0,[carni],[carni]


In [19]:
# if NaN is suppost to be number
imp3 = SimpleImputer(strategy='mean')
# imp3 = SimpleImputer(strategy='median')
# imp3 = SimpleImputer(strategy='constant', fill_value=-99)
df['sleep_rem2'] = imp3.fit_transform(df[['sleep_rem']]).tolist()
df

Unnamed: 0,name,genus,vore,order,conservation,sleep_total,sleep_rem,sleep_cycle,awake,brainwt,bodywt,vore2,vore3,sleep_rem2
69,Arctic ground squirrel,Spermophilus,herbi,Rodentia,lc,16.6,,,7.4,0.0057,0.92,[herbi],[herbi],[2.0]
75,Eastern american chipmunk,Tamias,herbi,Rodentia,,15.8,,,8.2,,0.112,[herbi],[herbi],[2.0]
4,Cow,Bos,herbi,Artiodactyla,domesticated,4.0,0.7,0.666667,20.0,0.423,600.0,[herbi],[herbi],[0.7]
78,Tree shrew,Tupaia,omni,Scandentia,,8.9,2.6,0.233333,15.1,0.0025,0.104,[omni],[omni],[2.6]
29,Giraffe,Giraffa,herbi,Artiodactyla,cd,1.9,0.4,,22.1,,899.995,[herbi],[herbi],[0.4]
62,Rock hyrax,Procavia,,Hyracoidea,lc,5.4,0.5,,18.6,0.021,3.6,[herbi],[omni],[0.5]
24,European hedgehog,Erinaceus,omni,Erinaceomorpha,lc,10.1,3.5,0.283333,13.9,0.0035,0.77,[omni],[omni],[3.5]
37,Macaque,Macaca,omni,Primates,,10.1,1.2,0.75,13.9,0.179,6.8,[omni],[omni],[1.2]
76,Brazilian tapir,Tapirus,herbi,Perissodactyla,vu,4.4,1.0,0.9,19.6,0.169,207.501,[herbi],[herbi],[1.0]
31,Gray seal,Haliochoerus,carni,Carnivora,lc,6.2,1.5,,17.8,0.325,85.0,[carni],[carni],[1.5]


In [20]:
# or fill the missing value with NaN before assigning the value
imp_x = SimpleImputer(missing_value=-99, strategy='constant', fill_value=np.nan)

df['sleep_rem3'] = imp_x.fit_transform(df[['sleep_rem']]).tolist()
df

TypeError: SimpleImputer.__init__() got an unexpected keyword argument 'missing_value'