<a href="https://colab.research.google.com/github/jacksonsin/data_science_in_python/blob/main/Data_Cleaning_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Handling Missing Data With Python

Import Libraies

In [1]:
import pandas as pd
import numpy as np

To Detect Null Values

In [2]:
pd.isnull(np.nan)

True

In [3]:
pd.isnull(None)

True

In [4]:
pd.isna(np.nan)

True

In [5]:
pd.isna(None)

True

Opposite Will Return False

In [6]:
pd.notnull(None)

False

In [7]:
pd.notnull(np.nan)

False

In [8]:
# Any value you pass in as long it is not null will return True
pd.notnull(3)

True

Working With Series and DataFrame

In [9]:
pd.isnull(pd.Series([1,np.nan,7]))

0    False
1     True
2    False
dtype: bool

In [10]:
pd.notnull(pd.Series([1,np.nan,7]))

0     True
1    False
2     True
dtype: bool

In [11]:
pd.isnull(pd.DataFrame({
  'Column A' : [1,np.nan,7],  
  'Column B' : [np.nan,2,3],  
  'Column C' : [np.nan,2,np.nan],  
}))

Unnamed: 0,Column A,Column B,Column C
0,False,True,True
1,True,False,False
2,False,False,True


# **Pandas Operations With Missing Values**

In [12]:
pd.Series([1,np.nan,7]).count()

2

In [13]:
pd.Series([1,np.nan,7]).sum()

8.0

In [14]:
pd.Series([1,np.nan,7]).mean()

4.0

# **Filtering Missing Data**

In [15]:
s = pd.Series([1,2,3,np.nan,np.nan,4])

In [16]:
# Check any null values in the Series
pd.notnull(s)

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [17]:
# Count the number of rows with no null
pd.notnull(s).count()

6

In [18]:
# Return rows that are not null
s[pd.notnull(s)]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [19]:
s.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [20]:
s.notnull()

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [21]:
s[s.notnull()]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

# **Drop Null Values**

In [22]:
# Drop values on Series
s.dropna()

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [23]:
# Create Dataframe
df = pd.DataFrame({
  'Column A' : [1,np.nan,7],  
  'Column B' : [np.nan,2,3],  
  'Column C' : [np.nan,2,np.nan],  
})
df

Unnamed: 0,Column A,Column B,Column C
0,1.0,,
1,,2.0,2.0
2,7.0,3.0,


In [24]:
df.shape

(3, 3)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Column A  2 non-null      float64
 1   Column B  2 non-null      float64
 2   Column C  1 non-null      float64
dtypes: float64(3)
memory usage: 200.0 bytes


In [26]:
df.isnull()

Unnamed: 0,Column A,Column B,Column C
0,False,True,True
1,True,False,False
2,False,False,True


In [27]:
df.isnull().sum()

Column A    1
Column B    1
Column C    2
dtype: int64

In [28]:
# Drop values on DataFrame, any row with NA will be dropped
df.dropna()

Unnamed: 0,Column A,Column B,Column C


In [29]:
# Drop values on DataFrame, any row with all NA values will be dropped
df.dropna(how='all')

Unnamed: 0,Column A,Column B,Column C
0,1.0,,
1,,2.0,2.0
2,7.0,3.0,


In [30]:
# Drop values on DataFrame, any row with NA will be dropped
df.dropna(how='any')

Unnamed: 0,Column A,Column B,Column C


In [31]:
# Drop values on DataFrame, any row with 2 NAs will be dropped
df.dropna(thresh=2)

Unnamed: 0,Column A,Column B,Column C
1,,2.0,2.0
2,7.0,3.0,


In [32]:
# Drop values on DataFrame, any column with 2 NAs will be dropped
df.dropna(thresh=2, axis='columns')

Unnamed: 0,Column A,Column B
0,1.0,
1,,2.0
2,7.0,3.0


# **Fill Null Values**

In [33]:
# For series
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [34]:
# Fill null values with 0
s.fillna(0)

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    4.0
dtype: float64

In [35]:
# Fill null values with mean
s.fillna(s.mean())

0    1.0
1    2.0
2    3.0
3    2.5
4    2.5
5    4.0
dtype: float64

In [36]:
# Fill null values with previous row value
s.fillna(method='ffill')

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
dtype: float64

In [37]:
# Fill null values with next row value
s.fillna(method='bfill')

0    1.0
1    2.0
2    3.0
3    4.0
4    4.0
5    4.0
dtype: float64

In [38]:
# For dataframe
df

Unnamed: 0,Column A,Column B,Column C
0,1.0,,
1,,2.0,2.0
2,7.0,3.0,


In [39]:
# Fill NA for dataframe
df.fillna({'Column A' : 0, 'Column B' : 99, 'Column C' : df['Column C'].mean()})

Unnamed: 0,Column A,Column B,Column C
0,1.0,99.0,2.0
1,0.0,2.0,2.0
2,7.0,3.0,2.0


In [40]:
# Forward fill null values from left to right
df.fillna(method='ffill', axis=0)

Unnamed: 0,Column A,Column B,Column C
0,1.0,,
1,1.0,2.0,2.0
2,7.0,3.0,2.0


In [41]:
# Forward fill null values from top to bottom
df.fillna(method='ffill', axis=1)

Unnamed: 0,Column A,Column B,Column C
0,1.0,1.0,1.0
1,,2.0,2.0
2,7.0,3.0,3.0


# **Check for NAs**

In [42]:
s.dropna().count()

4

In [43]:
missing_values = len(s.dropna()) != len(s)
missing_values

True

# **More Pythonic Solution**

In [44]:
pd.Series([True,False,False]).any()

True

In [45]:
pd.Series([True,False,False]).all()

False

In [46]:
pd.Series([True,True,True]).all()

True

# **Finding Unique Values**

In [47]:
df = pd.DataFrame({
    'Sex' : ['M','F','F','D','?'],
    'Age' : [25,30,24,290,25]
})
df

Unnamed: 0,Sex,Age
0,M,25
1,F,30
2,F,24
3,D,290
4,?,25


In [48]:
# Return unique attributes
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [49]:
# Total count
df['Sex'].value_counts()

F    2
M    1
D    1
?    1
Name: Sex, dtype: int64

# **Replacing Categorical Values**

In [50]:
# Replace single value
df['Sex'].replace('D','F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [51]:
# Replace single column values
df['Sex'].replace({'D':'F','N':'M'})

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [52]:
# Replace multiple column values
df.replace({
    'Sex' : {'D':'F','N':'M'},
    'Age':{290:29}
    })

Unnamed: 0,Sex,Age
0,M,25
1,F,30
2,F,24
3,F,29
4,?,25


In [53]:
df[df['Age'] > 100]

Unnamed: 0,Sex,Age
3,D,290


In [54]:
# Divide conditional value by 10
df.loc[df['Age'] > 100, 'Age'] = df.loc[df['Age'] > 100, 'Age'] / 10
df

Unnamed: 0,Sex,Age
0,M,25.0
1,F,30.0
2,F,24.0
3,D,29.0
4,?,25.0


# **Handling Duplicates**

In Pandas Series

In [55]:
ambassadors = pd.Series([
'France',
'United Kingdom',    
'United Kingdom',    
'Italy',    
'Germany',
'Germany',
'Germany'
], index=[
'Gerara Araud',
'Kim Darroch',
'Peter Westmacott',
'Armando Varricchio',
'Petter Wittig',
'Peter Ammon',
'Klaus Scharioth']
)
ambassadors

Gerara Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Petter Wittig                Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [56]:
# Flag any duplicated values
ambassadors.duplicated()

Gerara Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Petter Wittig         False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [57]:
# Flag duplicated values except the last duplicate
ambassadors.duplicated(keep='last')

Gerara Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Petter Wittig          True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [58]:
# Flag all duplicated values
ambassadors.duplicated(keep='last')

Gerara Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Petter Wittig          True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [59]:
# Drop duplicates
ambassadors.drop_duplicates(keep=False)

Gerara Araud          France
Armando Varricchio     Italy
dtype: object

In Pandas DataFrame

In [60]:
players = pd.DataFrame({
    'Name' : [
              'Kobe Bryant',
              'LeBron James',
              'Kobe Bryant',
              'Carmelo Anthony',
              'Kobe Bryant'
    ],
    'Pos' : [
             'SG',
             'SF',
             'SG',
             'SF',
             'SF'
    ]
})
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [61]:
# Flag any duplicated values
players.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [62]:
# Flag any duplicated values by subset
players.duplicated(subset=['Name'])

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [63]:
# Flag any duplicated values except last duplicate by subset 
players.duplicated(subset=['Name'], keep='last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [64]:
# Drop duplicates
players.drop_duplicates()

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [65]:
# Drop duplicates by subset
players.drop_duplicates(subset=['Name'])

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF


In [66]:
# Drop duplicates except last duplicate by subset
players.drop_duplicates(subset=['Name'], keep='last')

Unnamed: 0,Name,Pos
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


# **Text Handling**

In [67]:
df = pd.DataFrame({
    'Data' : [
              '1983_M_US _1',
              '1990?_M_UK_1',
              '1992_F_US_2',
              '1970?_M_  IT_1',
              '1985_F_I  T_2'
    ]
})
df

Unnamed: 0,Data
0,1983_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [68]:
# Split tex by undercore
df['Data'].str.split('_')

0      [1983, M, US , 1]
1      [1990?, M, UK, 1]
2       [1992, F, US, 2]
3    [1970?, M,   IT, 1]
4     [1985, F, I  T, 2]
Name: Data, dtype: object

In [69]:
# Split text by undercore
df['Data'].str.split('_', expand=True)

Unnamed: 0,0,1,2,3
0,1983,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [70]:
df = df['Data'].str.split('_', expand=True)
df.columns = ['Year','Sex','Country','No Children']
df

Unnamed: 0,Year,Sex,Country,No Children
0,1983,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [71]:
# Check if contains ?
df['Year'].str.contains('\?')

0    False
1     True
2    False
3     True
4    False
Name: Year, dtype: bool

In [72]:
# Check if contains U
df['Country'].str.contains('U')

0     True
1     True
2     True
3    False
4    False
Name: Country, dtype: bool

In [73]:
# Strip white spaces
df['Country'].str.strip()

0      US
1      UK
2      US
3      IT
4    I  T
Name: Country, dtype: object

In [74]:
# Replace white spaces
df['Country'].str.replace(' ','')

0    US
1    UK
2    US
3    IT
4    IT
Name: Country, dtype: object

In [75]:
# Date Processing by Year
df['Year'].str.replace(r'(?P<year>\d{4})\?', lambda m: m.group('year'))

  


0    1983
1    1990
2    1992
3    1970
4    1985
Name: Year, dtype: object