# Handling missing values

In [151]:
import pandas as pd

In [152]:
# Load the landslide records from the CSV file and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="./data/landslides.csv")
df.head(n=5)

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.0


## Check and handle missing values

In [153]:
# Identify the columns with missing values and determine total number of missing values.
#
# Treat the missing values: Drop them or Fill them.
#       Drop rows where you encounter missing values in the 'date' column
#       Fill missing values in the 'time' column with the literal value 'Not Known'
#       Fill missing values in the 'fatalities' column with median/mean of the column

#### Identify the columns with missing values and determine total number of missing values:

In [154]:
# info() gives a first-glance summary: the dtype and the non-null count of every column.
# Any column whose non-null count is lower than the number of entries contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1693 entries, 0 to 1692
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              1693 non-null   int64  
 1   date            1690 non-null   object 
 2   time            629 non-null    object 
 3   country_name    1693 non-null   object 
 4   state/province  1692 non-null   object 
 5   population      1693 non-null   int64  
 6   landslide_type  1692 non-null   object 
 7   trigger         1691 non-null   object 
 8   fatalities      1446 non-null   float64
dtypes: float64(1), int64(2), object(6)
memory usage: 119.2+ KB


In [155]:
# A more direct view: count the missing values in each column.
# isna() flags every missing cell; summing the flags gives one total per column.
missing_per_column = df.isna().sum()
missing_per_column

id                   0
date                 3
time              1064
country_name         0
state/province       1
population           0
landslide_type       1
trigger              2
fatalities         247
dtype: int64

In [156]:
# done!

#### Drop rows where you encounter missing values in the 'date' column:
##### (I'll solve it in four different ways)

##### 1st way:

In [157]:
# Boolean mask for boolean indexing: True where 'date' is missing.
pd.isna(df['date'])

0       False
1       False
2       False
3       False
4       False
        ...  
1688    False
1689    False
1690    False
1691    False
1692    False
Name: date, Length: 1693, dtype: bool

In [158]:
# Applying the mask selects exactly the rows that are going to be dropped.
df.loc[df['date'].isna()]

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
1482,7042,,Morning,United States,Kansas,857,Mudslide,Downpour,0.0
1497,7080,,13:00,United States,Ohio,4113,Landslide,Unknown,0.0
1526,7165,,12:15,United States,Indiana,2085,Landslide,Rain,0.0


In [159]:
# Inverting the mask with ~ (negation) keeps the rows that DO have a date;
# this is what the resulting dataframe will look like.
date_is_missing = df['date'].isna()
df[~date_is_missing]

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.0
...,...,...,...,...,...,...,...,...,...
1688,7535,12/7/15,,United States,North Carolina,1646,Rockfall,,0.0
1689,7537,2/22/16,0:00,United States,West Virginia,51400,Mudslide,Unknown,0.0
1690,7539,2/23/16,,United States,West Virginia,2406,Landslide,Rain,0.0
1691,7540,2/26/16,21:06,United States,West Virginia,1048,Rockfall,Unknown,0.0


In [160]:
# 1st way: keep only the rows whose 'date' is present by negating the isna() mask.
keep_rows = ~df['date'].isna()
df = df.loc[keep_rows]
df

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.0
...,...,...,...,...,...,...,...,...,...
1688,7535,12/7/15,,United States,North Carolina,1646,Rockfall,,0.0
1689,7537,2/22/16,0:00,United States,West Virginia,51400,Mudslide,Unknown,0.0
1690,7539,2/23/16,,United States,West Virginia,2406,Landslide,Rain,0.0
1691,7540,2/26/16,21:06,United States,West Virginia,1048,Rockfall,Unknown,0.0


##### 2nd way:

In [161]:
# NOTE:
# we could do it even more efficiently with .notna() method
# .notna() will be our mask (so we don't need the negation)

In [162]:
# df was overwritten by the previous approach, so reload the CSV
# to start again from the original, unfiltered data.
df = pd.read_csv(filepath_or_buffer="./data/landslides.csv")

In [163]:
# Boolean mask: True where 'date' is present, so no negation is needed.
pd.notna(df['date'])

0       True
1       True
2       True
3       True
4       True
        ... 
1688    True
1689    True
1690    True
1691    True
1692    True
Name: date, Length: 1693, dtype: bool

In [164]:
# 2nd way: select the rows whose 'date' is present directly with the notna() mask.
df = df.loc[df['date'].notna()]
df

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.0
...,...,...,...,...,...,...,...,...,...
1688,7535,12/7/15,,United States,North Carolina,1646,Rockfall,,0.0
1689,7537,2/22/16,0:00,United States,West Virginia,51400,Mudslide,Unknown,0.0
1690,7539,2/23/16,,United States,West Virginia,2406,Landslide,Rain,0.0
1691,7540,2/26/16,21:06,United States,West Virginia,1048,Rockfall,Unknown,0.0


In [165]:
# done!

##### 3rd way:

In [166]:
# NOTE:
# yet another option is to use: .drop(), .dropna() methods:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html

In [167]:
# df was overwritten by the previous approach, so reload the CSV
# to start again from the original, unfiltered data.
df = pd.read_csv(filepath_or_buffer="./data/landslides.csv")

In [168]:
# Index labels of the rows with a missing 'date' -- these are the labels that will be dropped.
rows_missing_date = df.loc[df['date'].isna()]
rows_missing_date.index

Index([1482, 1497, 1526], dtype='int64')

In [169]:
# 3rd way: drop() removes rows by index label, so pass it the labels of the
# rows whose 'date' is missing.
# The result is assigned back to df instead of using inplace=True: the outcome
# is the same, but the cell no longer mutates the frame behind the reader's back.
df = df.drop(index=df[df['date'].isna()].index)
df

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.0
...,...,...,...,...,...,...,...,...,...
1688,7535,12/7/15,,United States,North Carolina,1646,Rockfall,,0.0
1689,7537,2/22/16,0:00,United States,West Virginia,51400,Mudslide,Unknown,0.0
1690,7539,2/23/16,,United States,West Virginia,2406,Landslide,Rain,0.0
1691,7540,2/26/16,21:06,United States,West Virginia,1048,Rockfall,Unknown,0.0


##### 4th way:

In [170]:
# df was modified by the previous approach, so reload the CSV
# to start again from the original, unfiltered data.
df = pd.read_csv(filepath_or_buffer="./data/landslides.csv")

In [171]:
# 4th way (probably the easiest!): dropna(subset=[...]) drops only the rows that
# have a missing value in the listed columns.
# The result is assigned back to df instead of using inplace=True, and the
# subset is passed as a list, the form accepted by every pandas version.
df = df.dropna(subset=['date'])
df

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.0
...,...,...,...,...,...,...,...,...,...
1688,7535,12/7/15,,United States,North Carolina,1646,Rockfall,,0.0
1689,7537,2/22/16,0:00,United States,West Virginia,51400,Mudslide,Unknown,0.0
1690,7539,2/23/16,,United States,West Virginia,2406,Landslide,Rain,0.0
1691,7540,2/26/16,21:06,United States,West Virginia,1048,Rockfall,Unknown,0.0


In [172]:
# done!

#### Fill missing values in the 'time' column with the literal value 'Not Known'

In [173]:
# Inspect which values occur in the 'time' column and how often.
# NOTE: the output also lists a blank entry with count 1 -- it is not NaN,
# so isna()/fillna() will not treat it as missing.
time_column = df['time']
time_column.value_counts()

time
Night            97
Morning          87
Afternoon        58
Early morning    36
3:00:00          12
                 ..
1:13              1
9:40:00           1
11:50:00          1
                  1
21:06             1
Name: count, Length: 159, dtype: int64

In [174]:
# Number of missing values in the 'time' column (1064 NA values).
pd.isna(df['time']).sum()

1064

In [175]:
# Fill the missing 'time' entries with the literal 'Not Known'.
# The filled Series is assigned back to the column on purpose:
# df['time'].fillna(..., inplace=True) works on a temporary Series selected
# from the frame (chained assignment). pandas 2.x emits a FutureWarning for it,
# and with Copy-on-Write enabled it leaves df unchanged.
df['time'] = df['time'].fillna('Not Known')
df

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,Not Known,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,Not Known,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,Not Known,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,Not Known,United States,Kentucky,6903,Landslide,Downpour,0.0
...,...,...,...,...,...,...,...,...,...
1688,7535,12/7/15,Not Known,United States,North Carolina,1646,Rockfall,,0.0
1689,7537,2/22/16,0:00,United States,West Virginia,51400,Mudslide,Unknown,0.0
1690,7539,2/23/16,Not Known,United States,West Virginia,2406,Landslide,Rain,0.0
1691,7540,2/26/16,21:06,United States,West Virginia,1048,Rockfall,Unknown,0.0


#### Fill missing values in the 'fatalities' column with median/mean of the column

In [176]:
# Number of missing values in the 'fatalities' column (247 NaN values).
missing_fatalities = df['fatalities'].isna()
missing_fatalities.sum()

247

In [177]:
# Frequency of each value's occurrence; dropna=False keeps NaN as its own row.
# Only the 25 most frequent values are shown.
fatality_counts = df['fatalities'].value_counts(dropna=False)
fatality_counts.head(n=25)

fatalities
0.0     1182
NaN      247
1.0       49
2.0       47
3.0       40
4.0       19
5.0       17
6.0       12
8.0       10
7.0        9
9.0        6
13.0       6
10.0       6
11.0       6
23.0       4
14.0       4
12.0       3
17.0       2
25.0       2
20.0       2
92.0       1
71.0       1
48.0       1
32.0       1
16.0       1
Name: count, dtype: int64

In [178]:
# Descriptive statistics of 'fatalities' before filling.
fatalities = df['fatalities']
fatalities.describe()

count    1443.000000
mean        1.462231
std         9.193961
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max       280.000000
Name: fatalities, dtype: float64

In [179]:
# Mean of the 'fatalities' column; missing values are skipped (skipna=True is the default).
my_mean = df['fatalities'].mean(skipna=True)
my_mean

1.4622314622314623

In [180]:
# Fill the missing 'fatalities' values with the column mean stored in my_mean.
# The filled Series is assigned back to the column on purpose:
# df['fatalities'].fillna(..., inplace=True) works on a temporary Series
# selected from the frame (chained assignment). pandas 2.x emits a
# FutureWarning for it, and with Copy-on-Write enabled it leaves df unchanged.
# NOTE(review): describe() shows a heavily skewed column (median 0, max 280),
# so the median may be the more robust fill value -- the mean is kept here.
df['fatalities'] = df['fatalities'].fillna(my_mean)
df

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,1.462231
1,42,3/22/07,Not Known,United States,Ohio,17288,Landslide,Rain,1.462231
2,56,4/6/07,Not Known,United States,Pennsylvania,15930,Landslide,Rain,1.462231
3,59,4/14/07,Not Known,Canada,Quebec,42786,Riverbank collapse,Rain,1.462231
4,61,4/15/07,Not Known,United States,Kentucky,6903,Landslide,Downpour,0.000000
...,...,...,...,...,...,...,...,...,...
1688,7535,12/7/15,Not Known,United States,North Carolina,1646,Rockfall,,0.000000
1689,7537,2/22/16,0:00,United States,West Virginia,51400,Mudslide,Unknown,0.000000
1690,7539,2/23/16,Not Known,United States,West Virginia,2406,Landslide,Rain,0.000000
1691,7540,2/26/16,21:06,United States,West Virginia,1048,Rockfall,Unknown,0.000000


In [181]:
# Descriptive statistics after filling: the mean is unchanged, but the std is
# slightly smaller. The filled values sit exactly on the mean, so they raise
# the count (1443 -> 1690) without adding any spread. The 75% quantile also
# moves from 0 to the fill value.
fatalities_filled = df['fatalities']
fatalities_filled.describe()

count    1690.000000
mean        1.462231
std         8.495138
min         0.000000
25%         0.000000
50%         0.000000
75%         1.462231
max       280.000000
Name: fatalities, dtype: float64

In [182]:
# done.