# Handling missing values - Exercises

# Preparations

In [1]:
import pandas as pd

# show up to 500 columns when a DataFrame is displayed, so wide tables are not truncated
pd.set_option("display.max_columns", 500)

# Exercise 1

1. Load the first sheet of the Excel file "wdi_wrong_types.xlsx" into a pandas DataFrame.
2. What is the column with the most missing values?
3. What are the rows with the most missing values in the respective cells?
4. How many cells are missing for each row on average?
5. Create a copy of the DataFrame, dropping all **rows** that have at least one missing value.
6. Create a copy of the DataFrame, dropping all **columns** that have at least one missing value.
7. Bonus: Create a copy of the DataFrame, dropping all **columns** that have at least 30% missing values.
8. Bonus: Create a copy of the DataFrame, dropping all **rows** that have missing values in the following columns: *countryname* and *SP_URB_TOTL_IN_ZS*.
9. Bonus: Considering **'unusual' missing values** like 'NOT AVAILABLE' (in *SP_DYN_LE00_IN*) and -99 (in *CM_MKT_LCAP_CD*), produce a DataFrame with all **rows** removed that have at least one missing value.


## 1. Load the first sheet of the Excel file "wdi_wrong_types.xlsx" into a pandas DataFrame.

In [2]:
# load the first sheet of the Excel workbook (read_excel reads sheet 0 by default)
df = pd.read_excel("../../data/raw/wdi_wrong_types.xlsx")
df.head()

Unnamed: 0,countryname,countrycode,year,pub_date,euro_area,incomegroup_and_id,CM_MKT_LCAP_CD,SP_DYN_LE00_IN,SP_URB_TOTL_IN_ZS,pub_date2,pub_date3,pub_date4
0,,ARM,2012,2013-10-04,,Lower middle income|3,-99,NOT AVAILABLE,,2013.10.4,2013/4/10,41013
1,Armenia,ARM,2011,2012-08-20,,Lower middle income|3,-99,74.34283,,2012.8.20,2012/20/8,200812
2,Austria,AUT,2013,2014-03-03,EURO,High income|1,-99,81.13659,65.884,2014.3.3,2014/3/3,30314
3,Austria,AUT,2012,2013-12-25,EURO,High income|1,-99,80.93659,,2013.12.25,2013/25/12,251213
4,Austria,AUT,2014,2015-03-22,EURO,High income|1,"9,679033e+10",81.49024,65.919,2015.3.22,2015/22/3,220315


In [3]:
# overview of the column dtypes and the non-null count of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   countryname         145 non-null    object        
 1   countrycode         154 non-null    object        
 2   year                154 non-null    int64         
 3   pub_date            154 non-null    datetime64[ns]
 4   euro_area           81 non-null     object        
 5   incomegroup_and_id  154 non-null    object        
 6   CM_MKT_LCAP_CD      154 non-null    object        
 7   SP_DYN_LE00_IN      154 non-null    object        
 8   SP_URB_TOTL_IN_ZS   118 non-null    float64       
 9   pub_date2           154 non-null    object        
 10  pub_date3           154 non-null    object        
 11  pub_date4           154 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(8)
memory usage: 14.6+ KB


## 2. What is the column with the most missing values?

**Note**: For now, we consider only the missing values directly identified as such by pandas. Specifically, we will later also treat values such as -99 in *CM_MKT_LCAP_CD* and 'NOT AVAILABLE' in *SP_DYN_LE00_IN* as missing!

In [4]:
# describe() summarises only the numeric and datetime columns; sorted ascending by "count",
# the column with the fewest non-missing values (i.e. the most missings) comes first
df.describe().T.sort_values("count")

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
SP_URB_TOTL_IN_ZS,118.0,73.425025,35.403,64.47725,73.8815,81.23475,100.0,13.982085
year,154.0,2013.227273,2011.0,2012.0,2013.0,2015.0,2016.0,1.750902
pub_date,154.0,2014-09-12 19:00:46.753246720,2012-01-02 00:00:00,2013-01-26 00:00:00,2014-06-25 12:00:00,2016-04-16 06:00:00,2017-12-15 00:00:00,
pub_date4,154.0,132653.837662,10217.0,70264.0,130667.0,191038.5,251213.0,71958.82346


In [5]:
# describe() skips the object columns; for those, compare the "Non-Null Count"
# reported by info() with the total number of rows
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   countryname         145 non-null    object        
 1   countrycode         154 non-null    object        
 2   year                154 non-null    int64         
 3   pub_date            154 non-null    datetime64[ns]
 4   euro_area           81 non-null     object        
 5   incomegroup_and_id  154 non-null    object        
 6   CM_MKT_LCAP_CD      154 non-null    object        
 7   SP_DYN_LE00_IN      154 non-null    object        
 8   SP_URB_TOTL_IN_ZS   118 non-null    float64       
 9   pub_date2           154 non-null    object        
 10  pub_date3           154 non-null    object        
 11  pub_date4           154 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(8)
memory usage: 14.6+ KB


In [6]:
# count the missing values in each column directly
df.isna().sum()

countryname            9
countrycode            0
year                   0
pub_date               0
euro_area             73
incomegroup_and_id     0
CM_MKT_LCAP_CD         0
SP_DYN_LE00_IN         0
SP_URB_TOTL_IN_ZS     36
pub_date2              0
pub_date3              0
pub_date4              0
dtype: int64

In [7]:
# count the missings per column, then show the column with the highest count
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).head(1)

euro_area    73
dtype: int64

## 3. What are the rows with the most missing values in the respective cells?

In [8]:
# count the missing cells in every row and keep the result as a helper column
# NOTE: this adds "missing_values" to df itself, so the column shows up in all later cells
df["missing_values"] = df.isna().sum(axis="columns")
df.sort_values("missing_values", ascending=False).head(3)

Unnamed: 0,countryname,countrycode,year,pub_date,euro_area,incomegroup_and_id,CM_MKT_LCAP_CD,SP_DYN_LE00_IN,SP_URB_TOTL_IN_ZS,pub_date2,pub_date3,pub_date4,missing_values
0,,ARM,2012,2013-10-04,,Lower middle income|3,-99,NOT AVAILABLE,,2013.10.4,2013/4/10,41013,3
1,Armenia,ARM,2011,2012-08-20,,Lower middle income|3,-99,74.34283,,2012.8.20,2012/20/8,200812,2
25,,CAN,2012,2013-07-07,,High income|1,"2,059974e+12",81.56244,81.293,2013.7.7,2013/7/7,70713,2


## 4. How many cells are missing for each row on average?

In [9]:
# average number of missing cells per row
df["missing_values"].mean()

np.float64(0.7662337662337663)

## 5. Create a copy of the DataFrame, dropping all **rows** that have at least one missing value.

In [10]:
# a row is dropped as soon as any of its cells is missing (the dropna defaults, spelled out)
df_new = df.dropna(axis="index", how="any")
df_new.shape

(61, 13)

## 6. Create a copy of the DataFrame, dropping all **columns** that have at least one missing value.

In [11]:
# a column is dropped as soon as any of its cells is missing
df_new = df.dropna(axis="columns", how="any")
df_new.shape

(154, 10)

## 7. Bonus: Create a copy of the DataFrame, dropping all **columns** that have at least 30% missing values.

In [12]:
# Keep only the columns whose share of missing values is strictly below 30%.
# Comparing the share directly is exact, whereas a rounded `thresh` count can keep a
# column that has 30% (or slightly more) missings, e.g. 3 missing values in 10 rows.
missing_share = df.isna().mean()
df_new = df.loc[:, missing_share < 0.3].copy()
df_new.head(3)

Unnamed: 0,countryname,countrycode,year,pub_date,incomegroup_and_id,CM_MKT_LCAP_CD,SP_DYN_LE00_IN,SP_URB_TOTL_IN_ZS,pub_date2,pub_date3,pub_date4,missing_values
0,,ARM,2012,2013-10-04,Lower middle income|3,-99,NOT AVAILABLE,,2013.10.4,2013/4/10,41013,3
1,Armenia,ARM,2011,2012-08-20,Lower middle income|3,-99,74.34283,,2012.8.20,2012/20/8,200812,2
2,Austria,AUT,2013,2014-03-03,High income|1,-99,81.13659,65.884,2014.3.3,2014/3/3,30314,0


In [13]:
# columns present in df but no longer in df_new, i.e. the ones that were dropped
set(df.columns).difference(df_new.columns)

{'euro_area'}

## 8. Bonus: Create a copy of the DataFrame, dropping all **rows** that have missing values in the following columns: *countryname* and *SP_URB_TOTL_IN_ZS*.

In [14]:
# only missings in these columns cause a row to be dropped (rows are the default axis)
required_columns = ["countryname", "SP_URB_TOTL_IN_ZS"]
df_new = df.dropna(subset=required_columns)
df_new.shape

(113, 13)

## 9. Bonus: Considering **'unusual' missing values** like 'NOT AVAILABLE' (in *SP_DYN_LE00_IN*) and -99 (in *CM_MKT_LCAP_CD*), produce a DataFrame with all **rows** removed that have at least one missing value.

In [15]:
# CM_MKT_LCAP_CD and SP_DYN_LE00_IN report 154 non-null values: placeholders such as
# -99 or 'NOT AVAILABLE' are ordinary values for pandas and are not counted as missing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   countryname         145 non-null    object        
 1   countrycode         154 non-null    object        
 2   year                154 non-null    int64         
 3   pub_date            154 non-null    datetime64[ns]
 4   euro_area           81 non-null     object        
 5   incomegroup_and_id  154 non-null    object        
 6   CM_MKT_LCAP_CD      154 non-null    object        
 7   SP_DYN_LE00_IN      154 non-null    object        
 8   SP_URB_TOTL_IN_ZS   118 non-null    float64       
 9   pub_date2           154 non-null    object        
 10  pub_date3           154 non-null    object        
 11  pub_date4           154 non-null    int64         
 12  missing_values      154 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object

In [16]:
# list the distinct values: besides 'NOT AVAILABLE' there is an 'NA' entry that is
# padded with leading spaces ('      NA')
df["SP_DYN_LE00_IN"].unique()

array(['NOT AVAILABLE', '74.34283', '81.13659', '80.93659', '81.49024',
       '      NA', '81.84390', '80.98293', '80.58537', '81.28780',
       '80.38537', '80.58780', '74.16341', '75.96293', '81.15268',
       '81.67780', '80.57244', '80.79732', '81.01220', '82.13763',
       '81.56244', '81.44878', '83.19756', '82.69512', '82.79756',
       '82.69756', '79.76227', '80.30710', '79.57151', '79.94978',
       '80.49024', '80.43659', '81.09024', '80.53902', '82.42683',
       '83.22927', '82.47561', '83.07805', '82.11463', '81.96829',
       '82.67073', '74.35271', '74.18205', '81.43659', '80.63415',
       '81.58780', '81.28537', '80.73171', '76.77561', '75.96098',
       '75.56585', '75.06341', '75.76341', '74.85854', '81.00000',
       '80.74634', '81.50244', '80.84634', '81.34878', '82.69024',
       '82.18780', '83.09024', '82.23902', '72.00000', '68.98000',
       '71.62000', '70.00244', '82.22927', '80.98780', '81.80000',
       '74.87451', '75.03127', '75.39132', '75.70507', '8

In [17]:
# work on a real copy: without .copy(), changes to df_new could also end up in df
df_new = df.copy()

# strip the surrounding whitespace so that the padded '      NA' becomes a plain 'NA'
df_new["SP_DYN_LE00_IN"] = df_new["SP_DYN_LE00_IN"].str.strip()
df_new["SP_DYN_LE00_IN"].unique()

array(['NOT AVAILABLE', '74.34283', '81.13659', '80.93659', '81.49024',
       'NA', '81.84390', '80.98293', '80.58537', '81.28780', '80.38537',
       '80.58780', '74.16341', '75.96293', '81.15268', '81.67780',
       '80.57244', '80.79732', '81.01220', '82.13763', '81.56244',
       '81.44878', '83.19756', '82.69512', '82.79756', '82.69756',
       '79.76227', '80.30710', '79.57151', '79.94978', '80.49024',
       '80.43659', '81.09024', '80.53902', '82.42683', '83.22927',
       '82.47561', '83.07805', '82.11463', '81.96829', '82.67073',
       '74.35271', '74.18205', '81.43659', '80.63415', '81.58780',
       '81.28537', '80.73171', '76.77561', '75.96098', '75.56585',
       '75.06341', '75.76341', '74.85854', '81.00000', '80.74634',
       '81.50244', '80.84634', '81.34878', '82.69024', '82.18780',
       '83.09024', '82.23902', '72.00000', '68.98000', '71.62000',
       '70.00244', '82.22927', '80.98780', '81.80000', '74.87451',
       '75.03127', '75.39132', '75.70507', '81.1048

In [18]:
# placeholder values that stand for "missing", listed per column
# (the -99 marker is matched as the string "-99")
placeholders = {
    "SP_DYN_LE00_IN": ["NOT AVAILABLE", "NA"],
    "CM_MKT_LCAP_CD": "-99",
}

# turn every placeholder into pd.NA so that pandas recognises it as missing
for column, markers in placeholders.items():
    df_new[column] = df_new[column].replace(markers, pd.NA)
df_new.head(2)

Unnamed: 0,countryname,countrycode,year,pub_date,euro_area,incomegroup_and_id,CM_MKT_LCAP_CD,SP_DYN_LE00_IN,SP_URB_TOTL_IN_ZS,pub_date2,pub_date3,pub_date4,missing_values
0,,ARM,2012,2013-10-04,,Lower middle income|3,,,,2013.10.4,2013/4/10,41013,3
1,Armenia,ARM,2011,2012-08-20,,Lower middle income|3,,74.34283,,2012.8.20,2012/20/8,200812,2


In [19]:
# finally drop every row that still contains at least one missing value
# (the former placeholder values are now real missing values and count as well)
df_new = df_new.dropna()
df_new.shape

(38, 13)

# Exercise 2

Continue with the data from the previous exercise (i.e. the first sheet of the Excel file "wdi_wrong_types.xlsx" loaded into a pandas DataFrame).

1. Create a copy of the DataFrame, filling missing values in the column *SP_URB_TOTL_IN_ZS* using the backwards method.
2. Create a copy of the DataFrame, filling missing values in the column *SP_URB_TOTL_IN_ZS* using the forwards method.
3. Create a copy of the DataFrame, filling missing values in the column *SP_URB_TOTL_IN_ZS* with the grand mean.
4. Create a copy of the DataFrame, filling missing values in the column *SP_URB_TOTL_IN_ZS* with the country-specific median.

## 1. Create a copy of the DataFrame, filling missing values in the column *SP_URB_TOTL_IN_ZS* using the backwards method.

In [20]:
# the rows are not ordered by year within a country (e.g. ARM: 2012 before 2011),
# so a fill based on the raw row order would pick the wrong neighbouring values
df[["countrycode", "year", "SP_URB_TOTL_IN_ZS"]].head(5)

Unnamed: 0,countrycode,year,SP_URB_TOTL_IN_ZS
0,ARM,2012,
1,ARM,2011,
2,AUT,2013,65.884
3,AUT,2012,
4,AUT,2014,65.919


In [21]:
# order the rows by country and year; sort_values returns a new DataFrame, so df stays untouched
sort_keys = ["countrycode", "year"]
df_new = df.sort_values(sort_keys)
df_new[["countrycode", "year", "SP_URB_TOTL_IN_ZS"]].head(5)

Unnamed: 0,countrycode,year,SP_URB_TOTL_IN_ZS
1,ARM,2011,
0,ARM,2012,
7,AUT,2011,65.858
3,AUT,2012,
2,AUT,2013,65.884


In [22]:
# Fill backwards within each country only: without the groupby, a gap at the end of one
# country would be filled with the first value of the next country (e.g. Armenia would
# receive Austrian values). Countries without any observed value stay missing.
urban_share_by_country = df_new.groupby("countrycode")["SP_URB_TOTL_IN_ZS"]
df_new["SP_URB_TOTL_IN_ZS"] = urban_share_by_country.bfill()
df_new[["countrycode", "year", "SP_URB_TOTL_IN_ZS"]].head(5)

Unnamed: 0,countrycode,year,SP_URB_TOTL_IN_ZS
1,ARM,2011,
0,ARM,2012,
7,AUT,2011,65.858
3,AUT,2012,65.884
2,AUT,2013,65.884


## 2. Create a copy of the DataFrame, filling missing values in the column *SP_URB_TOTL_IN_ZS* using the forwards method.

In [23]:
# Sort by country and year and fill forwards within each country only; otherwise values
# would be carried over from another country or against the time order.
df_new = df.sort_values(["countrycode", "year"]).copy()
print(f"Missing before ffill: {df_new['SP_URB_TOTL_IN_ZS'].isna().sum()}")

urban_share_by_country = df_new.groupby("countrycode")["SP_URB_TOTL_IN_ZS"]
df_new["SP_URB_TOTL_IN_ZS"] = urban_share_by_country.ffill()
print(f"Missing after ffill: {df_new['SP_URB_TOTL_IN_ZS'].isna().sum()}")

df_new[["countrycode", "year", "SP_URB_TOTL_IN_ZS"]].head(5)

Missing before ffill: 36
Missing after ffill: 15


Unnamed: 0,countrycode,year,SP_URB_TOTL_IN_ZS
1,ARM,2011,
0,ARM,2012,
7,AUT,2011,65.858
3,AUT,2012,65.858
2,AUT,2013,65.884


## 3. Create a copy of the DataFrame, filling missing values in the column *SP_URB_TOTL_IN_ZS* with the grand mean.

In [24]:
# mean over all observed values, regardless of country or year
grand_mean = df["SP_URB_TOTL_IN_ZS"].mean()

# assign() returns a copy of df in which the column is replaced by its filled version
df_new = df.assign(SP_URB_TOTL_IN_ZS=df["SP_URB_TOTL_IN_ZS"].fillna(grand_mean))
df_new[["countrycode", "year", "SP_URB_TOTL_IN_ZS"]].head(5)

Unnamed: 0,countrycode,year,SP_URB_TOTL_IN_ZS
0,ARM,2012,73.425025
1,ARM,2011,73.425025
2,AUT,2013,65.884
3,AUT,2012,73.425025
4,AUT,2014,65.919


## 4. Create a copy of the DataFrame, filling missing values in the column *SP_URB_TOTL_IN_ZS* with the country-specific median.

In [25]:
# transform("median") returns one value per row: the median of that row's country.
# Countries without any observed value have no median and therefore stay missing.
country_median = df.groupby("countrycode")["SP_URB_TOTL_IN_ZS"].transform("median")

df_new = df.copy()
df_new["SP_URB_TOTL_IN_ZS"] = df["SP_URB_TOTL_IN_ZS"].fillna(country_median)
df_new[["countrycode", "year", "SP_URB_TOTL_IN_ZS"]].head(5)

Unnamed: 0,countrycode,year,SP_URB_TOTL_IN_ZS
0,ARM,2012,
1,ARM,2011,
2,AUT,2013,65.884
3,AUT,2012,65.919
4,AUT,2014,65.919
