# Data type conversions - Exercises

# Preparations

In [1]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 500)

# Exercise

1. Load the first sheet of the Excel file "wdi_wrong_types.xlsx" into a pandas DataFrame.
2. What are the data types of the columns as determined by `read_excel()`?
3. The columns *CM_MKT_LCAP_CD* and *SP_DYN_LE00_IN* should be numeric. Perform the necessary conversions using the steps from above.
4. The column *euro_area* should be boolean. Perform the necessary conversion using the steps from above.
5. Bonus: The column *incomegroup_and_id* actually contains two pieces of information, the countries' income group name and the respective ids. The two pieces are separated with a "|" character. Create two separate columns: a string column (`object`) for the income group name and an `int` column for the id. Hint: you might consider the method `pandas.Series.str.split()`, see [https://pandas.pydata.org/docs/reference/api/pandas.Series.str.split.html](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.split.html).

## 1. Load the first sheet of the Excel file "wdi_wrong_types.csv" into a pandas DataFrame.

In [2]:
# load the following tab-separated text file
df = pd.read_excel("../../data/raw/wdi_wrong_types.xlsx")
df.head()

Unnamed: 0,countryname,countrycode,year,pub_date,euro_area,incomegroup_and_id,CM_MKT_LCAP_CD,SP_DYN_LE00_IN,SP_URB_TOTL_IN_ZS,pub_date2,pub_date3,pub_date4
0,,ARM,2012,2013-10-04,,Lower middle income|3,-99,NOT AVAILABLE,,2013.10.4,2013/4/10,41013
1,Armenia,ARM,2011,2012-08-20,,Lower middle income|3,-99,74.34283,,2012.8.20,2012/20/8,200812
2,Austria,AUT,2013,2014-03-03,EURO,High income|1,-99,81.13659,65.884,2014.3.3,2014/3/3,30314
3,Austria,AUT,2012,2013-12-25,EURO,High income|1,-99,80.93659,,2013.12.25,2013/25/12,251213
4,Austria,AUT,2014,2015-03-22,EURO,High income|1,"9,679033e+10",81.49024,65.919,2015.3.22,2015/22/3,220315


# 2. What are the data types of the columns as determined by `read_excel()`?

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   countryname         145 non-null    object        
 1   countrycode         154 non-null    object        
 2   year                154 non-null    int64         
 3   pub_date            154 non-null    datetime64[ns]
 4   euro_area           81 non-null     object        
 5   incomegroup_and_id  154 non-null    object        
 6   CM_MKT_LCAP_CD      154 non-null    object        
 7   SP_DYN_LE00_IN      154 non-null    object        
 8   SP_URB_TOTL_IN_ZS   118 non-null    float64       
 9   pub_date2           154 non-null    object        
 10  pub_date3           154 non-null    object        
 11  pub_date4           154 non-null    int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(8)
memory usage: 14.6+ KB


# 3. The columns *CM_MKT_LCAP_CD* and *SP_DYN_LE00_IN* should be numeric. Perform the necessary conversions using the steps from above.

In [4]:
# inspect the variables of interest
df[["countryname", "year", "CM_MKT_LCAP_CD", "SP_DYN_LE00_IN"]].head(5)

Unnamed: 0,countryname,year,CM_MKT_LCAP_CD,SP_DYN_LE00_IN
0,,2012,-99,NOT AVAILABLE
1,Armenia,2011,-99,74.34283
2,Austria,2013,-99,81.13659
3,Austria,2012,-99,80.93659
4,Austria,2014,"9,679033e+10",81.49024


## 3.1 convert CM_MLTLCAP_CD

In [5]:
# 1. Explicitly replace known placeholder for missingness with the appropriate special value (usually np.nan for floats
df["CM_MKT_LCAP_CD_corrected"] = df["CM_MKT_LCAP_CD"].replace("-99", np.nan)

# 2. Note how many missing values are present before any type conversion
missing_before = df["CM_MKT_LCAP_CD"].isna().sum()
print(f"Missing before replacing '-99' with np.nan: {df['CM_MKT_LCAP_CD'].isna().sum()}")
print(f"Missing after replacing '-99' with np.nan:  {missing_before}")
# 3. Prepare the column for type conversion (e.g. replace ',' with '.' and '.' with '' when number format is "German")
df["CM_MKT_LCAP_CD_corrected"] = (
    df["CM_MKT_LCAP_CD_corrected"]
    .str.replace(".", "", regex=False)
    .str.replace(",", ".", regex=False)
)

# 4. Perform the data type conversion
df["CM_MKT_LCAP_CD_corrected"] = df["CM_MKT_LCAP_CD_corrected"].astype("float")

# 5. This can lead to additional NA entries if you have overlooked a problem:
#   check for this, comparing the number of missings with the number recorded in step 2.
mssing_after = df["CM_MKT_LCAP_CD_corrected"].isna().sum()
print(f"Missing after conversion to float: {mssing_after}")
print(f"Additional missing values: {mssing_after - missing_before}")

print(df["CM_MKT_LCAP_CD_corrected"].dtype)
df[["countryname", "year", "CM_MKT_LCAP_CD", "CM_MKT_LCAP_CD_corrected"]].head(10)

Missing before replacing '-99' with np.nan: 0
Missing after replacing '-99' with np.nan:  0
Missing after conversion to float: 37
Additional missing values: 37
float64


Unnamed: 0,countryname,year,CM_MKT_LCAP_CD,CM_MKT_LCAP_CD_corrected
0,,2012,-99,
1,Armenia,2011,-99,
2,Austria,2013,-99,
3,Austria,2012,-99,
4,Austria,2014,"9,679033e+10",96790330000.0
5,Austria,2016,"1,209767e+11",120976700000.0
6,Austria,2015,"9,607938e+10",96079380000.0
7,Austria,2011,"8,526952e+10",85269520000.0
8,Belgium,2011,-99,
9,Belgium,2016,"3,777565e+11",377756500000.0


## 3.2 convert SP_DYN_LE00_IN

In [6]:
# 1. Explicitly replace known placeholder for missingness with the appropriate special value (usually np.nan for floats
df["SP_DYN_LE00_IN_corrected"] = df["SP_DYN_LE00_IN"].replace("NOT AVAILABLE", np.nan)
df["SP_DYN_LE00_IN_corrected"] = df["SP_DYN_LE00_IN_corrected"].replace("      NA", np.nan)

# 2. Note how many missing values are present before any type conversion
missing_before = df["SP_DYN_LE00_IN"].isna().sum()
print(
    f"Missing before replacing 'NOT AVAILABLE' and '      NA' with np.nan: {df['SP_DYN_LE00_IN'].isna().sum()}"
)
print(f"Missing after replacing 'NOT AVAILABLE' and '      NA' with np.nan:  {missing_before}")
# 3. Prepare the column for type conversion (e.g. replace ',' with '.' and '.' with '' when number format is "German")
df["SP_DYN_LE00_IN_corrected"] = (
    df["SP_DYN_LE00_IN_corrected"]
    .str.replace(".", "", regex=False)
    .str.replace(",", ".", regex=False)
)

# 4. Perform the data type conversion
df["SP_DYN_LE00_IN_corrected"] = df["SP_DYN_LE00_IN_corrected"].astype("float")

# 5. This can lead to additional NA entries if you have overlooked a problem:
#   check for this, comparing the number of missings with the number recorded in step 2.
mssing_after = df["SP_DYN_LE00_IN_corrected"].isna().sum()
print(f"Missing after conversion to float: {mssing_after}")
print(f"Additional missing values: {mssing_after - missing_before}")

print(df["SP_DYN_LE00_IN_corrected"].dtype)
df[["countryname", "year", "SP_DYN_LE00_IN", "SP_DYN_LE00_IN_corrected"]].head(10)

Missing before replacing 'NOT AVAILABLE' and '      NA' with np.nan: 0
Missing after replacing 'NOT AVAILABLE' and '      NA' with np.nan:  0
Missing after conversion to float: 37
Additional missing values: 37
float64


Unnamed: 0,countryname,year,SP_DYN_LE00_IN,SP_DYN_LE00_IN_corrected
0,,2012,NOT AVAILABLE,
1,Armenia,2011,74.34283,7434283.0
2,Austria,2013,81.13659,8113659.0
3,Austria,2012,80.93659,8093659.0
4,Austria,2014,81.49024,8149024.0
5,Austria,2016,,
6,Austria,2015,81.84390,8184390.0
7,Austria,2011,80.98293,8098293.0
8,Belgium,2011,80.58537,8058537.0
9,Belgium,2016,NOT AVAILABLE,


# 4. The column *euro_area* should be boolean. Perform the necessary conversion using the steps from above.

In [7]:
df.groupby("euro_area").size()

euro_area
EURO        79
NOT EURO     2
dtype: int64

In [8]:
df["euro_area"].isna().sum()

np.int64(73)

In [9]:
# Using our own custom function
def iseuro(x):
    if pd.isna(x):
        return pd.NA
    else:
        if x == "EURO":
            return True
        else:
            return False


df["euro_area_orig"] = df["euro_area"]
df["euro_area"] = df["euro_area"].apply(iseuro)
pd.crosstab(df["euro_area"], df["euro_area_orig"])

euro_area_orig,EURO,NOT EURO
euro_area,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0,2
True,79,0


In [10]:
df["euro_area"].isna().sum()

np.int64(73)

# 5. Bonus: The column *incomegroup_and_id* actually contains two pieces of information, the countries' income group name and the respective ids. The two pieces are separated with a "|" character. Create two separate columns: a string column (`object`) for the income group name and an `int` column for the id. Hint: you might consider the method `pandas.Series.str.split()`, see [https://pandas.pydata.org/docs/reference/api/pandas.Series.str.split.html](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.split.html).


In [11]:
df["incomegroup_and_id"].head(5)

0    Lower middle income|3
1    Lower middle income|3
2            High income|1
3            High income|1
4            High income|1
Name: incomegroup_and_id, dtype: object

In [12]:
df[["income_group", "income_group_id"]] = df["incomegroup_and_id"].str.split("|", expand=True)
df[["incomegroup_and_id", "income_group", "income_group_id"]].head(5)

Unnamed: 0,incomegroup_and_id,income_group,income_group_id
0,Lower middle income|3,Lower middle income,3
1,Lower middle income|3,Lower middle income,3
2,High income|1,High income,1
3,High income|1,High income,1
4,High income|1,High income,1


In [13]:
# finally, the id should be integer:
print(f"type of income_group_id before conversion: {df['income_group_id'].dtype}")

# convert:
df["income_group_id"] = df["income_group_id"].astype("int")
print(f"type of income_group_id after conversion: {df['income_group_id'].dtype}")

type of income_group_id before conversion: object
type of income_group_id after conversion: int64
