# Stringency Index Data Cleaning

### Data Sources
- Stringency Index Dataset: https://github.com/OxCGRT/covid-policy-tracker/blob/master/data/timeseries/stringency_index_avg.csv

### Importing Libraries:
`country_converter`: https://pypi.org/project/country-converter/

In [1]:
import pandas as pd
import numpy as np
import country_converter as coco

### 1. Load Dataset:

In [2]:
# Raw stringency-index time series: one row per jurisdiction, one column per day
path_import = r"raw_data/stringency_index_avg.csv"
df_csv = pd.read_csv(path_import)

In [3]:
# Work on a copy so the freshly loaded df_csv stays unmodified
df = df_csv.copy()

# Preview the first 15 rows
df.head(15)

Unnamed: 0.1,Unnamed: 0,country_code,country_name,region_code,region_name,jurisdiction,01Jan2020,02Jan2020,03Jan2020,04Jan2020,...,11Feb2023,12Feb2023,13Feb2023,14Feb2023,15Feb2023,16Feb2023,17Feb2023,18Feb2023,19Feb2023,20Feb2023
0,1,ABW,Aruba,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,2,AFG,Afghanistan,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,3,AGO,Angola,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,4,ALB,Albania,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,5,AND,Andorra,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
5,6,ARE,United Arab Emirates,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
6,7,ARG,Argentina,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
7,8,AUS,Australia,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
8,9,AUS,Australia,AUS_ACT,Australian Capital Territory,STATE_TOTAL,,,,,...,,,,,,,,,,
9,10,AUS,Australia,AUS_NSW,New South Wales,STATE_TOTAL,,,,,...,,,,,,,,,,


In [4]:
# Summarize row/column counts, column dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263 entries, 0 to 262
Columns: 1153 entries, Unnamed: 0 to 20Feb2023
dtypes: float64(1147), int64(1), object(5)
memory usage: 2.3+ MB


### 2. Drop State-Specific Rows:

In [5]:
# Retain only the country-level rows; rows with any other jurisdiction label
# (e.g. STATE_TOTAL) are discarded here
df_filtered = df.loc[df["jurisdiction"].eq("NAT_TOTAL")]
df_filtered.head(n=15)

Unnamed: 0.1,Unnamed: 0,country_code,country_name,region_code,region_name,jurisdiction,01Jan2020,02Jan2020,03Jan2020,04Jan2020,...,11Feb2023,12Feb2023,13Feb2023,14Feb2023,15Feb2023,16Feb2023,17Feb2023,18Feb2023,19Feb2023,20Feb2023
0,1,ABW,Aruba,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,2,AFG,Afghanistan,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,3,AGO,Angola,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,4,ALB,Albania,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,5,AND,Andorra,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
5,6,ARE,United Arab Emirates,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
6,7,ARG,Argentina,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
7,8,AUS,Australia,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
16,17,AUT,Austria,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,
17,18,AZE,Azerbaijan,,,NAT_TOTAL,0.0,0.0,0.0,0.0,...,,,,,,,,,,


### 3. Check Duplicates:

In [6]:
# Compare rows on the data columns only: "Unnamed: 0" looks like a running row
# number (1, 2, 3, ... in the preview), which would make every row unique and
# leave the duplicate check unable to flag anything.
df_filtered[df_filtered.drop(columns=["Unnamed: 0"]).duplicated()]

Unnamed: 0.1,Unnamed: 0,country_code,country_name,region_code,region_name,jurisdiction,01Jan2020,02Jan2020,03Jan2020,04Jan2020,...,11Feb2023,12Feb2023,13Feb2023,14Feb2023,15Feb2023,16Feb2023,17Feb2023,18Feb2023,19Feb2023,20Feb2023


### 4. Strip Whitespace from Country Names:

In [7]:
# New frame whose country names have leading/trailing whitespace removed;
# df_filtered itself is left untouched
df_check = df_filtered.assign(
    country_name=df_filtered["country_name"].str.strip()
)

### 5. Drop Unnecessary Columns:
- Drop the following columns: `Unnamed: 0`, `country_code`, `region_code`, `region_name`, `jurisdiction`.

In [8]:
# Row-number, code, region and jurisdiction columns that are not carried
# forward; only country_name and the per-day columns remain afterwards
cols_to_drop = [
    "Unnamed: 0",
    "country_code",
    "region_code",
    "region_name",
    "jurisdiction",
]

df_drop_cols = df_check.drop(cols_to_drop, axis="columns")
df_drop_cols

Unnamed: 0,country_name,01Jan2020,02Jan2020,03Jan2020,04Jan2020,05Jan2020,06Jan2020,07Jan2020,08Jan2020,09Jan2020,...,11Feb2023,12Feb2023,13Feb2023,14Feb2023,15Feb2023,16Feb2023,17Feb2023,18Feb2023,19Feb2023,20Feb2023
0,Aruba,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,Afghanistan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,Angola,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,Albania,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,Andorra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258,Vanuatu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
259,Yemen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
260,South Africa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
261,Zambia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


### 6. Convert Date Column Names to `datetime`:

In [9]:
def convert_dates(column_name):
    """Parse a ``DDMonYYYY`` column label (e.g. ``01Jan2020``) into a date.

    Returns a ``datetime.date`` when the label matches the ``%d%b%Y`` format;
    a label that cannot be parsed is handed back unchanged.
    """
    try:
        parsed = pd.to_datetime(column_name, format="%d%b%Y")
        return parsed.date()
    except (ValueError, TypeError):
        # Not a date label (e.g. "country_name"): keep it as it is
        return column_name

In [10]:
# Preview the converted labels before applying them to the frame
list(map(convert_dates, df_drop_cols.columns))

['country_name',
 datetime.date(2020, 1, 1),
 datetime.date(2020, 1, 2),
 datetime.date(2020, 1, 3),
 datetime.date(2020, 1, 4),
 datetime.date(2020, 1, 5),
 datetime.date(2020, 1, 6),
 datetime.date(2020, 1, 7),
 datetime.date(2020, 1, 8),
 datetime.date(2020, 1, 9),
 datetime.date(2020, 1, 10),
 datetime.date(2020, 1, 11),
 datetime.date(2020, 1, 12),
 datetime.date(2020, 1, 13),
 datetime.date(2020, 1, 14),
 datetime.date(2020, 1, 15),
 datetime.date(2020, 1, 16),
 datetime.date(2020, 1, 17),
 datetime.date(2020, 1, 18),
 datetime.date(2020, 1, 19),
 datetime.date(2020, 1, 20),
 datetime.date(2020, 1, 21),
 datetime.date(2020, 1, 22),
 datetime.date(2020, 1, 23),
 datetime.date(2020, 1, 24),
 datetime.date(2020, 1, 25),
 datetime.date(2020, 1, 26),
 datetime.date(2020, 1, 27),
 datetime.date(2020, 1, 28),
 datetime.date(2020, 1, 29),
 datetime.date(2020, 1, 30),
 datetime.date(2020, 1, 31),
 datetime.date(2020, 2, 1),
 datetime.date(2020, 2, 2),
 datetime.date(2020, 2, 3),
 datetime

In [11]:
# Relabel the columns on a new frame: labels that parse as dates become date
# objects, every other label is kept unchanged
df_date_convert = df_drop_cols.rename(columns=convert_dates)
df_date_convert

Unnamed: 0,country_name,2020-01-01,2020-01-02,2020-01-03,2020-01-04,2020-01-05,2020-01-06,2020-01-07,2020-01-08,2020-01-09,...,2023-02-11,2023-02-12,2023-02-13,2023-02-14,2023-02-15,2023-02-16,2023-02-17,2023-02-18,2023-02-19,2023-02-20
0,Aruba,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,Afghanistan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,Angola,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,Albania,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,Andorra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258,Vanuatu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
259,Yemen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
260,South Africa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
261,Zambia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


### 7. Standardize Country Names:
- Standardize country names and remove non-country entries using `country-converter` package.

In [12]:
def convert_country_names(country_names):
    """Build a lookup from raw country names to standardized short names.

    Args:
        country_names: Iterable of country-name strings.

    Returns:
        dict keyed by the original names, whose values are whatever
        ``coco.convert(..., to="name_short")`` returned for each name.
    """
    # Materialize once so the same sequence feeds both the conversion and the
    # zip below (a one-shot iterable would otherwise be exhausted).
    names = list(country_names)
    standard_names = coco.convert(names=names, to="name_short")
    # For a single input name coco.convert returns the bare result instead of
    # a one-element list; zipping that directly would pair the name with the
    # first character of the result.
    if len(names) == 1:
        standard_names = [standard_names]
    return dict(zip(names, standard_names))

In [13]:
# Distinct raw names, so each one is converted only once
old_country_names = pd.unique(df_date_convert["country_name"])

# Lookup table: raw name -> standardized name
country_dict = convert_country_names(country_names=old_country_names)

In [14]:
# New frame with the raw country names swapped for their standardized form
df_standardized = df_date_convert.assign(
    country_name=df_date_convert["country_name"].replace(country_dict)
)

# Discard rows whose name ended up as the 'not found' marker
is_recognized = df_standardized["country_name"].ne("not found")
df_standardized = df_standardized.loc[is_recognized]

### 8. Unpivot Date Columns and Data Values:

In [15]:
# Reshape wide -> long (one row per country/date pair), then order the rows by
# country and date and renumber the index from 0
df_melted = (
    df_standardized.melt(
        id_vars="country_name",
        var_name="date",
        value_name="stringency_index",
    )
    .sort_values(by=["country_name", "date"])
    .reset_index(drop=True)
)

df_melted

Unnamed: 0,country_name,date,stringency_index
0,Afghanistan,2020-01-01,0.0
1,Afghanistan,2020-01-02,0.0
2,Afghanistan,2020-01-03,0.0
3,Afghanistan,2020-01-04,0.0
4,Afghanistan,2020-01-05,0.0
...,...,...,...
214484,Zimbabwe,2023-02-16,
214485,Zimbabwe,2023-02-17,
214486,Zimbabwe,2023-02-18,
214487,Zimbabwe,2023-02-19,


### 9. Drop Rows with `NaN` Values:

In [16]:
# Remove every row that contains at least one missing value
df_cleaned = df_melted.dropna(axis="index", how="any")
df_cleaned

Unnamed: 0,country_name,date,stringency_index
0,Afghanistan,2020-01-01,0.0
1,Afghanistan,2020-01-02,0.0
2,Afghanistan,2020-01-03,0.0
3,Afghanistan,2020-01-04,0.0
4,Afghanistan,2020-01-05,0.0
...,...,...,...
214433,Zimbabwe,2022-12-27,53.7
214434,Zimbabwe,2022-12-28,53.7
214435,Zimbabwe,2022-12-29,53.7
214436,Zimbabwe,2022-12-30,53.7


### 10. Rename `country_name` Column to `country`:
For consistency.

In [17]:
# Shorten the country column's label from "country_name" to "country"
df_cleaned = df_cleaned.rename({"country_name": "country"}, axis="columns")
df_cleaned

Unnamed: 0,country,date,stringency_index
0,Afghanistan,2020-01-01,0.0
1,Afghanistan,2020-01-02,0.0
2,Afghanistan,2020-01-03,0.0
3,Afghanistan,2020-01-04,0.0
4,Afghanistan,2020-01-05,0.0
...,...,...,...
214433,Zimbabwe,2022-12-27,53.7
214434,Zimbabwe,2022-12-28,53.7
214435,Zimbabwe,2022-12-29,53.7
214436,Zimbabwe,2022-12-30,53.7


### 11. Export to .csv File:

In [18]:
path_export = r"cleaned_data/stringency_index.csv"

# index=True is the pandas default, spelled out here because the row index is
# also written to the file (it has gaps after the dropna step).
# NOTE(review): consider index=False if no downstream reader relies on that
# leading column - confirm with the consumers of this file first.
df_cleaned.to_csv(path_export, index=True)