# CO<sub>2</sub> emissions
_by Virginia Herrero_

## Data loading
Load the CSV file **co2-emissions-dataset** as a pandas DataFrame.

In [1]:
# Import all required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the dataset from the CSV file into a pandas DataFrame
carbon_emission = pd.read_csv("co2-emissions-dataset.csv")
# Preview the first rows of the raw data
carbon_emission.head()

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,...,share_global_cumulative_other_co2,share_global_flaring_co2,share_global_gas_co2,share_global_luc_co2,share_global_oil_co2,share_global_other_co2,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share
0,Afghanistan,1850,AFG,3752993.0,,,,,,,...,,,,0.121,,,,,,
1,Afghanistan,1851,AFG,3769828.0,,,,,,,...,,,,0.118,,,,,,
2,Afghanistan,1852,AFG,3787706.0,,,,,,,...,,,,0.116,,,,,,
3,Afghanistan,1853,AFG,3806634.0,,,,,,,...,,,,0.115,,,,,,
4,Afghanistan,1854,AFG,3825655.0,,,,,,,...,,,,0.114,,,,,,


## Data cleaning
Clean and pre-process the dataset before it undergoes further analysis.

In [3]:
# Summarise the dataset: number of entries, column names, non-null counts and dtypes
carbon_emission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46523 entries, 0 to 46522
Data columns (total 74 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   country                                    46523 non-null  object 
 1   year                                       46523 non-null  int64  
 2   iso_code                                   39862 non-null  object 
 3   population                                 38574 non-null  float64
 4   gdp                                        14551 non-null  float64
 5   cement_co2                                 24974 non-null  float64
 6   cement_co2_per_capita                      22714 non-null  float64
 7   co2                                        31349 non-null  float64
 8   co2_growth_abs                             28944 non-null  float64
 9   co2_growth_prct                            25032 non-null  float64
 10  co2_including_luc     

* **Remove unnecessary columns**

All columns or features that are not necessary for this analysis are dropped from the dataset.

In [4]:
# List every column name to decide which ones are not needed
carbon_emission.columns

Index(['country', 'year', 'iso_code', 'population', 'gdp', 'cement_co2',
       'cement_co2_per_capita', 'co2', 'co2_growth_abs', 'co2_growth_prct',
       'co2_including_luc', 'co2_including_luc_growth_abs',
       'co2_including_luc_growth_prct', 'co2_including_luc_per_capita',
       'co2_including_luc_per_gdp', 'co2_including_luc_per_unit_energy',
       'co2_per_capita', 'co2_per_gdp', 'co2_per_unit_energy', 'coal_co2',
       'coal_co2_per_capita', 'consumption_co2', 'consumption_co2_per_capita',
       'consumption_co2_per_gdp', 'cumulative_cement_co2', 'cumulative_co2',
       'cumulative_co2_including_luc', 'cumulative_coal_co2',
       'cumulative_flaring_co2', 'cumulative_gas_co2', 'cumulative_luc_co2',
       'cumulative_oil_co2', 'cumulative_other_co2', 'energy_per_capita',
       'energy_per_gdp', 'flaring_co2', 'flaring_co2_per_capita', 'gas_co2',
       'gas_co2_per_capita', 'ghg_excluding_lucf_per_capita', 'ghg_per_capita',
       'land_use_change_co2', 'land_use_chang

In [5]:
# Columns that are not needed for this analysis: the ISO code, per-capita / per-GDP /
# per-unit-energy ratios, growth figures, cumulative totals, global shares and
# non-CO2 indicators. One name per line keeps the list easy to scan and to diff.
columns_to_drop = [
    # Identifier, ratios and growth figures
    "iso_code",
    "cement_co2_per_capita",
    "co2_growth_prct",
    "co2_including_luc_growth_prct",
    "co2_including_luc",
    "co2_including_luc_growth_abs",
    "co2_including_luc_per_capita",
    "co2_including_luc_per_gdp",
    "co2_including_luc_per_unit_energy",
    "co2_per_capita",
    "co2_per_unit_energy",
    "coal_co2_per_capita",
    # Consumption-based emissions
    "consumption_co2",
    "consumption_co2_per_capita",
    "consumption_co2_per_gdp",
    # Cumulative totals
    "cumulative_cement_co2",
    "cumulative_co2",
    "cumulative_co2_including_luc",
    "cumulative_coal_co2",
    "cumulative_flaring_co2",
    "cumulative_gas_co2",
    "cumulative_luc_co2",
    "cumulative_oil_co2",
    "cumulative_other_co2",
    # Energy and remaining per-capita figures
    "energy_per_capita",
    "energy_per_gdp",
    "flaring_co2_per_capita",
    "gas_co2_per_capita",
    "ghg_excluding_lucf_per_capita",
    "ghg_per_capita",
    "land_use_change_co2_per_capita",
    # Other greenhouse gases
    "methane",
    "methane_per_capita",
    "nitrous_oxide",
    "nitrous_oxide_per_capita",
    "oil_co2_per_capita",
    "other_co2_per_capita",
    "primary_energy_consumption",
    # Shares of global emissions
    "share_global_cement_co2",
    "share_global_co2",
    "share_global_co2_including_luc",
    "share_global_coal_co2",
    "share_global_cumulative_cement_co2",
    "share_global_cumulative_co2",
    "share_global_cumulative_co2_including_luc",
    "share_global_cumulative_coal_co2",
    "share_global_cumulative_flaring_co2",
    "share_global_cumulative_gas_co2",
    "share_global_cumulative_luc_co2",
    "share_global_cumulative_oil_co2",
    "share_global_cumulative_other_co2",
    "share_global_flaring_co2",
    "share_global_gas_co2",
    "share_global_luc_co2",
    "share_global_oil_co2",
    "share_global_other_co2",
    # Greenhouse-gas totals, trade and remaining derived figures
    "total_ghg",
    "total_ghg_excluding_lucf",
    "trade_co2",
    "trade_co2_share",
    "co2_growth_abs",
    "co2_per_gdp",
]

In [6]:
# Remove every column listed in columns_to_drop from the dataset
carbon_emission = carbon_emission.drop(columns = columns_to_drop)

In [7]:
# Confirm which columns remain after the drop
carbon_emission.columns

Index(['country', 'year', 'population', 'gdp', 'cement_co2', 'co2', 'coal_co2',
       'flaring_co2', 'gas_co2', 'land_use_change_co2', 'oil_co2',
       'other_industry_co2'],
      dtype='object')

In [8]:
# Preview the first rows of the reduced dataset
carbon_emission.head()

Unnamed: 0,country,year,population,gdp,cement_co2,co2,coal_co2,flaring_co2,gas_co2,land_use_change_co2,oil_co2,other_industry_co2
0,Afghanistan,1850,3752993.0,,,,,,,2.931,,
1,Afghanistan,1851,3769828.0,,,,,,,2.968,,
2,Afghanistan,1852,3787706.0,,,,,,,2.968,,
3,Afghanistan,1853,3806634.0,,,,,,,3.004,,
4,Afghanistan,1854,3825655.0,,,,,,,3.004,,


* **Rename columns**

Some column names are renamed to improve the readability and comprehension of the dataset.

In [9]:
# Give the overall emissions column a more descriptive name
carbon_emission = carbon_emission.rename({"co2" : "total_co2"}, axis = "columns")

In [10]:
# Preview the first rows to confirm the renamed column
carbon_emission.head()

Unnamed: 0,country,year,population,gdp,cement_co2,total_co2,coal_co2,flaring_co2,gas_co2,land_use_change_co2,oil_co2,other_industry_co2
0,Afghanistan,1850,3752993.0,,,,,,,2.931,,
1,Afghanistan,1851,3769828.0,,,,,,,2.968,,
2,Afghanistan,1852,3787706.0,,,,,,,2.968,,
3,Afghanistan,1853,3806634.0,,,,,,,3.004,,
4,Afghanistan,1854,3825655.0,,,,,,,3.004,,


* **Filter data from the last 50 years**

This study focuses on carbon dioxide emissions over the last 50 years, so the dataset will be filtered to obtain only data from 1971 to 2021.

In [11]:
# Find the earliest year present in the dataset
carbon_emission["year"].min()

np.int64(1750)

In [12]:
# Keep only the rows from 1971 onwards, then renumber the index from zero
carbon_emission = (
    carbon_emission
    .loc[lambda frame: frame["year"] >= 1971]
    .reset_index(drop = True)
)

In [13]:
# Preview the first rows of the filtered dataset
carbon_emission.head()

Unnamed: 0,country,year,population,gdp,cement_co2,total_co2,coal_co2,flaring_co2,gas_co2,land_use_change_co2,oil_co2,other_industry_co2
0,Afghanistan,1971,11015853.0,15770510000.0,0.043,1.894,0.359,0.304,0.44,6.265,0.747,
1,Afghanistan,1972,11286753.0,13170550000.0,0.046,1.53,0.191,0.366,0.3,6.046,0.627,
2,Afghanistan,1973,11575308.0,13568630000.0,0.067,1.635,0.311,0.223,0.333,4.983,0.702,
3,Afghanistan,1974,11869881.0,14309110000.0,0.07,1.913,0.305,0.367,0.4,4.617,0.771,
4,Afghanistan,1975,12157390.0,15177770000.0,0.069,2.121,0.399,0.304,0.475,4.14,0.874,


* **Data types**

Verify that all columns or features have the correct data type.

In [14]:
# Show the data type of each column
carbon_emission.dtypes

country                 object
year                     int64
population             float64
gdp                    float64
cement_co2             float64
total_co2              float64
coal_co2               float64
flaring_co2            float64
gas_co2                float64
land_use_change_co2    float64
oil_co2                float64
other_industry_co2     float64
dtype: object

The “year” column is of integer type instead of datetime. However, since only the year is needed, not the full date, the data type will remain integer.

* **Null values**

Remove missing values, zero values, or NaN values from the dataset.

A quick glance at the descriptive statistics of the dataset indicates that 25% of the values of 5 features are zero, and 50% of the values in the flaring_co2 column are zero. 

In [15]:
# Descriptive statistics of the numeric columns, transposed so that each feature is a row
carbon_emission.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,13193.0,1996.076,14.71396,1971.0,1983.0,1996.0,2009.0,2021.0
population,12106.0,97891660.0,509308500.0,1833.0,438325.8,4798528.0,18717190.0,7909295000.0
gdp,7780.0,423150700000.0,2840284000000.0,112038000.0,13070950000.0,42610610000.0,200434500000.0,113630200000000.0
cement_co2,11635.0,16.10971,91.00006,0.0,0.0,0.309,2.317,1672.592
total_co2,12419.0,713.0162,2698.844,0.0,0.9135,9.687,105.0105,37123.85
coal_co2,11747.0,199.5345,995.8156,0.0,0.0,0.121,11.88,15051.51
flaring_co2,11747.0,6.401901,27.92217,0.0,0.0,0.0,0.5555,439.254
gas_co2,11747.0,97.19472,439.8783,0.0,0.0,0.029,11.5885,7921.83
land_use_change_co2,11679.0,112.5068,497.0239,-323.861,0.0,1.319,19.346,7104.862
oil_co2,11798.0,197.7634,869.7925,0.0,0.67175,4.3145,33.43125,12345.65


In [16]:
# Count the null (NaN) values in each column
carbon_emission.isnull().sum()

country                    0
year                       0
population              1087
gdp                     5413
cement_co2              1558
total_co2                774
coal_co2                1446
flaring_co2             1446
gas_co2                 1446
land_use_change_co2     1514
oil_co2                 1395
other_industry_co2     11108
dtype: int64

To deal with these null values, interpolation will be used. This method is chosen because it allows the time series of the data frame to be maintained by imputing interpolated values from the dataset.

In [17]:
# Impute missing values by interpolating linearly within each country's own time
# series, so a gap is filled from the neighbouring years of the same country instead
# of an artificial zero (a blanket zero-fill would, for example, create populations of 0).
# Assumes the rows of each country are ordered by year, as they are in the source file.
value_columns = [column for column in carbon_emission.columns if column not in ("country", "year")]
carbon_emission[value_columns] = (
    carbon_emission
    .groupby("country")[value_columns]
    # limit_area = "inside" fills only NaNs surrounded by valid observations,
    # so values are never extrapolated beyond the first / last known year
    .transform(lambda group: group.interpolate(method = "linear", limit_area = "inside"))
)

# Gaps that cannot be interpolated (no observation on one or both sides, e.g. a
# feature with no data at all for a country) are set to 0 so that no nulls remain
carbon_emission[value_columns] = carbon_emission[value_columns].fillna(0)

In [18]:
# Let pandas re-infer more specific dtypes for any object-typed columns
# NOTE(review): infer_objects() only adjusts dtypes; it does not interpolate or fill any values
carbon_emission = carbon_emission.infer_objects(copy = False)

In [19]:
# Check that no null values remain after the imputation steps above
carbon_emission.isnull().sum()

country                0
year                   0
population             0
gdp                    0
cement_co2             0
total_co2              0
coal_co2               0
flaring_co2            0
gas_co2                0
land_use_change_co2    0
oil_co2                0
other_industry_co2     0
dtype: int64

In [20]:
# Preview the first rows after handling the null values
carbon_emission.head()

Unnamed: 0,country,year,population,gdp,cement_co2,total_co2,coal_co2,flaring_co2,gas_co2,land_use_change_co2,oil_co2,other_industry_co2
0,Afghanistan,1971,11015853.0,15770510000.0,0.043,1.894,0.359,0.304,0.44,6.265,0.747,0.0
1,Afghanistan,1972,11286753.0,13170550000.0,0.046,1.53,0.191,0.366,0.3,6.046,0.627,0.0
2,Afghanistan,1973,11575308.0,13568630000.0,0.067,1.635,0.311,0.223,0.333,4.983,0.702,0.0
3,Afghanistan,1974,11869881.0,14309110000.0,0.07,1.913,0.305,0.367,0.4,4.617,0.771,0.0
4,Afghanistan,1975,12157390.0,15177770000.0,0.069,2.121,0.399,0.304,0.475,4.14,0.874,0.0


* **Duplicated values**

Check if there are any duplicated data in the dataset.

In [21]:
# Count the rows that are exact duplicates of an earlier row
carbon_emission.duplicated().sum()

np.int64(0)

* **Create a new DataFrame with continent information**

Data from continents and countries are mixed in the “country” column. Therefore, to extract information on emissions from the different continents, a new data frame is created with data from each continent.

In [22]:
# Labels under which the continents appear in the "country" column
continents = ["Africa", "Antarctica", "Asia", "Europe", "North America", "Oceania", "South America"]

# Build the continent-level frame in one chain: keep the continent rows,
# discard the GDP column and relabel "country" as "continent"
emissions_continents = (
    carbon_emission
    .loc[lambda frame: frame["country"].isin(continents)]
    .drop(columns = ["gdp"])
    .rename(columns = {"country" : "continent"})
)

emissions_continents.head()

Unnamed: 0,continent,year,population,cement_co2,total_co2,coal_co2,flaring_co2,gas_co2,land_use_change_co2,oil_co2,other_industry_co2
51,Africa,1971,375086296.0,9.243,332.204,149.906,55.976,1.693,959.162,115.386,0.0
52,Africa,1972,384930776.0,9.762,352.239,153.924,64.65,4.414,1141.446,119.489,0.0
53,Africa,1973,395212420.0,10.328,378.421,154.291,80.006,5.36,900.868,128.435,0.0
54,Africa,1974,406069208.0,10.803,388.865,157.241,81.362,5.111,891.341,134.347,0.0
55,Africa,1975,417556976.0,10.928,387.829,162.187,64.309,6.422,884.893,143.982,0.0


In [23]:
# Descriptive statistics of the continent-level data, transposed so that each feature is a row
emissions_continents.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,327.0,1996.064,14.34561,1971.0,1984.0,1996.0,2008.0,2021.0
population,327.0,907040500.0,1190509000.0,0.0,244497700.0,492319200.0,743908500.0,4693332000.0
cement_co2,327.0,124.7931,245.8153,0.0,13.1575,48.401,115.199,1358.534
total_co2,327.0,3898.526,4460.648,0.004,457.6025,1251.306,6428.63,21688.99
coal_co2,327.0,1558.27,2305.254,0.0,125.118,423.78,2040.023,11959.01
flaring_co2,327.0,48.63087,42.28868,0.0,18.1665,40.514,69.13,230.375
gas_co2,327.0,703.8774,784.9246,0.0,64.049,239.098,1358.621,3242.854
land_use_change_co2,327.0,983.2504,966.1938,-323.861,139.195,724.116,1716.382,4117.86
oil_co2,327.0,1435.097,1353.463,0.004,206.3435,607.46,2760.164,4806.574
other_industry_co2,327.0,27.85869,45.63874,0.0,0.0,4.647,34.2675,206.051


* **Drop unnecessary countries**

The country column contains countries and areas that are not of interest for this analysis. Therefore, they will be removed from the dataset.

In [24]:
# List the distinct entries of the country column to spot aggregates and regions
carbon_emission.country.unique()

array(['Afghanistan', 'Africa', 'Africa (GCP)', 'Aland Islands',
       'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola',
       'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Argentina',
       'Armenia', 'Aruba', 'Asia', 'Asia (GCP)',
       'Asia (excl. China and India)', 'Australia', 'Austria',
       'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados',
       'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan',
       'Bolivia', 'Bonaire Sint Eustatius and Saba',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'British Virgin Islands', 'Brunei', 'Bulgaria', 'Burkina Faso',
       'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde',
       'Central African Republic', 'Central America (GCP)', 'Chad',
       'Chile', 'China', 'Christmas Island', 'Colombia', 'Comoros',
       'Congo', 'Cook Islands', 'Costa Rica', "Cote d'Ivoire", 'Croatia',
       'Cuba', 'Curacao', 'Cyprus', 'Czechia',
       'Democratic Republic of Congo', 'Denmar

In [25]:
# Entries of the "country" column that are aggregates, regions, income groups or
# other special labels rather than the countries of interest; their rows are removed.
# NOTE(review): "Antarctica" is treated as a continent elsewhere in this notebook
# but is not listed here — confirm whether it should be dropped as well.
countries_to_drop = [
    # Continents and regional aggregates
    "Africa",
    "Africa (GCP)",
    "Asia",
    "Asia (GCP)",
    "Asia (excl. China and India)",
    "Central America (GCP)",
    "Europe",
    "Europe (GCP)",
    "Europe (excl. EU-27)",
    "Europe (excl. EU-28)",
    "European Union (27)",
    "European Union (27) (GCP)",
    "European Union (28)",
    # Income groups and special categories
    "High-income countries",
    "International transport",
    "Kuwaiti Oil Fires (GCP)",
    "Low-income countries",
    "Lower-middle-income countries",
    "Middle East (GCP)",
    "Non-OECD (GCP)",
    "North America",
    "North America (GCP)",
    "North America (excl. USA)",
    "OECD (GCP)",
    "Oceania",
    "Oceania (GCP)",
    "Panama Canal Zone (GCP)",
    "Ryukyu Islands (GCP)",
    "South America",
    "South America (GCP)",
    "Upper-middle-income countries",
    "World",
]

In [26]:
# Keep only the rows whose "country" is not one of the labels in countries_to_drop.
# A single boolean mask filters the frame once, instead of looking up the row labels
# and rebuilding the DataFrame for every entry of the list; the surviving rows keep
# their original index labels.
carbon_emission = carbon_emission[~carbon_emission["country"].isin(countries_to_drop)]

In [27]:
# Check the shape to confirm that the rows have been successfully dropped
carbon_emission.shape

(11702, 12)

**The clean dataset**

In [28]:
# Preview the first rows of the clean dataset
carbon_emission.head()

Unnamed: 0,country,year,population,gdp,cement_co2,total_co2,coal_co2,flaring_co2,gas_co2,land_use_change_co2,oil_co2,other_industry_co2
0,Afghanistan,1971,11015853.0,15770510000.0,0.043,1.894,0.359,0.304,0.44,6.265,0.747,0.0
1,Afghanistan,1972,11286753.0,13170550000.0,0.046,1.53,0.191,0.366,0.3,6.046,0.627,0.0
2,Afghanistan,1973,11575308.0,13568630000.0,0.067,1.635,0.311,0.223,0.333,4.983,0.702,0.0
3,Afghanistan,1974,11869881.0,14309110000.0,0.07,1.913,0.305,0.367,0.4,4.617,0.771,0.0
4,Afghanistan,1975,12157390.0,15177770000.0,0.069,2.121,0.399,0.304,0.475,4.14,0.874,0.0
