# 6.1 Data Wrangling, Cleaning & Summary

## Table of contents:
### 1.Importing Libraries
### 2. Importing Data
### 3. Data Wrangling
#### 3.1 Dropping Columns
#### 3.2 Checking Datatypes
#### 3.3 Checking for Missing Values
#### 3.4 Checking the frequencies for some columns
#### 3.5 Rounding Decimal Numbers
### 4. Consistency Checks
#### 4.1 Checking for mixed-type data columns
#### 4.2 Checking for Duplicates
### 5. Descriptive Statistics
### 6. Creating a Subset
#### 6.1 Checking the Subset for missing values
#### 6.2 Checking the frequencies for some columns
### 7. Exporting Dataframes

# 1. Importing Libraries 


In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

# 2. Importing Data

In [2]:
# Create a path to the root directory for this project

path = r'C:\Users\osoty\Desktop\CareerFoundry Data Analytics\Data Immersion\Achievement 6\Global CO2 Emissions Analysis'

In [3]:
# Import the "owid-co2-data.csv" file without the index column

df_co2 = pd.read_csv(os.path.join(path, '02. Data', 'Original Data', 'owid-co2-data.csv'), index_col = False)

# 3. Data Wrangling

In [4]:
# Check the shape of the dataset

df_co2.shape

(46523, 74)

In [5]:
# Get a summary of the dataset

df_co2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46523 entries, 0 to 46522
Data columns (total 74 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   country                                    46523 non-null  object 
 1   year                                       46523 non-null  int64  
 2   iso_code                                   39862 non-null  object 
 3   population                                 38574 non-null  float64
 4   gdp                                        14551 non-null  float64
 5   cement_co2                                 24974 non-null  float64
 6   cement_co2_per_capita                      22714 non-null  float64
 7   co2                                        31349 non-null  float64
 8   co2_growth_abs                             28944 non-null  float64
 9   co2_growth_prct                            25032 non-null  float64
 10  co2_including_luc     

In [6]:
# Show all the columns in the dataframe

pd.set_option('display.max_columns', None)

In [7]:
# Print the first 10 rows of the dataframe

df_co2.head(10)

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,co2_including_luc,co2_including_luc_growth_abs,co2_including_luc_growth_prct,co2_including_luc_per_capita,co2_including_luc_per_gdp,co2_including_luc_per_unit_energy,co2_per_capita,co2_per_gdp,co2_per_unit_energy,coal_co2,coal_co2_per_capita,consumption_co2,consumption_co2_per_capita,consumption_co2_per_gdp,cumulative_cement_co2,cumulative_co2,cumulative_co2_including_luc,cumulative_coal_co2,cumulative_flaring_co2,cumulative_gas_co2,cumulative_luc_co2,cumulative_oil_co2,cumulative_other_co2,energy_per_capita,energy_per_gdp,flaring_co2,flaring_co2_per_capita,gas_co2,gas_co2_per_capita,ghg_excluding_lucf_per_capita,ghg_per_capita,land_use_change_co2,land_use_change_co2_per_capita,methane,methane_per_capita,nitrous_oxide,nitrous_oxide_per_capita,oil_co2,oil_co2_per_capita,other_co2_per_capita,other_industry_co2,primary_energy_consumption,share_global_cement_co2,share_global_co2,share_global_co2_including_luc,share_global_coal_co2,share_global_cumulative_cement_co2,share_global_cumulative_co2,share_global_cumulative_co2_including_luc,share_global_cumulative_coal_co2,share_global_cumulative_flaring_co2,share_global_cumulative_gas_co2,share_global_cumulative_luc_co2,share_global_cumulative_oil_co2,share_global_cumulative_other_co2,share_global_flaring_co2,share_global_gas_co2,share_global_luc_co2,share_global_oil_co2,share_global_other_co2,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share
0,Afghanistan,1850,AFG,3752993.0,,,,,,,,,,,,,,,,,,,,,,,,,,,2.931,,,,,,,,,,,2.931,0.781,,,,,,,,,,,,,,,,,,,,0.121,,,,,0.121,,,,,,
1,Afghanistan,1851,AFG,3769828.0,,,,,,,,,,,,,,,,,,,,,,,,,,,5.899,,,,,,,,,,,2.968,0.787,,,,,,,,,,,,,,,,,,,,0.119,,,,,0.118,,,,,,
2,Afghanistan,1852,AFG,3787706.0,,,,,,,,,,,,,,,,,,,,,,,,,,,8.867,,,,,,,,,,,2.968,0.784,,,,,,,,,,,,,,,,,,,,0.118,,,,,0.116,,,,,,
3,Afghanistan,1853,AFG,3806634.0,,,,,,,,,,,,,,,,,,,,,,,,,,,11.871,,,,,,,,,,,3.004,0.789,,,,,,,,,,,,,,,,,,,,0.118,,,,,0.115,,,,,,
4,Afghanistan,1854,AFG,3825655.0,,,,,,,,,,,,,,,,,,,,,,,,,,,14.876,,,,,,,,,,,3.004,0.785,,,,,,,,,,,,,,,,,,,,0.117,,,,,0.114,,,,,,
5,Afghanistan,1855,AFG,3844769.0,,,,,,,,,,,,,,,,,,,,,,,,,,,17.88,,,,,,,,,,,3.004,0.781,,,,,,,,,,,,,,,,,,,,0.116,,,,,0.114,,,,,,
6,Afghanistan,1856,AFG,3863976.0,,,,,,,,,,,,,,,,,,,,,,,,,,,20.921,,,,,,,,,,,3.041,0.787,,,,,,,,,,,,,,,,,,,,0.116,,,,,0.114,,,,,,
7,Afghanistan,1857,AFG,3883276.0,,,,,,,,,,,,,,,,,,,,,,,,,,,23.963,,,,,,,,,,,3.041,0.783,,,,,,,,,,,,,,,,,,,,0.115,,,,,0.113,,,,,,
8,Afghanistan,1858,AFG,3902671.0,,,,,,,,,,,,,,,,,,,,,,,,,,,27.004,,,,,,,,,,,3.041,0.779,,,,,,,,,,,,,,,,,,,,0.115,,,,,0.111,,,,,,
9,Afghanistan,1859,AFG,3922160.0,,,,,,,,,,,,,,,,,,,,,,,,,,,30.045,,,,,,,,,,,3.041,0.775,,,,,,,,,,,,,,,,,,,,0.114,,,,,0.11,,,,,,


In [8]:
# Print the last 10 rows of the dataframe

df_co2.tail(10)

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,co2_including_luc,co2_including_luc_growth_abs,co2_including_luc_growth_prct,co2_including_luc_per_capita,co2_including_luc_per_gdp,co2_including_luc_per_unit_energy,co2_per_capita,co2_per_gdp,co2_per_unit_energy,coal_co2,coal_co2_per_capita,consumption_co2,consumption_co2_per_capita,consumption_co2_per_gdp,cumulative_cement_co2,cumulative_co2,cumulative_co2_including_luc,cumulative_coal_co2,cumulative_flaring_co2,cumulative_gas_co2,cumulative_luc_co2,cumulative_oil_co2,cumulative_other_co2,energy_per_capita,energy_per_gdp,flaring_co2,flaring_co2_per_capita,gas_co2,gas_co2_per_capita,ghg_excluding_lucf_per_capita,ghg_per_capita,land_use_change_co2,land_use_change_co2_per_capita,methane,methane_per_capita,nitrous_oxide,nitrous_oxide_per_capita,oil_co2,oil_co2_per_capita,other_co2_per_capita,other_industry_co2,primary_energy_consumption,share_global_cement_co2,share_global_co2,share_global_co2_including_luc,share_global_coal_co2,share_global_cumulative_cement_co2,share_global_cumulative_co2,share_global_cumulative_co2_including_luc,share_global_cumulative_coal_co2,share_global_cumulative_flaring_co2,share_global_cumulative_gas_co2,share_global_cumulative_luc_co2,share_global_cumulative_oil_co2,share_global_cumulative_other_co2,share_global_flaring_co2,share_global_gas_co2,share_global_luc_co2,share_global_oil_co2,share_global_other_co2,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share
46513,Zimbabwe,2012,ZWE,13265331.0,20909970000.0,0.566,0.043,11.254,0.891,8.594,25.58,-2.993,-10.476,1.928,1.223,0.47,0.848,0.538,0.207,6.995,0.527,12.089,0.911,0.578,16.004,697.401,2863.008,562.794,0.0,0.0,2226.283,118.602,,4106.95,2.605,0.0,0.0,0.0,0.0,2.316,3.188,14.326,1.08,12.59,0.949,5.94,0.448,3.693,0.278,,,54.48,0.041,0.032,0.064,0.047,0.051,0.049,0.136,0.084,0.0,0.0,0.317,0.024,,0.0,0.0,0.297,0.032,,42.29,30.72,0.835,7.418
46514,Zimbabwe,2013,ZWE,13555420.0,21123500000.0,0.463,0.034,11.671,0.418,3.711,24.679,-0.901,-3.524,1.821,1.168,0.446,0.861,0.553,0.211,7.104,0.524,12.353,0.911,0.585,16.468,709.072,2887.687,569.899,0.0,0.0,2239.29,122.706,,4085.332,2.622,0.0,0.0,0.0,0.0,2.258,3.111,13.007,0.96,12.45,0.918,5.62,0.415,4.104,0.303,,,55.378,0.032,0.033,0.062,0.047,0.05,0.049,0.134,0.083,0.0,0.0,0.317,0.024,,0.0,0.0,0.281,0.035,,42.17,30.61,0.682,5.842
46515,Zimbabwe,2014,ZWE,13855758.0,21222500000.0,0.496,0.036,11.946,0.275,2.354,23.451,-1.227,-4.974,1.693,1.105,0.429,0.862,0.563,0.219,7.732,0.558,12.936,0.934,0.61,16.964,721.019,2911.138,577.631,0.0,0.0,2250.795,126.424,,3940.886,2.573,0.0,0.0,0.0,0.0,2.086,2.915,11.505,0.83,11.45,0.826,4.99,0.36,3.718,0.268,,,54.604,0.033,0.034,0.058,0.051,0.05,0.049,0.133,0.082,0.0,0.0,0.316,0.024,,0.0,0.0,0.234,0.032,,40.39,28.9,0.99,8.284
46516,Zimbabwe,2015,ZWE,14154937.0,21027450000.0,0.585,0.041,12.255,0.309,2.585,22.99,-0.461,-1.964,1.624,1.093,0.421,0.866,0.583,0.224,8.061,0.569,13.588,0.96,0.646,17.549,733.274,2934.128,585.692,0.0,0.0,2261.531,130.033,,3860.92,2.599,0.0,0.0,0.0,0.0,2.169,2.988,10.736,0.758,12.4,0.876,5.55,0.392,3.609,0.255,,,54.651,0.041,0.034,0.056,0.055,0.049,0.048,0.131,0.082,0.0,0.0,0.315,0.024,,0.0,0.0,0.199,0.03,,42.3,30.7,1.333,10.879
46517,Zimbabwe,2016,ZWE,14452705.0,20961790000.0,0.461,0.032,10.533,-1.722,-14.047,20.903,-2.088,-9.082,1.446,0.997,0.448,0.729,0.503,0.226,6.932,0.48,11.938,0.826,0.569,18.01,743.807,2955.031,592.624,0.0,0.0,2271.9,133.173,,3228.872,2.226,0.0,0.0,0.0,0.0,1.974,8.021,10.369,0.717,11.98,0.829,5.1,0.353,3.14,0.217,,,46.666,0.031,0.03,0.052,0.048,0.049,0.048,0.13,0.081,0.0,0.0,0.315,0.024,,0.0,0.0,0.229,0.026,,115.92,28.53,1.404,13.33
46518,Zimbabwe,2017,ZWE,14751101.0,21947840000.0,0.469,0.032,9.596,-0.937,-8.899,19.086,-1.817,-8.691,1.294,0.87,0.415,0.651,0.437,0.209,5.9,0.4,10.506,0.712,0.479,18.479,753.403,2974.117,598.525,0.0,0.0,2281.39,136.4,,3114.076,2.093,0.0,0.0,0.0,0.0,1.919,7.836,9.49,0.643,12.12,0.822,5.24,0.355,3.227,0.219,,,45.936,0.031,0.027,0.047,0.041,0.048,0.047,0.129,0.08,0.0,0.0,0.314,0.024,,0.0,0.0,0.219,0.026,,115.59,28.3,0.91,9.486
46519,Zimbabwe,2018,ZWE,15052191.0,22715350000.0,0.558,0.037,11.795,2.199,22.92,20.626,1.54,8.068,1.37,0.908,0.434,0.784,0.519,0.248,7.178,0.477,12.567,0.835,0.553,19.037,765.199,2994.742,605.703,0.0,0.0,2290.22,140.459,,3155.824,2.091,0.0,0.0,0.0,0.0,2.048,7.854,8.83,0.587,12.44,0.826,5.43,0.361,4.06,0.27,,,47.502,0.036,0.032,0.05,0.049,0.047,0.047,0.127,0.08,0.0,0.0,0.314,0.025,,0.0,0.0,0.211,0.033,,118.22,30.83,0.771,6.537
46520,Zimbabwe,2019,ZWE,15354606.0,,0.57,0.037,11.115,-0.681,-5.772,19.432,-1.194,-5.788,1.266,,0.393,0.724,,0.225,6.888,0.449,12.092,0.788,,19.606,776.313,3014.175,612.591,0.0,0.0,2298.537,144.116,,3219.013,,0.0,0.0,0.0,0.0,1.988,7.682,8.317,0.542,12.68,0.826,5.5,0.358,3.657,0.238,,,49.427,0.035,0.03,0.047,0.047,0.047,0.047,0.126,0.079,0.0,0.0,0.313,0.025,,0.0,0.0,0.183,0.03,,117.96,30.53,0.978,8.795
46521,Zimbabwe,2020,ZWE,15669663.0,,0.57,0.036,10.608,-0.507,-4.559,18.485,-0.946,-4.87,1.18,,,0.677,,,6.722,0.429,11.614,0.741,,20.176,786.921,3032.66,619.312,0.0,0.0,2306.415,147.433,,,,0.0,0.0,0.0,0.0,,,7.878,0.503,,,,,3.317,0.212,,,,0.035,0.03,0.047,0.047,0.046,0.046,0.125,0.078,0.0,0.0,0.312,0.025,,0.0,0.0,0.194,0.03,,,,1.006,9.481
46522,Zimbabwe,2021,ZWE,15993525.0,,0.57,0.036,11.296,0.688,6.488,18.771,0.285,1.543,1.174,,,0.706,,,7.226,0.452,,,,20.745,798.217,3051.431,626.538,0.0,0.0,2313.889,150.934,,,,0.0,0.0,0.0,0.0,,,7.475,0.467,,,,,3.501,0.219,,,,0.034,0.03,0.046,0.048,0.046,0.046,0.123,0.078,0.0,0.0,0.312,0.025,,0.0,0.0,0.19,0.03,,,,,


In [9]:
# Check column names to decide which columns are not needed for the analysis

df_co2.columns

Index(['country', 'year', 'iso_code', 'population', 'gdp', 'cement_co2',
       'cement_co2_per_capita', 'co2', 'co2_growth_abs', 'co2_growth_prct',
       'co2_including_luc', 'co2_including_luc_growth_abs',
       'co2_including_luc_growth_prct', 'co2_including_luc_per_capita',
       'co2_including_luc_per_gdp', 'co2_including_luc_per_unit_energy',
       'co2_per_capita', 'co2_per_gdp', 'co2_per_unit_energy', 'coal_co2',
       'coal_co2_per_capita', 'consumption_co2', 'consumption_co2_per_capita',
       'consumption_co2_per_gdp', 'cumulative_cement_co2', 'cumulative_co2',
       'cumulative_co2_including_luc', 'cumulative_coal_co2',
       'cumulative_flaring_co2', 'cumulative_gas_co2', 'cumulative_luc_co2',
       'cumulative_oil_co2', 'cumulative_other_co2', 'energy_per_capita',
       'energy_per_gdp', 'flaring_co2', 'flaring_co2_per_capita', 'gas_co2',
       'gas_co2_per_capita', 'ghg_excluding_lucf_per_capita', 'ghg_per_capita',
       'land_use_change_co2', 'land_use_chang

### There are 74 columns total. I am not going to need all of them for my analysis, so I'm going to drop the columns that I don't need.

## 3.1 Dropping Columns

In [10]:
# Remove the columns not needed for the analysis
    
df_co2 = df_co2.drop(columns = ['co2_including_luc', 'co2_including_luc_growth_abs', 'co2_including_luc_growth_prct', 'co2_including_luc_per_capita', 'co2_including_luc_per_gdp', 'co2_including_luc_per_unit_energy', 'co2_per_unit_energy', 'consumption_co2', 'consumption_co2_per_capita', 'consumption_co2_per_gdp', 'cumulative_co2_including_luc',  'cumulative_luc_co2', 'ghg_excluding_lucf_per_capita', 'ghg_per_capita', 'land_use_change_co2', 'land_use_change_co2_per_capita', 'methane', 'methane_per_capita', 'nitrous_oxide', 'nitrous_oxide_per_capita', 'share_global_co2_including_luc', 'share_global_cumulative_co2_including_luc','share_global_luc_co2', 'share_global_cumulative_luc_co2', 'total_ghg', 'total_ghg_excluding_lucf', 'trade_co2', 'trade_co2_share'])

 ### All columns including land-use change (luc) have been dropped since I'm going to focus my analysis on emissions associated with  energy and industrial production. Furthermore, I removed the columns associated with CO2 consumption since I am just going to look at production-based CO2 emissions and not at trade. Also all columns associated with other greenhouse gases have been dropped since I'm going to focus on carbon dioxide in my analysis. I also don't need the column "CO2_per_unit_energy" column for my analysis.

In [11]:
# Check the dataframe to make sure the columns have been dropped

df_co2.head()

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,co2_per_capita,co2_per_gdp,coal_co2,coal_co2_per_capita,cumulative_cement_co2,cumulative_co2,cumulative_coal_co2,cumulative_flaring_co2,cumulative_gas_co2,cumulative_oil_co2,cumulative_other_co2,energy_per_capita,energy_per_gdp,flaring_co2,flaring_co2_per_capita,gas_co2,gas_co2_per_capita,oil_co2,oil_co2_per_capita,other_co2_per_capita,other_industry_co2,primary_energy_consumption,share_global_cement_co2,share_global_co2,share_global_coal_co2,share_global_cumulative_cement_co2,share_global_cumulative_co2,share_global_cumulative_coal_co2,share_global_cumulative_flaring_co2,share_global_cumulative_gas_co2,share_global_cumulative_oil_co2,share_global_cumulative_other_co2,share_global_flaring_co2,share_global_gas_co2,share_global_oil_co2,share_global_other_co2
0,Afghanistan,1850,AFG,3752993.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Afghanistan,1851,AFG,3769828.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Afghanistan,1852,AFG,3787706.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Afghanistan,1853,AFG,3806634.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Afghanistan,1854,AFG,3825655.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [12]:
# Check the size of the dataframe after the columns have been dropped

df_co2.shape

(46523, 46)

### 28 columns that  are not needed for the analysis have been dropped. There are 46 columns left.

## 3.2 Checking datatypes

In [13]:
# Check columns datatypes

df_co2.dtypes

country                                 object
year                                     int64
iso_code                                object
population                             float64
gdp                                    float64
cement_co2                             float64
cement_co2_per_capita                  float64
co2                                    float64
co2_growth_abs                         float64
co2_growth_prct                        float64
co2_per_capita                         float64
co2_per_gdp                            float64
coal_co2                               float64
coal_co2_per_capita                    float64
cumulative_cement_co2                  float64
cumulative_co2                         float64
cumulative_coal_co2                    float64
cumulative_flaring_co2                 float64
cumulative_gas_co2                     float64
cumulative_oil_co2                     float64
cumulative_other_co2                   float64
energy_per_ca

### Datatypes are correct.

  ## 3.3 Checking for Missing Values

In [14]:
# Check the dataframe for missing values

df_co2.isnull().sum()

country                                    0
year                                       0
iso_code                                6661
population                              7949
gdp                                    31972
cement_co2                             21549
cement_co2_per_capita                  23809
co2                                    15174
co2_growth_abs                         17579
co2_growth_prct                        21491
co2_per_capita                         19608
co2_per_gdp                            30232
coal_co2                               21454
coal_co2_per_capita                    22102
cumulative_cement_co2                  21650
cumulative_co2                         17167
cumulative_coal_co2                    21555
cumulative_flaring_co2                 21749
cumulative_gas_co2                     21681
cumulative_oil_co2                     21582
cumulative_other_co2                   44137
energy_per_capita                      36536
energy_per

In [15]:
# Check the sum of missing values

df_co2.isnull().sum().sum()

1114472

### The dataframe consists of 46523 rows and 46 columns, i.e., it contains 2.140.058 values. 1.114472 values are missing which makes up 52% of the total values. Looking at the columns with the most missing values, I decide to drop all columns dealing with CO2 for other industries to reduce the number of missing values.

In [16]:
# Remove the columns with the highest numbers of missing values
    
df_co2 = df_co2.drop(columns = ['cumulative_other_co2', 'other_co2_per_capita', 'other_industry_co2', 'share_global_cumulative_other_co2', 'share_global_other_co2'])

In [17]:
# Check the size of the dataframe after the columns have been dropped

df_co2.shape

(46523, 41)

In [18]:
# Check the dataframe for missing values after dropping the columns

df_co2.isnull().sum()

country                                    0
year                                       0
iso_code                                6661
population                              7949
gdp                                    31972
cement_co2                             21549
cement_co2_per_capita                  23809
co2                                    15174
co2_growth_abs                         17579
co2_growth_prct                        21491
co2_per_capita                         19608
co2_per_gdp                            30232
coal_co2                               21454
coal_co2_per_capita                    22102
cumulative_cement_co2                  21650
cumulative_co2                         17167
cumulative_coal_co2                    21555
cumulative_flaring_co2                 21749
cumulative_gas_co2                     21681
cumulative_oil_co2                     21582
energy_per_capita                      36536
energy_per_gdp                         39364
flaring_co

In [19]:
# Check the sum of missing values

df_co2.isnull().sum().sum()

893787

### The dataframe consists of 46523 rows and 41 columns, i.e., it contains 1.907.443 values. 893.787 values are missing which makes up 47% of the total values. These are still a lot of missing values.

## 3.4 Checking the frequencies for some columns

In [20]:
# Remove the limit to max rows displayed

pd.set_option('display.max_rows', None)

In [21]:
# Check the frequencies for the "country" column

df_co2['country'].value_counts(dropna = False)

South America                       272
Singapore                           272
Iceland                             272
United Kingdom                      272
European Union (28)                 272
Guadeloupe                          272
Cambodia                            272
Europe (excl. EU-28)                272
Tuvalu                              272
Brunei                              272
Africa                              272
Europe (excl. EU-27)                272
Europe                              272
Guinea                              272
Sierra Leone                        272
Oceania                             272
Australia                           272
Asia (excl. China and India)        272
North America (excl. USA)           272
Low-income countries                272
Martinique                          272
Mauritania                          272
World                               272
Cote d'Ivoire                       272
Lower-middle-income countries       272


In [22]:
# Count unique values in the "country" column

df_co2['country'].nunique()

269

### What is noticable here is that the "country" column not only includes countries, but also regions and continents. There are 269 unique values, the maximum recoreded observations are 272, a lot of countries have less observations.

In [23]:
# Check the frequencies for the "year" column

df_co2['year'].value_counts(dropna = False)

1991    261
1998    260
2006    260
1990    260
1992    260
1993    260
1994    260
1995    260
1996    260
1997    260
1999    260
2000    260
2001    260
2002    260
2003    260
2005    260
2004    260
2007    260
2014    260
2021    260
2020    260
2019    260
2018    260
2017    260
2008    260
2015    260
2016    260
2013    260
2012    260
2011    260
2010    260
2009    260
1972    258
1971    258
1979    257
1974    257
1976    257
1977    257
1978    257
1975    257
1970    257
1973    257
1983    256
1982    256
1981    256
1980    256
1987    256
1988    256
1989    256
1969    255
1968    255
1986    255
1985    255
1984    255
1967    254
1966    254
1965    254
1964    254
1963    253
1955    252
1962    252
1956    252
1957    252
1958    251
1961    251
1959    250
1960    250
1954    250
1953    249
1952    249
1951    248
1950    248
1949    243
1935    242
1945    242
1944    242
1943    242
1942    242
1941    242
1940    242
1939    242
1938    242
1937    242
1936

In [24]:
# Count unique values in the "year" column

df_co2['year'].nunique()

272

### There are 272 unique values, with a maximum of 261 observations. There are several years which have less observations.

## 3.5 Rounding decimal numbers

### To make the numbers more consistent all of them were rounded to 2 decimal places and population to 0 decimal places.

In [25]:
df_co2.population = df_co2.population.round()

In [26]:
df_co2.gdp = df_co2.gdp.round(2)

In [27]:
df_co2.cement_co2 = df_co2.cement_co2.round(2)

In [28]:
df_co2.cement_co2_per_capita = df_co2.cement_co2_per_capita.round(2)

In [29]:
df_co2.co2 = df_co2.co2.round(2)

In [30]:
df_co2.co2_growth_abs = df_co2.co2_growth_abs.round(2)

In [31]:
df_co2.co2_growth_prct = df_co2.co2_growth_prct.round(2)

In [32]:
df_co2.co2_per_capita = df_co2.co2_per_capita.round(2)

In [33]:
df_co2.co2_per_gdp = df_co2.co2_per_gdp.round(2)

In [34]:
df_co2.coal_co2 = df_co2.coal_co2.round(2)

In [35]:
df_co2.coal_co2_per_capita = df_co2.coal_co2_per_capita.round(2)

In [36]:
df_co2.cumulative_cement_co2 = df_co2.cumulative_cement_co2.round(2)

In [37]:
df_co2.cumulative_co2 = df_co2.cumulative_co2.round(2)

In [38]:
df_co2.cumulative_coal_co2 = df_co2.cumulative_coal_co2.round(2)

In [39]:
df_co2.cumulative_flaring_co2 = df_co2.flaring_co2.round(2)

In [40]:
df_co2.cumulative_gas_co2 = df_co2.cumulative_gas_co2.round(2)

In [41]:
df_co2.cumulative_oil_co2 = df_co2.cumulative_oil_co2.round(2)

In [42]:
df_co2.energy_per_capita = df_co2.energy_per_capita.round(2)

In [43]:
df_co2.energy_per_gdp = df_co2.energy_per_gdp.round(2)

In [44]:
df_co2.flaring_co2 = df_co2.flaring_co2.round(2)

In [45]:
df_co2.flaring_co2_per_capita = df_co2.flaring_co2_per_capita.round(2)

In [46]:
df_co2.gas_co2 = df_co2.gas_co2.round(2)

In [47]:
df_co2.gas_co2_per_capita = df_co2.gas_co2_per_capita.round(2)

In [48]:
df_co2.oil_co2 = df_co2.oil_co2.round(2)

In [49]:
df_co2.oil_co2_per_capita = df_co2.oil_co2_per_capita.round(2)

In [50]:
df_co2.primary_energy_consumption = df_co2.primary_energy_consumption.round(2)

In [51]:
df_co2.share_global_cement_co2 = df_co2.share_global_cement_co2.round(2)

In [52]:
df_co2.share_global_co2 = df_co2.share_global_co2.round(2)

In [53]:
df_co2.share_global_coal_co2 = df_co2.share_global_coal_co2.round(2)

In [54]:
df_co2.share_global_cumulative_cement_co2 = df_co2.share_global_cumulative_cement_co2.round(2)

In [55]:
df_co2.share_global_cumulative_co2 = df_co2.share_global_cumulative_co2.round(2)

In [56]:
df_co2.share_global_cumulative_coal_co2 = df_co2.share_global_cumulative_coal_co2.round(2)

In [57]:
df_co2.share_global_cumulative_flaring_co2 = df_co2.share_global_cumulative_flaring_co2.round(2)

In [58]:
df_co2.share_global_cumulative_gas_co2 = df_co2.share_global_cumulative_gas_co2.round(2)

In [59]:
df_co2.share_global_cumulative_oil_co2 = df_co2.share_global_cumulative_oil_co2.round(2)

In [60]:
df_co2.share_global_flaring_co2 = df_co2.share_global_flaring_co2.round(2)

In [61]:
df_co2.share_global_gas_co2 = df_co2.share_global_gas_co2.round(2)

In [62]:
df_co2.share_global_oil_co2 = df_co2.share_global_oil_co2.round(2)

In [63]:
# Check the result

df_co2.tail(10)

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,co2_per_capita,co2_per_gdp,coal_co2,coal_co2_per_capita,cumulative_cement_co2,cumulative_co2,cumulative_coal_co2,cumulative_flaring_co2,cumulative_gas_co2,cumulative_oil_co2,energy_per_capita,energy_per_gdp,flaring_co2,flaring_co2_per_capita,gas_co2,gas_co2_per_capita,oil_co2,oil_co2_per_capita,primary_energy_consumption,share_global_cement_co2,share_global_co2,share_global_coal_co2,share_global_cumulative_cement_co2,share_global_cumulative_co2,share_global_cumulative_coal_co2,share_global_cumulative_flaring_co2,share_global_cumulative_gas_co2,share_global_cumulative_oil_co2,share_global_flaring_co2,share_global_gas_co2,share_global_oil_co2
46513,Zimbabwe,2012,ZWE,13265331.0,20909970000.0,0.57,0.04,11.25,0.89,8.59,0.85,0.54,7.0,0.53,16.0,697.4,562.79,0.0,0.0,118.6,4106.95,2.6,0.0,0.0,0.0,0.0,3.69,0.28,54.48,0.04,0.03,0.05,0.05,0.05,0.08,0.0,0.0,0.02,0.0,0.0,0.03
46514,Zimbabwe,2013,ZWE,13555420.0,21123500000.0,0.46,0.03,11.67,0.42,3.71,0.86,0.55,7.1,0.52,16.47,709.07,569.9,0.0,0.0,122.71,4085.33,2.62,0.0,0.0,0.0,0.0,4.1,0.3,55.38,0.03,0.03,0.05,0.05,0.05,0.08,0.0,0.0,0.02,0.0,0.0,0.04
46515,Zimbabwe,2014,ZWE,13855758.0,21222500000.0,0.5,0.04,11.95,0.28,2.35,0.86,0.56,7.73,0.56,16.96,721.02,577.63,0.0,0.0,126.42,3940.89,2.57,0.0,0.0,0.0,0.0,3.72,0.27,54.6,0.03,0.03,0.05,0.05,0.05,0.08,0.0,0.0,0.02,0.0,0.0,0.03
46516,Zimbabwe,2015,ZWE,14154937.0,21027450000.0,0.58,0.04,12.26,0.31,2.58,0.87,0.58,8.06,0.57,17.55,733.27,585.69,0.0,0.0,130.03,3860.92,2.6,0.0,0.0,0.0,0.0,3.61,0.26,54.65,0.04,0.03,0.06,0.05,0.05,0.08,0.0,0.0,0.02,0.0,0.0,0.03
46517,Zimbabwe,2016,ZWE,14452705.0,20961790000.0,0.46,0.03,10.53,-1.72,-14.05,0.73,0.5,6.93,0.48,18.01,743.81,592.62,0.0,0.0,133.17,3228.87,2.23,0.0,0.0,0.0,0.0,3.14,0.22,46.67,0.03,0.03,0.05,0.05,0.05,0.08,0.0,0.0,0.02,0.0,0.0,0.03
46518,Zimbabwe,2017,ZWE,14751101.0,21947840000.0,0.47,0.03,9.6,-0.94,-8.9,0.65,0.44,5.9,0.4,18.48,753.4,598.52,0.0,0.0,136.4,3114.08,2.09,0.0,0.0,0.0,0.0,3.23,0.22,45.94,0.03,0.03,0.04,0.05,0.05,0.08,0.0,0.0,0.02,0.0,0.0,0.03
46519,Zimbabwe,2018,ZWE,15052191.0,22715350000.0,0.56,0.04,11.8,2.2,22.92,0.78,0.52,7.18,0.48,19.04,765.2,605.7,0.0,0.0,140.46,3155.82,2.09,0.0,0.0,0.0,0.0,4.06,0.27,47.5,0.04,0.03,0.05,0.05,0.05,0.08,0.0,0.0,0.02,0.0,0.0,0.03
46520,Zimbabwe,2019,ZWE,15354606.0,,0.57,0.04,11.12,-0.68,-5.77,0.72,,6.89,0.45,19.61,776.31,612.59,0.0,0.0,144.12,3219.01,,0.0,0.0,0.0,0.0,3.66,0.24,49.43,0.04,0.03,0.05,0.05,0.05,0.08,0.0,0.0,0.02,0.0,0.0,0.03
46521,Zimbabwe,2020,ZWE,15669663.0,,0.57,0.04,10.61,-0.51,-4.56,0.68,,6.72,0.43,20.18,786.92,619.31,0.0,0.0,147.43,,,0.0,0.0,0.0,0.0,3.32,0.21,,0.04,0.03,0.05,0.05,0.05,0.08,0.0,0.0,0.02,0.0,0.0,0.03
46522,Zimbabwe,2021,ZWE,15993525.0,,0.57,0.04,11.3,0.69,6.49,0.71,,7.23,0.45,20.74,798.22,626.54,0.0,0.0,150.93,,,0.0,0.0,0.0,0.0,3.5,0.22,,0.03,0.03,0.05,0.05,0.05,0.08,0.0,0.0,0.02,0.0,0.0,0.03


### Values are rounded correctly.

# 4. Consistency Checks

## 4.1 Checking for mixed-type data columns

In [64]:
# Check if there are mixed-type columns

for col in df_co2.columns.tolist():
  weird = (df_co2[[col]].applymap(type) != df_co2[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_co2[weird]) > 0:
    print (col)

iso_code


### The "iso_code" column has mixed-type data.

In [65]:
# Change the data type of the "iso_code" column to a string since the column contains abbreviations for country names

df_co2['iso_code'] = df_co2['iso_code'].astype('str')

## 4.2 Checking for Duplicates¶

In [66]:
# Check the dataframe for duplicates

df_dups=df_co2[df_co2.duplicated()]

In [67]:
# Test

df_dups

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,co2_per_capita,co2_per_gdp,coal_co2,coal_co2_per_capita,cumulative_cement_co2,cumulative_co2,cumulative_coal_co2,cumulative_flaring_co2,cumulative_gas_co2,cumulative_oil_co2,energy_per_capita,energy_per_gdp,flaring_co2,flaring_co2_per_capita,gas_co2,gas_co2_per_capita,oil_co2,oil_co2_per_capita,primary_energy_consumption,share_global_cement_co2,share_global_co2,share_global_coal_co2,share_global_cumulative_cement_co2,share_global_cumulative_co2,share_global_cumulative_coal_co2,share_global_cumulative_flaring_co2,share_global_cumulative_gas_co2,share_global_cumulative_oil_co2,share_global_flaring_co2,share_global_gas_co2,share_global_oil_co2


### There are no duplicates.

# 5. Descriptive Statistics

In [68]:
# Check for abnormalities in descriptive statistics

df_co2.describe()

Unnamed: 0,year,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,co2_per_capita,co2_per_gdp,coal_co2,coal_co2_per_capita,cumulative_cement_co2,cumulative_co2,cumulative_coal_co2,cumulative_flaring_co2,cumulative_gas_co2,cumulative_oil_co2,energy_per_capita,energy_per_gdp,flaring_co2,flaring_co2_per_capita,gas_co2,gas_co2_per_capita,oil_co2,oil_co2_per_capita,primary_energy_consumption,share_global_cement_co2,share_global_co2,share_global_coal_co2,share_global_cumulative_cement_co2,share_global_cumulative_co2,share_global_cumulative_coal_co2,share_global_cumulative_flaring_co2,share_global_cumulative_gas_co2,share_global_cumulative_oil_co2,share_global_flaring_co2,share_global_gas_co2,share_global_oil_co2
count,46523.0,38574.0,14551.0,24974.0,22714.0,31349.0,28944.0,25032.0,26915.0,16291.0,25069.0,24421.0,24873.0,29356.0,24968.0,24875.0,24842.0,24941.0,9987.0,7159.0,24875.0,24272.0,24943.0,24340.0,25042.0,24367.0,10085.0,20830.0,29356.0,24968.0,20830.0,29356.0,24968.0,15980.0,21990.0,23421.0,15980.0,21990.0,23421.0
mean,1925.686478,60053740.0,267997700000.0,8.386319,0.063867,379.988026,5.713787,20.490187,3.668287,0.41822,154.396128,1.095743,200.130462,10834.2,7692.326206,3.442077,1213.004871,3508.377147,25778.93664,1.809802,3.442077,0.155455,49.985359,0.528233,111.227255,2.204266,2367.241917,3.295904,4.995461,5.913856,3.326577,5.116227,6.068638,1.945057,2.759578,3.342084,2.031497,2.823079,3.325212
std,61.042693,328082800.0,2104075000000.0,63.013589,0.123098,1799.875838,58.766155,699.580069,14.947738,0.559681,751.42085,2.229621,1410.409403,64880.12,38208.689415,20.13099,8788.686975,22972.403591,36727.25089,1.780248,20.13099,2.310965,309.087687,2.286682,624.032441,14.914685,10799.647619,14.059939,18.075952,19.644598,13.956664,18.838044,20.453044,8.940426,14.078772,12.986094,8.955714,13.680785,12.789923
min,1750.0,21.0,49980000.0,0.0,0.0,0.0,-1818.47,-100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1882.0,380891.2,7530493000.0,0.0,0.0,0.12,0.0,-0.54,0.12,0.14,0.0,0.0,0.0,0.94,0.05,0.0,0.0,0.8,3164.865,0.86,0.0,0.0,0.0,0.0,0.08,0.04,6.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1930.0,2509282.0,26059000000.0,0.01,0.0,3.11,0.02,3.79,0.89,0.28,0.66,0.09,0.11,43.375,14.49,0.0,0.0,21.18,13451.91,1.31,0.0,0.0,0.0,0.0,1.24,0.38,61.44,0.03,0.03,0.02,0.03,0.02,0.01,0.0,0.0,0.05,0.0,0.0,0.06
75%,1977.0,9996447.0,113471100000.0,0.69,0.09,43.66,0.77,10.58,4.07,0.53,16.48,1.14,13.62,808.43,522.1475,0.0,12.855,275.75,36031.18,2.21,0.0,0.0,1.24,0.08,13.1,1.97,423.56,0.4,0.45,0.56,0.41,0.31,0.41,0.05,0.07,0.49,0.05,0.12,0.52
max,2021.0,7909295000.0,113630200000000.0,1672.59,2.57,37123.85,1859.76,102318.51,824.46,37.61,15051.51,34.23,45066.93,1736930.0,804190.0,439.25,254057.83,605476.75,657539.19,25.25,439.25,113.22,7921.83,53.31,12345.65,824.46,165319.69,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


### Descriptive statistics look good. It's noticeable that a lot of the variables have zero as a minimum value which could mean that values are missing or CO2 emissions maybe didn't reach the threshold to be recorded. Also, there is a huge gap between the minimum value for GDP and the maximum value, indicating huge differences between poor countries and rich countries.

# 6. Creating a Subset

In [69]:
# Define an array with the unique "year" values to be able to check the "year" column for missing values to decide how many years my analysis is going to include

years_count_missing = dict.fromkeys(df_co2['year'].unique(), 0)
for ind, row in df_co2.iterrows():
    years_count_missing[row['year']] += row.isnull().sum()

In [70]:
# Sort the years by missing values

years_missing_sorted = dict(sorted(years_count_missing.items(), key=lambda item: item[1]))

In [71]:
# Print the missing values for each year

print("missing values by year:")
for key, val in years_missing_sorted.items():
    print(key, ":", val)

missing values by year:
1790 : 849
1760 : 863
1770 : 863
1780 : 863
1750 : 903
1820 : 909
1800 : 915
1801 : 926
1803 : 926
1805 : 926
1806 : 926
1808 : 926
1809 : 926
1810 : 928
1821 : 939
1822 : 939
1823 : 939
1824 : 939
1825 : 939
1826 : 939
1827 : 939
1828 : 939
1811 : 942
1812 : 942
1813 : 942
1814 : 942
1815 : 942
1816 : 942
1817 : 942
1818 : 942
1804 : 943
1829 : 943
1807 : 944
1819 : 959
1802 : 962
1840 : 978
1751 : 982
1752 : 982
1753 : 982
1754 : 982
1755 : 982
1756 : 982
1757 : 982
1758 : 982
1759 : 982
1761 : 982
1762 : 982
1763 : 982
1764 : 982
1765 : 982
1766 : 982
1767 : 982
1768 : 982
1769 : 982
1771 : 982
1772 : 982
1773 : 982
1774 : 982
1775 : 982
1776 : 982
1777 : 982
1778 : 982
1779 : 982
1781 : 982
1782 : 982
1783 : 982
1784 : 982
1786 : 982
1787 : 982
1788 : 982
1789 : 982
1791 : 982
1835 : 985
1836 : 985
1837 : 985
1838 : 985
1785 : 986
1834 : 997
1841 : 999
1842 : 999
1839 : 1000
1848 : 1000
1849 : 1000
1845 : 1011
1844 : 1013
1843 : 1015
1847 : 1016
1846 : 1018


### Most of the values are missing for years in the 1800s and 1900s. To work with more timely data, I am also creating a subset containing only data from last 10 years (from 2012 to 2021).

In [72]:
# Create a subset that only contains data for the years 2012-2021

df_co2_recent = df_co2.loc[df_co2['year'].isin([ 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021])]

In [73]:
# Check the size of the new dataframe

df_co2_recent.shape

(2600, 41)

### The new dataframe is considerably shorter (only 2600 rows).

## 6.1 Checking the Subset for missing values

In [74]:
# Check the new dataframe for missing values

df_co2_recent.isnull().sum()

country                                   0
year                                      0
iso_code                                  0
population                              198
gdp                                    1442
cement_co2                              257
cement_co2_per_capita                   257
co2                                     120
co2_growth_abs                          240
co2_growth_prct                         220
co2_per_capita                          240
co2_per_gdp                            1337
coal_co2                                240
coal_co2_per_capita                     240
cumulative_cement_co2                   257
cumulative_co2                          230
cumulative_coal_co2                     240
cumulative_flaring_co2                  240
cumulative_gas_co2                      240
cumulative_oil_co2                      230
energy_per_capita                       616
energy_per_gdp                         1442
flaring_co2                     

In [75]:
# Check the sum of missing values for the new dataframe

df_co2_recent.isnull().sum().sum()

13194

### After filtering out the years before 2012, there are 13% of missing values in the new dataframe which is still a large number of missing values, but removing them would lead to losing a lot of data.

In [76]:
# Remove the limit to max rows displayed

pd.set_option('display.max_rows', None)

## 6.2 Checking the frequencies for some columns

In [77]:
# Check the frequencies for the "country" column

df_co2_recent['country'].value_counts(dropna = False)

Afghanistan                         10
Netherlands Antilles                10
New Zealand                         10
Nicaragua                           10
Niger                               10
Nigeria                             10
Niue                                10
Non-OECD (GCP)                      10
North America                       10
North America (GCP)                 10
North America (excl. USA)           10
North Korea                         10
North Macedonia                     10
Norway                              10
OECD (GCP)                          10
Oceania                             10
Oceania (GCP)                       10
Oman                                10
Pakistan                            10
Palau                               10
Palestine                           10
Panama                              10
Papua New Guinea                    10
Paraguay                            10
Peru                                10
Philippines              

In [78]:
# Count unique values in the "country" column

df_co2_recent['country'].nunique()

260

### What is noticable here is that the "country" column not only includes countries, but also regions and continents. There are 260 unique values, each of them recorded in 10 observations. There are 9 countries/regions less than in the full dataframe. This could be due to missing data for these countries or regions or that the countries regions were obsolete in the past 10 years.

In [79]:
# Check the frequencies for the "year" column

df_co2_recent['year'].value_counts(dropna = False)

2012    260
2013    260
2014    260
2015    260
2016    260
2017    260
2018    260
2019    260
2020    260
2021    260
Name: year, dtype: int64

### There are 10 unique values, each of them recorded in 260 observations.

In [80]:
# Check the frequencies for the "iso_code" column

df_co2_recent['iso_code'].value_counts(dropna = False)

nan    300
PSE     10
NCL     10
NZL     10
NIC     10
NER     10
NGA     10
NIU     10
PRK     10
MKD     10
NOR     10
OMN     10
PAK     10
PLW     10
PAN     10
LBY     10
PNG     10
PRY     10
PER     10
PHL     10
POL     10
PRT     10
PRI     10
QAT     10
REU     10
ROU     10
RUS     10
RWA     10
ANT     10
NLD     10
NPL     10
NRU     10
LIE     10
LTU     10
LUX     10
MAC     10
MDG     10
MWI     10
MYS     10
MDV     10
MLI     10
MLT     10
MHL     10
MTQ     10
MRT     10
MUS     10
MYT     10
MEX     10
FSM     10
MDA     10
MNG     10
MNE     10
MSR     10
MAR     10
MOZ     10
MMR     10
NAM     10
SHN     10
AFG     10
LCA     10
TJK     10
THA     10
TLS     10
TGO     10
TON     10
TTO     10
TUN     10
TUR     10
TKM     10
TCA     10
TUV     10
UGA     10
UKR     10
ARE     10
GBR     10
USA     10
VIR     10
URY     10
UZB     10
VUT     10
VEN     10
VNM     10
WLF     10
ESH     10
YEM     10
ZMB     10
TZA     10
TWN     10
MAF     10
SYR     10
SPM     10

### There are 300 NANs, meaning 300 iso_codes are missing. 

In [81]:
# Check the frequencies for the "population" column

df_co2_recent['population'].value_counts(dropna = False)

NaN             198
2.400000e+03      6
4.781400e+04      2
1.087600e+04      2
3.905778e+07      1
5.137926e+06      1
2.113497e+06      1
2.114175e+06      1
2.111078e+06      1
2.103329e+06      1
5.019058e+06      1
5.080667e+06      1
5.190357e+06      1
3.838377e+07      1
2.111982e+06      1
5.277396e+06      1
5.312321e+06      1
5.348285e+06      1
5.379836e+06      1
5.403021e+06      1
5.236591e+06      1
3.046648e+07      1
2.107965e+06      1
2.538961e+07      1
5.942632e+08      1
5.971234e+08      1
2.488777e+07      1
2.500182e+07      1
2.512614e+07      1
2.525801e+07      1
2.551632e+07      1
2.105293e+06      1
2.563815e+07      1
2.575544e+07      1
2.586747e+07      1
2.597191e+07      1
2.099486e+06      1
2.102220e+06      1
2.110194e+06      1
4.040364e+07      1
3.972934e+07      1
2.053376e+08      1
2.109693e+08      1
2.135248e+08      1
2.163796e+08      1
2.197315e+08      1
2.232933e+08      1
2.271967e+08      1
2.314021e+08      1
1.797400e+04      1


### The variable contains 198 missing values (NaN).

In [82]:
# Check the frequencies for the "gdp" column

df_co2_recent['gdp'].value_counts(dropna = False)

NaN             1442
5.916690e+10       1
1.060806e+12       1
4.051714e+10       1
4.224717e+10       1
4.375089e+10       1
4.403295e+10       1
4.451847e+10       1
4.403376e+10       1
4.354517e+10       1
1.040792e+12       1
2.417466e+10       1
1.032470e+12       1
1.048705e+12       1
1.021286e+12       1
9.599389e+11       1
9.096492e+11       1
2.048344e+10       1
2.348710e+10       1
2.506270e+10       1
1.823131e+10       1
4.335459e+11       1
1.617338e+11       1
1.553137e+11       1
1.535239e+11       1
1.506124e+11       1
4.493390e+11       1
4.436183e+11       1
4.292192e+11       1
2.604228e+10       1
4.224157e+11       1
4.144629e+11       1
4.103650e+11       1
2.770358e+10       1
2.696999e+10       1
2.668143e+10       1
1.914251e+10       1
1.730786e+10       1
1.693705e+11       1
7.639998e+10       1
7.765823e+11       1
7.600774e+11       1
7.455177e+11       1
7.350931e+11       1
7.364979e+11       1
8.105245e+10       1
7.069779e+10       1
8.180560e+11 

### The variable contains 1442 missing values (NaN).

# 7. Exporting Dataframes

In [83]:
# Export the new dataframe df_co2(clean) in pkl format

df_co2.to_pickle(os.path.join(path, '02. Data','Prepared Data', 'emissions_clean.pkl'))

In [84]:
# Export the subsetted dataframe df_co2_recent(clean) in pkl format

df_co2_recent.to_pickle(os.path.join(path, '02. Data','Prepared Data', 'emissions_clean_recent.pkl'))