In [1]:
#loading dependencies
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px

# Read the raw emissions table from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="./Methane_final.csv")

# Leave the frame as the cell's last expression so the notebook renders it.
df

Unnamed: 0.1,Unnamed: 0,region,country,emissions,type,segment,reason,baseYear,notes
0,0,Africa,Algeria,257.611206,Agriculture,Total,All,2019-2021,Average based on United Nations Framework Conv...
1,1,Africa,Algeria,0.052000,Energy,Bioenergy,All,2022,Estimates from end-uses are for 2020 or 2021 (...
2,2,Africa,Algeria,130.798996,Energy,Gas pipelines and LNG facilities,Fugitive,2022,Not available
3,3,Africa,Algeria,69.741898,Energy,Gas pipelines and LNG facilities,Vented,2022,Not available
4,4,Africa,Algeria,213.987000,Energy,Onshore gas,Fugitive,2022,Not available
...,...,...,...,...,...,...,...,...,...
1543,1543,World,World,3102.500000,Energy,Satellite-detected large oil and gas emissions,All,2022,Not available
1544,1544,World,World,30296.500000,Energy,Steam coal,All,2022,Not available
1545,1545,World,World,133350.984375,Energy,Total,All,2022,Estimates from end-uses are for 2020 or 2021 (...
1546,1546,World,World,9737.874023,Other,Total,All,2019-2021,Average based on United Nations Framework Conv...


In [2]:
# Inspect the column labels of the raw frame before renaming or dropping
# anything, so the cleaning steps can refer to the labels as loaded.
df.columns

Index(['Unnamed: 0', 'region', 'country', 'emissions', 'type', 'segment',
       'reason', 'baseYear', 'notes'],
      dtype='object')

In [3]:
# Build clean_df from df with two labels normalised: "baseYear" becomes
# lower-case and the index-like "Unnamed: 0" column gets a short name.
# rename() returns a new frame, so df itself is left as loaded.
clean_df = df.rename(
    {"baseYear": "baseyear", "Unnamed: 0": "unnamed"},
    axis="columns",
)

# Show the resulting labels to confirm the rename took effect.
clean_df.columns

Index(['unnamed', 'region', 'country', 'emissions', 'type', 'segment',
       'reason', 'baseyear', 'notes'],
      dtype='object')

In [4]:
# Remove the unneeded 'unnamed' column (the renamed 'Unnamed: 0').
# Rebinding the result instead of using inplace=True, together with
# errors="ignore", keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-removed column.
clean_df = clean_df.drop(columns="unnamed", errors="ignore")

clean_df

Unnamed: 0,region,country,emissions,type,segment,reason,baseyear,notes
0,Africa,Algeria,257.611206,Agriculture,Total,All,2019-2021,Average based on United Nations Framework Conv...
1,Africa,Algeria,0.052000,Energy,Bioenergy,All,2022,Estimates from end-uses are for 2020 or 2021 (...
2,Africa,Algeria,130.798996,Energy,Gas pipelines and LNG facilities,Fugitive,2022,Not available
3,Africa,Algeria,69.741898,Energy,Gas pipelines and LNG facilities,Vented,2022,Not available
4,Africa,Algeria,213.987000,Energy,Onshore gas,Fugitive,2022,Not available
...,...,...,...,...,...,...,...,...
1543,World,World,3102.500000,Energy,Satellite-detected large oil and gas emissions,All,2022,Not available
1544,World,World,30296.500000,Energy,Steam coal,All,2022,Not available
1545,World,World,133350.984375,Energy,Total,All,2022,Estimates from end-uses are for 2020 or 2021 (...
1546,World,World,9737.874023,Other,Total,All,2019-2021,Average based on United Nations Framework Conv...


In [7]:
# Count the missing entries in every column of the cleaned frame.
clean_df.isna().sum()

region       0
country      0
emissions    0
type         0
segment      0
reason       0
baseyear     0
notes        0
dtype: int64

In [8]:
# List the distinct region labels present in the cleaned frame.
clean_df.loc[:, "region"].unique()

array(['Africa', 'Asia Pacific', 'Central and South America', 'Europe',
       'Middle East', 'North America', 'Other', 'Russia & Caspian',
       'World'], dtype=object)

In [9]:
# Number of rows recorded for each region (a row count, not an emissions sum).
clean_df.loc[:, "region"].value_counts()

Africa                       406
Europe                       302
Asia Pacific                 270
Middle East                  187
Central and South America    183
Russia & Caspian              96
North America                 61
World                         22
Other                         21
Name: region, dtype: int64

In [10]:
# List the distinct values of the country column in the cleaned frame.
clean_df.loc[:, "country"].unique()

array(['Algeria', 'Angola', 'Benin', 'Botswana', 'Cameroon',
       'Central African Republic', 'Chad', 'Congo', "Cote d'Ivoire",
       'Democratic Republic of Congo', 'Egypt', 'Equatorial Guinea',
       'Eritrea', 'Ethiopia', 'Gabon', 'Gambia', 'Ghana', 'Guinea',
       'Guinea-Bissau', 'Kenya', 'Liberia', 'Libya', 'Morocco',
       'Mozambique', 'Namibia', 'Niger', 'Nigeria', 'Senegal',
       'Seychelles', 'Sierra Leone', 'Somalia', 'South Africa',
       'South Sudan', 'Sudan', 'Tanzania', 'Togo', 'Tunisia', 'Australia',
       'Bangladesh', 'Brunei', 'China', 'India', 'Indonesia', 'Japan',
       'Korea', 'Malaysia', 'Mongolia', 'New Zealand',
       'Other countries in Southeast Asia', 'Pakistan', 'Philippines',
       'Thailand', 'Vietnam', 'Argentina', 'Bolivia', 'Brazil',
       'Colombia', 'Cuba', 'Ecuador', 'Guyana', 'Paraguay', 'Peru',
       'Trinidad and Tobago', 'Uruguay', 'Venezuela', 'Denmark',
       'Estonia', 'European Union', 'France', 'Germany', 'Israel',
      

In [11]:
# Keep every row except those whose country is the literal "World".
cleaner_df = clean_df.loc[clean_df["country"].ne("World")]

# Display the filtered frame.
cleaner_df

Unnamed: 0,region,country,emissions,type,segment,reason,baseyear,notes
0,Africa,Algeria,257.611206,Agriculture,Total,All,2019-2021,Average based on United Nations Framework Conv...
1,Africa,Algeria,0.052000,Energy,Bioenergy,All,2022,Estimates from end-uses are for 2020 or 2021 (...
2,Africa,Algeria,130.798996,Energy,Gas pipelines and LNG facilities,Fugitive,2022,Not available
3,Africa,Algeria,69.741898,Energy,Gas pipelines and LNG facilities,Vented,2022,Not available
4,Africa,Algeria,213.987000,Energy,Onshore gas,Fugitive,2022,Not available
...,...,...,...,...,...,...,...,...
1521,Russia & Caspian,Uzbekistan,16.973917,Energy,Other from oil and gas,All,2022,Estimates from end-uses are for 2020 or 2021 (...
1522,Russia & Caspian,Uzbekistan,18.299999,Energy,Satellite-detected large oil and gas emissions,All,2022,Not available
1523,Russia & Caspian,Uzbekistan,780.916138,Energy,Total,All,2022,Estimates from end-uses are for 2020 or 2021 (...
1524,Russia & Caspian,Uzbekistan,3.845616,Other,Total,All,2019-2021,Average based on United Nations Framework Conv...


In [None]:
# Number of rows recorded for each country once the "World" rows are removed.
cleaner_df.loc[:, "country"].value_counts()

In [None]:
# List the distinct values of the reason column.
cleaner_df.loc[:, "reason"].unique()

In [None]:
# Number of rows recorded for each reason.
cleaner_df.loc[:, "reason"].value_counts()

In [None]:
# List the distinct emission types.
cleaner_df.loc[:, "type"].unique()

In [None]:
# Number of rows recorded for each emission type.
cleaner_df.loc[:, "type"].value_counts()

In [None]:
# Subset of rows whose type is "Energy"; head() previews the first five rows
# in their existing order (it does not rank by emissions).
energy_df = cleaner_df.loc[cleaner_df["type"].eq("Energy")]
energy_df.head()

In [None]:
# Subset of rows whose type is "Agriculture"; head() previews the first five
# rows in their existing order (it does not rank by emissions).
ag_df = cleaner_df.loc[cleaner_df["type"].eq("Agriculture")]
ag_df.head()

In [None]:
# Subset of rows whose type is "Other"; head() previews the first five rows
# in their existing order (it does not rank by emissions).
other_df = cleaner_df.loc[cleaner_df["type"].eq("Other")]
other_df.head()

In [None]:
# Subset of rows whose type is "Waste"; head() previews the first five rows
# in their existing order (it does not rank by emissions).
waste_df = cleaner_df.loc[cleaner_df["type"].eq("Waste")]
waste_df.head()

In [13]:
# Emissions totals by type.
# Keep only the rows whose segment is 'Total'. A boolean mask is aligned on
# index labels, whereas np.where() yields integer positions that .loc would
# interpret as labels (correct only while the index is exactly 0..n-1).
total_df = cleaner_df[cleaner_df["segment"] == "Total"]

# Sum just the numeric 'emissions' column; a bare .sum() would also try to
# aggregate the text columns (string concatenation on pandas >= 2.0).
total_df.groupby("type")["emissions"].sum().reset_index()

Unnamed: 0,type,emissions
0,Agriculture,150805.103144
1,Energy,135347.173195
2,Other,10028.772862
3,Waste,75079.707008


In [14]:
# Emissions totals for each country.
# Keep only the rows whose segment is 'Total'. A boolean mask is aligned on
# index labels, whereas np.where() yields integer positions that .loc would
# interpret as labels (correct only while the index is exactly 0..n-1).
total_df = cleaner_df[cleaner_df["segment"] == "Total"]

# Sum just the numeric 'emissions' column; a bare .sum() would also try to
# aggregate the text columns (string concatenation on pandas >= 2.0).
total_df.groupby("country")["emissions"].sum().reset_index()

Unnamed: 0,country,emissions
0,Algeria,3444.247114
1,Angola,1202.906883
2,Argentina,5251.163147
3,Australia,5543.555191
4,Azerbaijan,623.687966
...,...,...
99,Uruguay,901.324079
100,Uzbekistan,1966.862615
101,Venezuela,4048.541238
102,Vietnam,3072.738373


In [20]:
# Emissions totals for each region.
# Keep only the rows whose segment is 'Total'. A boolean mask is aligned on
# index labels, whereas np.where() yields integer positions that .loc would
# interpret as labels (correct only while the index is exactly 0..n-1).
total_df = cleaner_df[cleaner_df["segment"] == "Total"]

# NOTE(review): the country column contains the aggregate 'European Union'
# next to individual European countries — confirm it is not double counted
# in the regional sums.
# Sum just the numeric 'emissions' column; a bare .sum() would also try to
# aggregate the text columns (string concatenation on pandas >= 2.0).
total_df.groupby("region")["emissions"].sum().reset_index()

Unnamed: 0,region,emissions
0,Africa,40929.213518
1,Asia Pacific,136079.525927
2,Central and South America,38316.827054
3,Europe,42326.519342
4,Middle East,21825.386844
5,North America,42448.051056
6,Other,13576.477112
7,Russia & Caspian,35758.755358


In [None]:
# Donut chart of each emission type's share of the summed emissions.
# Only the 'Total' segment rows are plotted, matching the totals-by-type
# table. The remaining segment rows sit alongside a 'Total' row for the same
# country and type (presumably as its components — TODO confirm), so summing
# every row would count those emissions twice and inflate that type's slice.
px.pie(
    cleaner_df[cleaner_df["segment"] == "Total"],
    values="emissions",
    names="type",
    hole=0.5,
    title="Share of methane emissions by type",
)

In [None]:
# Push the remade DataFrame to a new CSV file.
# Create the target folder first so the export also works on a fresh checkout;
# to_csv() does not create missing parent directories.
# NOTE(review): this exports clean_df, which still holds the 'World' rows that
# cleaner_df filters out — confirm that is the intended frame.
output_path = Path("./output/clean_emissions_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
clean_df.to_csv(output_path, encoding="utf-8", index=False, header=True)