Name: Hanh Tran

Class: DSC 540

Assignment: Milestone 4

Connecting to an API/Pulling in the Data and Cleaning/Formatting

Perform at least 5 data transformation and/or cleansing steps to your API data. For example:

Replace Headers

Format data into a more readable format

Identify outliers and bad data

Find duplicates

Fix casing or inconsistent values

Conduct Fuzzy Matching


Register for API
https://www.eia.gov/opendata/

EIA Data Sets > International Energy Data > CO2 emissions > Emissions

API CALL TO USE:

http://api.eia.gov/category/?api_key=YOUR_API_KEY_HERE&category_id=2622652


In [6]:
pip install EIA_python

Collecting EIA_python
  Downloading eia_python-1.22-py3-none-any.whl (3.8 kB)
Installing collected packages: EIA-python
Successfully installed EIA-python-1.22
Note: you may need to restart the kernel to use updated packages.


In [2]:
import eia

In [3]:
from eia import api

In [4]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import numpy as np
import matplotlib.pyplot as plt
import html5lib
from fuzzywuzzy import fuzz
import re
import ssl
import urllib.request, urllib.parse, urllib.error
import json



In [5]:
#store actual apikey in external json file
with open('eiaAPIkeys.json') as f:
    keys=json.load(f)
    eiaapi=keys["APIkey"]

In [6]:
apikey= eiaapi

In [7]:
#define the base URL or API endpoint
serviceurl = 'http://api.eia.gov/category/?'
#use the requests library get method to read the HTML from the URL
response = requests.get(serviceurl)

In [8]:
#create a simple function that checks the http code status is OK, else handles connection errors gracefully
def httpstatus(res):
    if res.status_code==200:
        print('OK, Connection Successful!')
        return 1
    else:
        print('Bad Request')
        return -1

In [9]:
#check that simple function works
httpstatus(response)

OK, Connection Successful!


1

In [10]:
emissionsURL= 'https://api.eia.gov/category/?api_key='+apikey+'&category_id=2622652'

In [11]:
emissions = requests.get(emissionsURL)

In [12]:
emissions_json = emissions.json()

In [13]:
emissions_json

{'request': {'category_id': 2622652, 'command': 'category'},
 'category': {'category_id': '2622652',
  'parent_category_id': '2622644',
  'name': 'Emissions',
  'notes': '',
  'childcategories': [],
  'childseries': [{'series_id': 'INTL.4008-8-ABW-MMTCD.A',
    'name': 'CO2 emissions, Aruba, Annual',
    'f': 'A',
    'units': 'million metric tonnes carbon dioxide',
    'updated': '13-DEC-19 05.18.55 PM'},
   {'series_id': 'INTL.4008-8-AFG-MMTCD.A',
    'name': 'CO2 emissions, Afghanistan, Annual',
    'f': 'A',
    'units': 'million metric tonnes carbon dioxide',
    'updated': '23-JUN-20 02.32.17 PM'},
   {'series_id': 'INTL.4008-8-AFRC-MMTCD.A',
    'name': 'CO2 emissions, Africa, Annual',
    'f': 'A',
    'units': 'million metric tonnes carbon dioxide',
    'updated': '08-JUL-20 10.57.31 AM'},
   {'series_id': 'INTL.4008-8-AGO-MMTCD.A',
    'name': 'CO2 emissions, Angola, Annual',
    'f': 'A',
    'units': 'million metric tonnes carbon dioxide',
    'updated': '23-JUN-20 02.32.17

In [14]:
api_key = "e9184bcad732eeecbfacc5948d59e644"
api = eia.API(api_key)

In [15]:
# Retrieve Data By Series ID
series_search = api.data_by_series(series="INTL.4008-8-AFRC-MMTCD.A")

for key,value in series_search.items():
    print(key,value)

africa_df = pd.DataFrame(series_search, )

CO2 emissions, Africa, Annual (million metric tonnes carbon dioxide) {'2017  ': 1309.116061772, '2016  ': 1283.288041859, '2015  ': 1277.711311828, '2014  ': 1264.125405746, '2013  ': 1230.728633099, '2012  ': 1175.726351834, '2011  ': 1134.939982557, '2010  ': 1093.542050376, '2009  ': 1050.760118472, '2008  ': 1017.159874587, '2007  ': 1001.69741535, '2006  ': 1010.867138087, '2005  ': 987.974824629, '2004  ': 936.286984205, '2003  ': 916.020687735, '2002  ': 875.536896859, '2001  ': 843.590659017, '2000  ': 823.350414439, '1999  ': 805.432076271, '1998  ': 805.464233528, '1997  ': 785.851951874, '1996  ': 769.816169522, '1995  ': 748.387654851, '1994  ': 720.168428821, '1993  ': 709.440676075, '1992  ': 698.592561348, '1991  ': 691.970549498, '1990  ': 686.286773696, '1989  ': 662.268823994, '1988  ': 649.913248313, '1987  ': 635.000304951, '1986  ': 609.027405282, '1985  ': 592.275599625, '1984  ': 576.563667679, '1983  ': 547.30317855, '1982  ': 519.98539138, '1981  ': 475.8920651

In [16]:
africa_df.head()

Unnamed: 0,"CO2 emissions, Africa, Annual (million metric tonnes carbon dioxide)"
1980,451.742614
1981,475.892065
1982,519.985391
1983,547.303179
1984,576.563668


In [17]:
# Retrieve Data By Series ID
series_search = api.data_by_series(series="INTL.4008-8-ASOC-MMTCD.A")

for key,value in series_search.items():
    print(key,value)

asiaoceania_df = pd.DataFrame(series_search, )

CO2 emissions, Asia & Oceania, Annual (million metric tonnes carbon dioxide) {'2017  ': 17937.206434902, '2016  ': 17769.994687289, '2015  ': 17589.644265268, '2014  ': 17596.353457797, '2013  ': 17514.505614129, '2012  ': 16904.846954922, '2011  ': 16016.948254381, '2010  ': 14794.357635672, '2009  ': 13841.109047047, '2008  ': 13086.94049202, '2007  ': 12536.397884627, '2006  ': 12067.955936361, '2005  ': 11287.46142968, '2004  ': 10372.527352305, '2003  ': 9436.555549577, '2002  ': 8626.332700699, '2001  ': 8210.487443368, '2000  ': 7987.501383293, '1999  ': 7606.837819288, '1998  ': 7236.324996232, '1997  ': 7299.906844128, '1996  ': 7255.848666653, '1995  ': 6965.481780847, '1994  ': 6540.042257246, '1993  ': 6154.816072384, '1992  ': 5826.01295822, '1991  ': 5624.14882454, '1990  ': 5733.380183684, '1989  ': 5287.35322469, '1988  ': 5171.997081167, '1987  ': 4811.685243815, '1986  ': 4580.631519943, '1985  ': 4426.382121145, '1984  ': 4301.317538512, '1983  ': 3816.198913241, '19

In [18]:
asiaoceania_df.head()

Unnamed: 0,"CO2 emissions, Asia & Oceania, Annual (million metric tonnes carbon dioxide)"
1980,3616.117598
1981,3632.155147
1982,3647.879618
1983,3816.198913
1984,4301.317539


In [19]:
# Retrieve Data By Series ID
series_search = api.data_by_series(series="INTL.4008-8-WP11-MMTCD.A")

for key,value in series_search.items():
    print(key,value)

au_df = pd.DataFrame(series_search, )

CO2 emissions, Australia and New Zealand, Annual (million metric tonnes carbon dioxide) {'2016  ': 476.9006025435292, '2015  ': 467.25901685371196, '2014  ': 459.96939892246246, '2013  ': 460.47030799403706, '2012  ': 454.20332871682723, '2011  ': 451.4416438896688}


In [20]:
# Retrieve Data By Series ID
series_search = api.data_by_series(series="INTL.4008-8-CAN-MMTCD.A")

for key,value in series_search.items():
    print(key,value)

can_df = pd.DataFrame(series_search, )

CO2 emissions, Canada, Annual (million metric tonnes carbon dioxide) {'2017  ': 624.828229547, '2016  ': 607.184297291, '2015  ': 620.65064337, '2014  ': 614.004742084, '2013  ': 613.283255546, '2012  ': 605.866888806, '2011  ': 598.750569856, '2010  ': 595.535675761, '2009  ': 560.817951311, '2008  ': 579.873447808, '2007  ': 604.419636452, '2006  ': 597.449479418, '2005  ': 592.390125542, '2004  ': 592.477892223, '2003  ': 590.865107169, '2002  ': 563.700034621, '2001  ': 561.921045708, '2000  ': 550.533711533, '1999  ': 547.002094213, '1998  ': 524.523516058, '1997  ': 520.251220492, '1996  ': 505.343492136, '1995  ': 493.277182425, '1994  ': 483.301283376, '1993  ': 469.853759629, '1992  ': 470.04766499, '1991  ': 447.19029212, '1990  ': 455.768586695, '1989  ': 479.321625549, '1988  ': 466.848325075, '1987  ': 435.236482229, '1986  ': 416.661224941, '1985  ': 422.429759804, '1984  ': 420.754409938, '1983  ': 373.624092794, '1982  ': 385.052431123, '1981  ': 397.712988561, '1980  '

In [21]:
# Retrieve Data By Series ID
series_search = api.data_by_series(series="INTL.4008-8-CHN-MMTCD.A")

for key,value in series_search.items():
    print(key,value)

CHN_df = pd.DataFrame(series_search, )

CO2 emissions, China, Annual (million metric tonnes carbon dioxide) {'2017  ': 10486.98118719, '2016  ': 10501.798342084, '2015  ': 10508.330407099, '2014  ': 10701.496086103, '2013  ': 10801.773420247, '2012  ': 10362.203902872, '2011  ': 9832.278194475, '2010  ': 8779.18976365, '2009  ': 8188.765191226, '2008  ': 7496.677218307, '2007  ': 7043.430704459, '2006  ': 6739.961197019, '2005  ': 6105.598123867, '2004  ': 5375.481015106, '2003  ': 4625.453088715, '2002  ': 3960.514426268, '2001  ': 3694.868005381, '2000  ': 3523.15229409, '1999  ': 3269.736849903, '1998  ': 3115.70318377, '1997  ': 3122.09259099, '1996  ': 3222.867325554, '1995  ': 3130.023976455, '1994  ': 2902.149499667, '1993  ': 2716.804563555, '1992  ': 2522.473762421, '1991  ': 2441.592650567, '1990  ': 2666.577916739, '1989  ': 2397.017623213, '1988  ': 2401.88518111, '1987  ': 2254.601189919, '1986  ': 2132.603889638, '1985  ': 2059.549125934, '1984  ': 1963.509574578, '1983  ': 1810.848454635, '1982  ': 1710.955585

In [22]:
# Retrieve Data By Series ID
series_search = api.data_by_series(series="INTL.4008-8-CSAM-MMTCD.A")

for key,value in series_search.items():
    print(key,value)

southAmerica_df = pd.DataFrame(series_search, )

CO2 emissions, Central & South America, Annual (million metric tonnes carbon dioxide) {'2017  ': 1390.726233428, '2016  ': 1399.370600919, '2015  ': 1507.385833822, '2014  ': 1409.948164891, '2013  ': 1368.346993811, '2012  ': 1344.819833361, '2011  ': 1287.033492766, '2010  ': 1254.759776717, '2009  ': 1148.798396464, '2008  ': 1182.68921353, '2007  ': 1015.988818081, '2006  ': 1121.053138605, '2005  ': 1069.542812482, '2004  ': 1043.15635633, '2003  ': 1007.766511846, '2002  ': 983.862053775, '2001  ': 996.241073465, '2000  ': 972.831925259, '1999  ': 938.552574355, '1998  ': 929.372101962, '1997  ': 905.875138465, '1996  ': 862.726096552, '1995  ': 830.972134904, '1994  ': 785.048174692, '1993  ': 752.102701022, '1992  ': 712.370726525, '1991  ': 704.105312318, '1990  ': 694.339174652, '1989  ': 684.029055295, '1988  ': 664.640762818, '1987  ': 654.772034938, '1986  ': 644.461818254, '1985  ': 592.582504702, '1984  ': 583.928157066, '1983  ': 560.866277684, '1982  ': 578.442992766, 

In [23]:
# Retrieve Data By Series ID
series_search = api.data_by_series(series="INTL.4008-8-EURO-MMTCD.A")

for key,value in series_search.items():
    print(key,value)

eur_df = pd.DataFrame(series_search, )

CO2 emissions, Europe, Annual (million metric tonnes carbon dioxide) {'2017  ': 4364.508646541, '2016  ': 4292.74634094, '2015  ': 4277.971212081, '2014  ': 4219.535868797, '2013  ': 4367.859197116, '2012  ': 4449.596094896, '2011  ': 4510.229004269, '2010  ': 4600.675001099, '2009  ': 4481.06817393, '2008  ': 4793.869381097, '2007  ': 4843.798673698, '2006  ': 4889.386434175, '2005  ': 4858.559891106, '2004  ': 4845.302981445, '2003  ': 4785.792185764, '2002  ': 4681.985328431, '2001  ': 4689.395937436, '2000  ': 4630.737790808, '1999  ': 4555.266621251, '1998  ': 4629.996341578, '1997  ': 4601.133267301, '1996  ': 4646.890003559, '1995  ': 4523.582711879, '1994  ': 4416.975365512, '1993  ': 4440.003416165, '1992  ': 4531.718718539, '1991  ': 4671.261995618, '1990  ': 5004.757520872, '1989  ': 5126.952501161, '1988  ': 5114.543790932, '1987  ': 5151.177889415, '1986  ': 5095.808723253, '1985  ': 4990.917038344, '1984  ': 4861.167509252, '1983  ': 4523.75954001, '1982  ': 4588.50843983

In [24]:
# Retrieve Data By Series ID
series_search = api.data_by_series(series="INTL.4008-8-USA-MMTCD.A")

for key,value in series_search.items():
    print(key,value)

US_df = pd.DataFrame(series_search, )

CO2 emissions, United States, Annual (million metric tonnes carbon dioxide) {'2017  ': 5133.437, '2016  ': 5174.793, '2015  ': 5267.066, '2014  ': 5413.51, '2013  ': 5362.523, '2012  ': 5236.945, '2011  ': 5446.359, '2010  ': 5585.596, '2009  ': 5387.393, '2008  ': 5811.604, '2007  ': 6001.149, '2006  ': 5910.783, '2005  ': 5991.351, '2004  ': 5971.307, '2003  ': 5851.851, '2002  ': 5801.793, '2001  ': 5760.367, '2000  ': 5861.952, '1999  ': 5689.163, '1998  ': 5634.331, '1997  ': 5581.044, '1996  ': 5510.158, '1995  ': 5322.488, '1994  ': 5259.91, '1993  ': 5182.609, '1992  ': 5091.722, '1991  ': 4993.74, '1990  ': 5038.776, '1989  ': 5066.674, '1988  ': 4979.453, '1987  ': 4755.297, '1986  ': 4597.353, '1985  ': 4586.301, '1984  ': 4595.502, '1983  ': 4367.856, '1982  ': 4393.721, '1981  ': 4627.751, '1980  ': 4750.675}


In [25]:
# Retrieve Data By Series ID
series_search = api.data_by_series(series="INTL.4008-8-SUN-MMTCD.A")

for key,value in series_search.items():
    print(key,value)

RUS_df = pd.DataFrame(series_search, )

CO2 emissions, Former U.S.S.R., Annual (million metric tonnes carbon dioxide) {'2017  ': '--', '2016  ': '--', '2015  ': '--', '2014  ': '--', '2013  ': '--', '2012  ': '--', '2011  ': '--', '2010  ': '--', '2009  ': '--', '2008  ': '--', '2007  ': '--', '2006  ': '--', '2005  ': '--', '2004  ': '--', '2003  ': '--', '2002  ': '--', '2001  ': '--', '2000  ': '--', '1999  ': '--', '1998  ': '--', '1997  ': '--', '1996  ': '--', '1995  ': '--', '1994  ': '--', '1993  ': '--', '1992  ': '--', '1991  ': 3944.56951542, '1990  ': 4327.414643068, '1989  ': 4186.252614084, '1988  ': 4282.779575009, '1987  ': 4203.179447542, '1986  ': 4143.730267938, '1985  ': 3993.021317937, '1984  ': 3869.572625945, '1983  ': 3735.655096964, '1982  ': 3702.116460425, '1981  ': 3579.222544175, '1980  ': 3561.517210212}


In [26]:
# Retrieve Data By Series ID
series_search = api.data_by_series(series="INTL.4008-8-IND-MMTCD.A")

for key,value in series_search.items():
    print(key,value)

IN_df = pd.DataFrame(series_search, )

CO2 emissions, India, Annual (million metric tonnes carbon dioxide) {'2017  ': 2312.061218862, '2016  ': 2237.148562192, '2015  ': 2177.981891527, '2014  ': 2092.365758495, '2013  ': 1944.444971776, '2012  ': 1905.228814434, '2011  ': 1677.352792451, '2010  ': 1604.543494104, '2009  ': 1497.486494389, '2008  ': 1411.061330366, '2007  ': 1368.379922707, '2006  ': 1259.36055197, '2005  ': 1169.398363225, '2004  ': 1119.95641205, '2003  ': 1014.048346774, '2002  ': 993.931702371, '2001  ': 945.964699297, '2000  ': 922.563072813, '1999  ': 884.6787127, '1998  ': 839.484454222, '1997  ': 818.953296455, '1996  ': 782.663781032, '1995  ': 745.806810519, '1994  ': 689.708545086, '1993  ': 646.927392381, '1992  ': 617.732919442, '1991  ': 581.054459481, '1990  ': 548.64209131, '1989  ': 515.373742932, '1988  ': 488.86610388, '1987  ': 449.983082972, '1986  ': 417.367851353, '1985  ': 391.389288544, '1984  ': 367.63671597, '1983  ': 341.684036875, '1982  ': 322.590345745, '1981  ': 306.829835267

In [27]:
# Retrieve Data By Series ID
series_search = api.data_by_series(series="INTL.4008-8-MIDE-MMTCD.A")

for key,value in series_search.items():
    print(key,value)

ME_df = pd.DataFrame(series_search, )

CO2 emissions, Middle East, Annual (million metric tonnes carbon dioxide) {'2017  ': 2217.395677206, '2016  ': 2141.422804592, '2015  ': 2105.329606629, '2014  ': 2073.963529397, '2013  ': 2011.162102764, '2012  ': 1976.749297443, '2011  ': 1877.347117373, '2010  ': 1786.833752652, '2009  ': 1692.561491036, '2008  ': 1583.043594072, '2007  ': 1404.276405284, '2006  ': 1447.43777183, '2005  ': 1399.296499908, '2004  ': 1306.322628059, '2003  ': 1217.494207475, '2002  ': 1155.482158835, '2001  ': 1094.054266775, '2000  ': 1069.610419227, '1999  ': 1027.30376575, '1998  ': 990.754005306, '1997  ': 959.160234156, '1996  ': 880.332502301, '1995  ': 855.66671211, '1994  ': 843.336171898, '1993  ': 800.308019221, '1992  ': 765.606461756, '1991  ': 703.016229845, '1990  ': 695.109300153, '1989  ': 688.007335525, '1988  ': 652.289888787, '1987  ': 620.907085721, '1986  ': 586.894340586, '1985  ': 541.171415514, '1984  ': 494.221352494, '1983  ': 457.420599, '1982  ': 405.074019451, '1981  ': 38

In [28]:
country_df = pd.concat([au_df, can_df, CHN_df, southAmerica_df, eur_df, US_df, RUS_df, IN_df, ME_df], axis=1)

In [29]:
country_df

Unnamed: 0,"CO2 emissions, Australia and New Zealand, Annual (million metric tonnes carbon dioxide)","CO2 emissions, Canada, Annual (million metric tonnes carbon dioxide)","CO2 emissions, China, Annual (million metric tonnes carbon dioxide)","CO2 emissions, Central & South America, Annual (million metric tonnes carbon dioxide)","CO2 emissions, Europe, Annual (million metric tonnes carbon dioxide)","CO2 emissions, United States, Annual (million metric tonnes carbon dioxide)","CO2 emissions, Former U.S.S.R., Annual (million metric tonnes carbon dioxide)","CO2 emissions, India, Annual (million metric tonnes carbon dioxide)","CO2 emissions, Middle East, Annual (million metric tonnes carbon dioxide)"
2011,451.441644,598.75057,9832.278194,1287.033493,4510.229004,5446.359,--,1677.352792,1877.347117
2012,454.203329,605.866889,10362.203903,1344.819833,4449.596095,5236.945,--,1905.228814,1976.749297
2013,460.470308,613.283256,10801.77342,1368.346994,4367.859197,5362.523,--,1944.444972,2011.162103
2014,459.969399,614.004742,10701.496086,1409.948165,4219.535869,5413.51,--,2092.365758,2073.963529
2015,467.259017,620.650643,10508.330407,1507.385834,4277.971212,5267.066,--,2177.981892,2105.329607
2016,476.900603,607.184297,10501.798342,1399.370601,4292.746341,5174.793,--,2237.148562,2141.422805
1980,,421.349981,1664.569001,588.123405,4788.454905,4750.675,3561.52,271.466648,369.922419
1981,,397.712989,1648.030815,582.270585,4563.240313,4627.751,3579.22,306.829835,387.935334
1982,,385.052431,1710.955585,578.442993,4588.50844,4393.721,3702.12,322.590346,405.074019
1983,,373.624093,1810.848455,560.866278,4523.75954,4367.856,3735.66,341.684037,457.420599


## Replace Headers

In [30]:
country_df.rename(columns={'CO2 emissions, Australia and New Zealand, Annual (million metric tonnes carbon dioxide)': 'CO2 emissions, AU&NZ',
                          'CO2 emissions, Canada, Annual (million metric tonnes carbon dioxide)': 'CO2 emissions, Canada',
                           'CO2 emissions, China, Annual (million metric tonnes carbon dioxide)': 'CO2 emissions, China',
                           'CO2 emissions, Central & South America, Annual (million metric tonnes carbon dioxide)': 'CO2 emissions, South America',
                           'CO2 emissions, United States, Annual (million metric tonnes carbon dioxide)': 'CO2 emissions, US',
                           'CO2 emissions, Former U.S.S.R., Annual (million metric tonnes carbon dioxide)': 'CO2 emissions, Russia',
                           'CO2 emissions, India, Annual (million metric tonnes carbon dioxide)': 'CO2 emissions, India',
                           'CO2 emissions, Middle East, Annual (million metric tonnes carbon dioxide)': 'CO2 emissions, Middle East'}, inplace=True)
country_df.head()

Unnamed: 0,"CO2 emissions, AU&NZ","CO2 emissions, Canada","CO2 emissions, China","CO2 emissions, South America","CO2 emissions, Europe, Annual (million metric tonnes carbon dioxide)","CO2 emissions, US","CO2 emissions, Russia","CO2 emissions, India","CO2 emissions, Middle East"
2011,451.441644,598.75057,9832.278194,1287.033493,4510.229004,5446.359,--,1677.352792,1877.347117
2012,454.203329,605.866889,10362.203903,1344.819833,4449.596095,5236.945,--,1905.228814,1976.749297
2013,460.470308,613.283256,10801.77342,1368.346994,4367.859197,5362.523,--,1944.444972,2011.162103
2014,459.969399,614.004742,10701.496086,1409.948165,4219.535869,5413.51,--,2092.365758,2073.963529
2015,467.259017,620.650643,10508.330407,1507.385834,4277.971212,5267.066,--,2177.981892,2105.329607


In [31]:
print(country_df.columns.values.tolist())

['CO2 emissions, AU&NZ', 'CO2 emissions, Canada', 'CO2 emissions, China', 'CO2 emissions, South America', 'CO2 emissions, Europe, Annual (million metric tonnes carbon dioxide)', 'CO2 emissions, US', 'CO2 emissions, Russia', 'CO2 emissions, India', 'CO2 emissions, Middle East']


In [32]:
print(country_df.keys())

Index(['CO2 emissions, AU&NZ', 'CO2 emissions, Canada', 'CO2 emissions, China',
       'CO2 emissions, South America',
       'CO2 emissions, Europe, Annual (million metric tonnes carbon dioxide)',
       'CO2 emissions, US', 'CO2 emissions, Russia', 'CO2 emissions, India',
       'CO2 emissions, Middle East'],
      dtype='object')


In [33]:
#remove duplicates
country_df.duplicated(subset=None, keep='first')

2011      False
2012      False
2013      False
2014      False
2015      False
2016      False
1980      False
1981      False
1982      False
1983      False
1984      False
1985      False
1986      False
1987      False
1988      False
1989      False
1990      False
1991      False
1992      False
1993      False
1994      False
1995      False
1996      False
1997      False
1998      False
1999      False
2000      False
2001      False
2002      False
2003      False
2004      False
2005      False
2006      False
2007      False
2008      False
2009      False
2010      False
2017      False
dtype: bool

In [34]:
country_df.shape

(38, 9)

In [35]:
country_df.drop_duplicates()

Unnamed: 0,"CO2 emissions, AU&NZ","CO2 emissions, Canada","CO2 emissions, China","CO2 emissions, South America","CO2 emissions, Europe, Annual (million metric tonnes carbon dioxide)","CO2 emissions, US","CO2 emissions, Russia","CO2 emissions, India","CO2 emissions, Middle East"
2011,451.441644,598.75057,9832.278194,1287.033493,4510.229004,5446.359,--,1677.352792,1877.347117
2012,454.203329,605.866889,10362.203903,1344.819833,4449.596095,5236.945,--,1905.228814,1976.749297
2013,460.470308,613.283256,10801.77342,1368.346994,4367.859197,5362.523,--,1944.444972,2011.162103
2014,459.969399,614.004742,10701.496086,1409.948165,4219.535869,5413.51,--,2092.365758,2073.963529
2015,467.259017,620.650643,10508.330407,1507.385834,4277.971212,5267.066,--,2177.981892,2105.329607
2016,476.900603,607.184297,10501.798342,1399.370601,4292.746341,5174.793,--,2237.148562,2141.422805
1980,,421.349981,1664.569001,588.123405,4788.454905,4750.675,3561.52,271.466648,369.922419
1981,,397.712989,1648.030815,582.270585,4563.240313,4627.751,3579.22,306.829835,387.935334
1982,,385.052431,1710.955585,578.442993,4588.50844,4393.721,3702.12,322.590346,405.074019
1983,,373.624093,1810.848455,560.866278,4523.75954,4367.856,3735.66,341.684037,457.420599


In [36]:
country_df.shape

(38, 9)

In [37]:
#find missing values, Australia is lacking data from earlier years
country_df.isna().sum()

CO2 emissions, AU&NZ                                                    32
CO2 emissions, Canada                                                    0
CO2 emissions, China                                                     0
CO2 emissions, South America                                             0
CO2 emissions, Europe, Annual (million metric tonnes carbon dioxide)     0
CO2 emissions, US                                                        0
CO2 emissions, Russia                                                    0
CO2 emissions, India                                                     0
CO2 emissions, Middle East                                               0
dtype: int64

In [38]:
#conduct fuzzy matching on string data
fuzz.ratio('Africa', 'Africa')

100

In [40]:
country_df.to_csv('emissions.csv')