In [1]:
import seaborn as sns
import matplotlib as plt
import numpy as np
import pandas as pd

In [2]:
# Locations of the two source CSV files.
url_co2 = 'https://raw.githubusercontent.com/TrainingByPackt/Interactive-Data-Visualization-with-Python/master/datasets/co2.csv'
url_gm = 'https://raw.githubusercontent.com/TrainingByPackt/Interactive-Data-Visualization-with-Python/master/datasets/gapminder.csv'

# Read each file into its own DataFrame.
co2 = pd.read_csv(url_co2)
gm = pd.read_csv(url_gm)

In [3]:
# Preview the first rows of the co2 table (one row per country, one column per year).
co2.head()

Unnamed: 0,country,1800,1801,1802,1803,1804,1805,1806,1807,1808,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
0,Afghanistan,,,,,,,,,,...,0.0529,0.0637,0.0854,0.154,0.242,0.294,0.412,0.35,0.316,0.299
1,Albania,,,,,,,,,,...,1.38,1.28,1.3,1.46,1.48,1.56,1.79,1.68,1.73,1.96
2,Algeria,,,,,,,,,,...,3.22,2.99,3.19,3.16,3.42,3.3,3.29,3.46,3.51,3.72
3,Andorra,,,,,,,,,,...,7.3,6.75,6.52,6.43,6.12,6.12,5.87,5.92,5.9,5.83
4,Angola,,,,,,,,,,...,0.98,1.1,1.2,1.18,1.23,1.24,1.25,1.33,1.25,1.29


In [4]:
# Preview the first rows of the gapminder table (one row per country and year).
gm.head()

Unnamed: 0,Country,Year,fertility,life,population,child_mortality,gdp,region
0,Afghanistan,1964,7.671,33.639,10474903.0,339.7,1182.0,South Asia
1,Afghanistan,1965,7.671,34.152,10697983.0,334.1,1182.0,South Asia
2,Afghanistan,1966,7.671,34.662,10927724.0,328.7,1168.0,South Asia
3,Afghanistan,1967,7.671,35.17,11163656.0,323.3,1173.0,South Asia
4,Afghanistan,1968,7.671,35.674,11411022.0,318.1,1187.0,South Asia


In [5]:
# Build a country -> region lookup: keep only those two columns of gm and
# collapse the repeated per-year rows down to one row per distinct pair.
df_gm = gm.loc[:, ['Country', 'region']].drop_duplicates()

In [6]:
# Preview the country/region lookup; the index keeps the row labels from gm.
df_gm.head()

Unnamed: 0,Country,region
0,Afghanistan,South Asia
50,Albania,Europe & Central Asia
100,Algeria,Middle East & North Africa
150,Angola,Sub-Saharan Africa
200,Antigua and Barbuda,America


In [7]:
# Attach a region to every co2 row by inner-joining on the country name.
# The key is spelled 'country' in co2 and 'Country' in df_gm, so both key
# columns end up in the result; countries missing from either side are dropped.
df_w_regions = co2.merge(
    df_gm,
    how='inner',
    left_on='country',
    right_on='Country',
)
df_w_regions.head()

Unnamed: 0,country,1800,1801,1802,1803,1804,1805,1806,1807,1808,...,2007,2008,2009,2010,2011,2012,2013,2014,Country,region
0,Afghanistan,,,,,,,,,,...,0.0854,0.154,0.242,0.294,0.412,0.35,0.316,0.299,Afghanistan,South Asia
1,Albania,,,,,,,,,,...,1.3,1.46,1.48,1.56,1.79,1.68,1.73,1.96,Albania,Europe & Central Asia
2,Algeria,,,,,,,,,,...,3.19,3.16,3.42,3.3,3.29,3.46,3.51,3.72,Algeria,Middle East & North Africa
3,Angola,,,,,,,,,,...,1.2,1.18,1.23,1.24,1.25,1.33,1.25,1.29,Angola,Sub-Saharan Africa
4,Antigua and Barbuda,,,,,,,,,,...,5.14,5.19,5.45,5.54,5.36,5.42,5.36,5.38,Antigua and Barbuda,America


In [8]:
# Remove the duplicate join key ('Country') that the merge carried over.
# errors='ignore' keeps this cell re-runnable: because it reassigns
# df_w_regions from itself, a second execution would otherwise raise KeyError
# once the column is already gone.
df_w_regions = df_w_regions.drop(columns='Country', errors='ignore')

In [9]:
# Reshape from wide (one column per year) to long (one row per country/year).
# The first two names are the identifier columns kept as-is; the last two name
# the melted column labels and their values.
columns = ['country', 'region', 'year', 'co2']
new_co2 = df_w_regions.melt(
    id_vars=columns[:2],
    var_name=columns[2],
    value_name=columns[3],
)
new_co2

Unnamed: 0,country,region,year,co2
0,Afghanistan,South Asia,1800,
1,Albania,Europe & Central Asia,1800,
2,Algeria,Middle East & North Africa,1800,
3,Angola,Sub-Saharan Africa,1800,
4,Antigua and Barbuda,America,1800,
...,...,...,...,...
37190,Vanuatu,East Asia & Pacific,2014,0.595
37191,Venezuela,America,2014,6.030
37192,Vietnam,East Asia & Pacific,2014,1.800
37193,Zambia,Sub-Saharan Africa,2014,0.288


In [10]:
# The 'year' values are the former column labels of the co2 table, so cast
# them to int64 once, keep only the years after 1963 and order the rows by
# country and year.
# Building the frame in a single chain avoids assigning a column into a
# filtered slice of new_co2, which raises SettingWithCopyWarning on pandas
# versions without Copy-on-Write.
df_co2 = (
    new_co2
    .assign(year=new_co2['year'].astype('int64'))
    .loc[lambda frame: frame['year'] > 1963]
    .sort_values(by=['country', 'year'])
)
df_co2.head()

Unnamed: 0,country,region,year,co2
28372,Afghanistan,South Asia,1964,0.0863
28545,Afghanistan,South Asia,1965,0.101
28718,Afghanistan,South Asia,1966,0.108
28891,Afghanistan,South Asia,1967,0.124
29064,Afghanistan,South Asia,1968,0.116


In [11]:
# Take the country, year and gdp columns of gm and give the two capitalised
# ones lower-case names.
df_gdp = gm[['Country', 'Year', 'gdp']].rename(
    columns={'Country': 'country', 'Year': 'year'}
)
df_gdp.head()

Unnamed: 0,country,year,gdp
0,Afghanistan,1964,1182.0
1,Afghanistan,1965,1182.0
2,Afghanistan,1966,1168.0
3,Afghanistan,1967,1173.0
4,Afghanistan,1968,1187.0


In [12]:
# Add gdp to every (country, year) row of df_co2 with a left join, then
# discard any row that still has a missing value in any column.
data = (
    df_co2
    .merge(df_gdp, how='left', on=['country', 'year'])
    .dropna()
)
data.head()

Unnamed: 0,country,region,year,co2,gdp
0,Afghanistan,South Asia,1964,0.0863,1182.0
1,Afghanistan,South Asia,1965,0.101,1182.0
2,Afghanistan,South Asia,1966,0.108,1168.0
3,Afghanistan,South Asia,1967,0.124,1173.0
4,Afghanistan,South Asia,1968,0.116,1187.0


In [13]:
# Copy the co2 and gdp columns of data into standalone numpy arrays.
np_co2, np_gdp = (np.array(data[name]) for name in ('co2', 'gdp'))

In [14]:
# Correlation-coefficient matrix of the two arrays; the off-diagonal entries
# are the correlation between co2 and gdp.
np.corrcoef(np_co2, np_gdp) 

array([[1.        , 0.78219731],
       [0.78219731, 1.        ]])

In [15]:
from bokeh.io import curdoc, output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource,CategoricalColorMapper, Slider
from bokeh.palettes import Spectral6
from bokeh.layouts import widgetbox, row

In [16]:
# Direct Bokeh output to the notebook so that show() renders inline.
output_notebook()

In [17]:
# Distinct region names, in order of first appearance, as a plain Python list.
regions_list = data['region'].unique().tolist()

In [18]:
# Map each region name to one colour of the six-colour Spectral palette.
color_mapper = CategoricalColorMapper(
    palette=Spectral6,
    factors=regions_list,
)

In [19]:
# Seed the plot's data source with the 1964 rows of data. Each source field
# is paired with the data column it is filled from.
source = ColumnDataSource(data={
    field: data.loc[data['year'] == 1964, column]
    for field, column in [
        ('x', 'gdp'),
        ('y', 'co2'),
        ('country', 'country'),
        ('region', 'region'),
    ]
})

In [20]:
# x-axis bounds: smallest and largest gdp over every year in data.
xmin = float(data['gdp'].min())
xmax = float(data['gdp'].max())

In [21]:
# y-axis bounds: smallest and largest co2 over every year in data.
ymin = float(data['co2'].min())
ymax = float(data['co2'].max())

In [22]:
# Create the figure. Both axis ranges are fixed to the bounds computed over the
# whole data set rather than left to auto-fit the points being shown.
# NOTE(review): the y axis is logarithmic, so ymin must be > 0 - confirm
# data.co2 contains no zero values.
plot = figure(
    title='CO2 Emissions vs GDP in 1964',
    plot_width=1000,
    plot_height=600,
    x_range=(xmin, xmax),
    y_range=(ymin, ymax),
    y_axis_type='log',
)


In [23]:
# Draw one circle per row of source, coloured by region through color_mapper
# and labelled by region in the legend.
# NOTE(review): later Bokeh releases deprecate the `legend=` keyword in favour
# of `legend_field=` / `legend_group=` - confirm against the installed version.
plot.circle(
    x='x',
    y='y',
    source=source,
    size=7,
    fill_alpha=0.8,
    color={'field': 'region', 'transform': color_mapper},
    legend='region',
)



In [24]:
# Anchor the legend in the bottom-right corner of the plot.
plot.legend.location = 'bottom_right'

In [25]:
# Label the x axis, which shows the gdp values.
plot.xaxis.axis_label = 'Income Per Person'

In [26]:
# Label the y axis, which shows the co2 values.
plot.yaxis.axis_label = 'CO2 Emissions (tons per person)'

In [27]:
# Render the plot as built so far. The slider, its callback and the hover tool
# are only created in the cells that follow, so they are not part of this output.
show(plot)

In [28]:
# Year selector covering every year present in data, in one-year steps,
# initially positioned on the earliest year.
slider = Slider(
    title='Year',
    start=int(data['year'].min()),
    end=int(data['year'].max()),
    value=int(data['year'].min()),
    step=1,
)

In [29]:
def update_plot(attr, old, new):
    """Refresh the scatter plot for the year currently selected on the slider.

    The (attr, old, new) signature is the one used for callbacks registered
    with ``slider.on_change``. None of the three arguments is used; the year
    is read from ``slider.value`` instead.

    Replaces ``source.data`` with the gdp, co2, country and region values of
    the rows of ``data`` for that year, and updates the plot title to match.
    """
    selected_year = slider.value
    # Select the matching rows once and reuse them for every field.
    year_rows = data[data['year'] == selected_year]
    source.data = {
        'x': year_rows['gdp'],
        'y': year_rows['co2'],
        'country': year_rows['country'],
        'region': year_rows['region'],
    }
    plot.title.text = 'CO2 Emissions vs GDP in %d' % selected_year

In [30]:
# Call update_plot whenever the slider's value changes.
# NOTE(review): Python callbacks registered with on_change presumably only fire
# when the document is served by a Bokeh server, not from a plain show() in the
# notebook - confirm how this notebook is meant to be run.
slider.on_change('value', update_plot)

In [31]:
# Arrange the slider (wrapped in a widget box) and the plot side by side in one row.
# NOTE(review): widgetbox is deprecated in later Bokeh releases in favour of
# column - confirm against the installed version.
layout = row(widgetbox(slider), plot)



In [32]:
# Attach the layout to the current Bokeh document.
# NOTE(review): this does not display anything in the notebook by itself;
# presumably intended for running the code as a Bokeh server app - confirm.
curdoc().add_root(layout)

In [33]:
# Tooltip shown when hovering a point: each entry pairs a label with the
# source field ('@name') whose value is displayed.
hover = HoverTool(
    tooltips=[
        ('Country', '@country'),
        ('GDP', '@x'),
        ('CO2 Emission', '@y'),
    ],
)


In [34]:
# Register the hover tool on the plot. This runs after the earlier show(plot)
# cell, so that rendered output does not include the tool.
plot.add_tools(hover)