## Using World Development Indicators

In [2]:
import os

import numpy as np
import pandas as pd
import plotly
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools

In [3]:
#Let's take a look at the data
#The CSV location can be overridden with the WDI_INDICATORS_CSV environment
#variable, so the notebook is not tied to one machine's directory layout.
#When the variable is not set, the original path is used unchanged.
INDICATORS_CSV = os.environ.get(
    'WDI_INDICATORS_CSV',
    'C:/Users/Josh/Desktop/Python for Data Science/Week 5 Visualization/Indicators.csv'
)
data = pd.read_csv(INDICATORS_CSV)
data.head()

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
0,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1960,133.5609
1,Arab World,ARB,Age dependency ratio (% of working-age populat...,SP.POP.DPND,1960,87.7976
2,Arab World,ARB,"Age dependency ratio, old (% of working-age po...",SP.POP.DPND.OL,1960,6.634579
3,Arab World,ARB,"Age dependency ratio, young (% of working-age ...",SP.POP.DPND.YG,1960,81.02333
4,Arab World,ARB,Arms exports (SIPRI trend indicator values),MS.MIL.XPRT.KD,1960,3000000.0


In [4]:
#Report the table's dimensions and the number of distinct values in each key column
row_count, column_count = data.shape
print("The data has", row_count, "rows and", column_count, "columns.")
for key_column, noun in (('CountryCode', "countries."),
                         ('IndicatorCode', "indicators."),
                         ('Year', "years.")):
    distinct_values = set(data[key_column])
    print("There are", len(distinct_values), noun)

The data has 5656458 rows and 6 columns.
There are 247 countries.
There are 1344 indicators.
There are 56 years.


In [5]:
#Isolate indicator codes for life expectancy and GDP
#Each result is a boolean Series with one entry per row of `data`.
#The patterns are raw strings so that `\.` reaches the regex engine as an
#escaped literal dot without Python warning about an invalid string escape.
#The trailing `$` anchors the match to the end of the indicator code.
gdp_cap = data['IndicatorCode'].str.contains(r'NY\.GDP\.PCAP\.KD$')
total_exp = data['IndicatorCode'].str.contains(r'SP\.DYN\.LE00\.IN$')

In [6]:
#Apply the boolean masks to pull out the GDP and life expectancy rows
gdp_stage = data.loc[gdp_cap]
life_stage = data.loc[total_exp]

#Report, column by column, whether either subset contains any missing values
for subset in (gdp_stage, life_stage):
    print(subset.isna().any())

CountryName      False
CountryCode      False
IndicatorName    False
IndicatorCode    False
Year             False
Value            False
dtype: bool
CountryName      False
CountryCode      False
IndicatorName    False
IndicatorCode    False
Year             False
Value            False
dtype: bool


In [19]:
#Make box plots: one panel per indicator, each titled with its indicator name
life_title = str(life_stage['IndicatorName'].iloc[0])
gdp_title = str(gdp_stage['IndicatorName'].iloc[0])
fig = tools.make_subplots(rows=1, cols=2, subplot_titles=(life_title, gdp_title))

#Life expectancy goes in the left panel, GDP per capita in the right one
life_expectancy = go.Box(y=life_stage['Value'])
fig.append_trace(life_expectancy, row=1, col=1)
GDPPC = go.Box(y=gdp_stage['Value'])
fig.append_trace(GDPPC, row=1, col=2)

#Label the y axis of each panel with its unit
fig['layout']['yaxis1'].update(title='Age in Years')
fig['layout']['yaxis2'].update(title='2005 Dollars')

py.iplot(fig)

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]



![Box Plot](C:/Users/Josh/Downloads/Box%20Plot.png)

In [8]:
#Print the first and last year covered by each indicator subset
for label, subset in (("GDP Min Year =", gdp_stage),
                      ("Life Exp Min Year =", life_stage)):
    years = subset['Year']
    print(label, years.min(), "max:", years.max())

GDP Min Year = 1960 max: 2014
Life Exp Min Year = 1960 max: 2013


In [9]:
#Keep only the GDP rows for the years 2000 through 2013
gdp_years = gdp_stage['Year']
gdp_stage_trunc = gdp_stage[gdp_years.gt(1999) & gdp_years.lt(2014)]
print("GDP Min Year = ", gdp_stage_trunc['Year'].min(), "max: ", gdp_stage_trunc['Year'].max())

#Keep only the life expectancy rows for the same years
#NOTE: this replaces life_stage itself; the cells below read the filtered frame under that name
life_years = life_stage['Year']
life_stage = life_stage[life_years.gt(1999) & life_years.lt(2014)]
print("Life Exp Min Year = ", life_stage['Year'].min(), "max: ", life_stage['Year'].max())

GDP Min Year =  2000 max:  2013
Life Exp Min Year =  2000 max:  2013


In [10]:
#Count the distinct country names present in each filtered set
gdp_names = gdp_stage_trunc['CountryName']
life_names = life_stage['CountryName']
gdp_country_count = len(set(gdp_names))
life_country_count = len(set(life_names))
print('GDP Countries: ' + str(gdp_country_count))
print('Life Exp. Countries: ' + str(life_country_count))

#List every country name that appears in exactly one of the two sets
print(np.setxor1d(gdp_names, life_names))

GDP Countries: 232
Life Exp. Countries: 240
['Andorra' 'Curacao' 'French Polynesia' 'Guam' 'Korea, Dem. Rep.' 'Monaco'
 'Myanmar' 'New Caledonia' 'Sint Maarten (Dutch part)' 'Somalia'
 'South Sudan' 'St. Martin (French part)' 'Tuvalu' 'Virgin Islands (U.S.)']


In [11]:
#The two sets do not cover the same countries.
#Show, for each set, the countries it has that the other one lacks.
gdp_names = gdp_stage_trunc['CountryName']
life_names = life_stage['CountryName']
gdp_only = np.setdiff1d(gdp_names, life_names)
life_only = np.setdiff1d(life_names, gdp_names)
print("We have GDP information but not life expectancy for:", gdp_only)
print("We have life expectancy information but not GDP for:", life_only)

We have GDP information but not life expectancy for: ['Andorra' 'Monaco' 'Tuvalu']
We have life expectancy information but not GDP for: ['Curacao' 'French Polynesia' 'Guam' 'Korea, Dem. Rep.' 'Myanmar'
 'New Caledonia' 'Sint Maarten (Dutch part)' 'Somalia' 'South Sudan'
 'St. Martin (French part)' 'Virgin Islands (U.S.)']


In [12]:
#Work out which countries appear in only one of the two sets...
life_only = np.setdiff1d(life_stage['CountryName'], gdp_stage_trunc['CountryName'])
gdp_only = np.setdiff1d(gdp_stage_trunc['CountryName'], life_stage['CountryName'])

#...and drop them, so both frames cover the same countries
life_stage_trunc = life_stage.loc[~life_stage['CountryName'].isin(life_only)]
gdp_stage_trunc = gdp_stage_trunc.loc[~gdp_stage_trunc['CountryName'].isin(gdp_only)]

#Should print an empty array: no country is left in just one frame
print(np.setxor1d(gdp_stage_trunc['CountryName'], life_stage_trunc['CountryName']))

[]


In [13]:
#Check to see if each country is represented the same number of times
#newdf holds, per country, (number of GDP rows) - (number of life expectancy rows);
#a positive value means more GDP rows, a negative value more life expectancy rows.
newdf = gdp_stage_trunc.groupby(['CountryName'])['CountryCode'].count() - life_stage_trunc.groupby(['CountryName'])['CountryCode'].count()
#Show only the countries whose counts differ. A boolean mask is used because
#Series.nonzero() is deprecated and no longer exists in newer pandas releases.
newdf[newdf != 0]

CountryName
Afghanistan             -2
Angola                 -13
Aruba                   -4
Channel Islands         -6
Dominica                13
Eritrea                 -2
Faeroe Islands         -13
Greenland               -2
Isle of Man              7
Liechtenstein           -4
Maldives                -1
Marshall Islands        13
Palau                   12
San Marino              -3
Seychelles               2
St. Kitts and Nevis     13
Syrian Arab Republic    -6
Name: CountryCode, dtype: int64

In [14]:
#Remove countries that are not represented the same number of times
#The list is taken from the per-country count differences in newdf instead of
#being typed out by hand, so it stays correct if the underlying data changes.
countriesrm = newdf[newdf != 0].index.tolist()
life_stage_trunc = life_stage_trunc[~life_stage_trunc.CountryName.isin(countriesrm)]
gdp_stage_trunc = gdp_stage_trunc[~gdp_stage_trunc.CountryName.isin(countriesrm)]

In [15]:
#Create a scatterplot of GDP vs Life Exp.
#Pair the two indicators explicitly on (CountryName, Year). Passing the two
#'Value' columns from separate frames would pair points by row position only,
#which is correct only if both frames hold exactly the same country-years in
#the same order; equal row counts per country do not guarantee that.
scatter_pairs = pd.merge(
    life_stage_trunc[['CountryName', 'Year', 'Value']],
    gdp_stage_trunc[['CountryName', 'Year', 'Value']],
    on=['CountryName', 'Year'],
    how='inner',
    suffixes=('_life', '_gdp')
)

trace = go.Scatter(
    x = scatter_pairs['Value_life'],
    y = scatter_pairs['Value_gdp'],
    mode = 'markers'
)

#The title reports the year range of the points actually plotted;
#the axis titles reuse the indicator names stored in the data.
layout = go.Layout(
    title= 'Life Expectancy vs. GDP Per Capita <br>' + str(scatter_pairs['Year'].min()) + ' to ' +
               str(scatter_pairs['Year'].max()),
    xaxis= dict(
        title= str(life_stage_trunc['IndicatorName'].iloc[0]),
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= str(gdp_stage_trunc['IndicatorName'].iloc[0]),
        ticklen= 5,
        gridwidth= 2,
    ),
    showlegend= False
)

fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig)

In [16]:
#Create a dataframe with mean GDP and Life Exp.
#Left join: every GDP row is kept and matched to the life expectancy row for the
#same country and year. The overlapping 'Value' columns come out as Value_x (GDP)
#and Value_y (life expectancy) and are renamed to readable names.
df_merge = pd.merge(
    gdp_stage_trunc,
    life_stage_trunc,
    how='left',
    on=['CountryName', 'Year']
).rename(columns={'Value_x': 'GDPPC', 'Value_y': 'Life Expectancy'})

#Average both indicators per country and round to two decimals.
#The columns are selected with a list; selecting groupby columns with a bare
#tuple is not accepted by current pandas.
map_data = (
    df_merge
    .groupby(['CountryName'], as_index=False)[['GDPPC', 'Life Expectancy']]
    .mean()
    .round(2)
)
map_data.head(5)

Unnamed: 0,CountryName,GDPPC,Life Expectancy
0,Albania,2977.19,76.2
1,Algeria,3041.94,72.59
2,Antigua and Barbuda,11980.66,74.66
3,Arab World,3913.8,69.23
4,Argentina,6163.91,74.91


In [17]:
#Draw a world choropleth coloured by mean GDP per capita, with the mean life
#expectancy shown in the hover text.

#Build the hover label from string copies of the numeric columns. The columns
#themselves stay numeric, so z (the colour value) is passed as numbers instead
#of relying on numeric strings being converted downstream.
map_data['text'] = ('GDPPC: $' + map_data['GDPPC'].astype(str) + '<br>' +
                    'Life Exp: ' + map_data['Life Expectancy'].astype(str))

#Named choropleth_traces rather than `data`, so the Indicators DataFrame loaded
#at the top of the notebook is not overwritten and earlier cells stay re-runnable.
choropleth_traces = [ dict(
        type = 'choropleth',
        locations = map_data['CountryName'],
        locationmode = 'country names',
        z = map_data['GDPPC'],
        text = map_data['text'],
        colorscale = [[0,"rgb(5, 10, 172)"],[0.35,"rgb(40, 60, 190)"],[0.5,"rgb(70, 100, 245)"],
            [0.6,"rgb(90, 120, 245)"],[0.7,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"]],
        autocolorscale = False,
        reversescale = True,
        marker = dict(
            line = dict (
                color = 'rgb(180,180,180)',
                width = 0.5
            ) ),
        colorbar = dict(
            tickprefix = '$',
            title = 'GDP per Capita'),
      ) ]

layout = dict(
    title = '',
    geo = dict(
        showframe = False,
        showcoastlines = False,
        projection = dict(
            #plotly's projection type names are lowercase
            type = 'mercator'
        )
    )
)

fig = dict(data=choropleth_traces, layout=layout)
py.iplot(fig)