# Homework 4. Solution key

In [1]:
import numpy as np
import pandas as pd
from pandas.io import wb

# Show every float in printed frames with two decimals, padded to six characters.
# The fully qualified option key is used because abbreviated keys are resolved
# by pattern matching and can become ambiguous when pandas adds new options.
pd.set_option('display.float_format', '{:6.2f}'.format)

In [2]:
# Search the World Bank catalogue for indicators matching 'internet' and keep
# only the first two columns of the result (shown as id and name in the output).
matches = wb.search('internet')
df = matches.iloc[:, :2]
print(df.shape)
print(df)

(25, 2)
                     id                                               name
52          2.0.cov.Int                                Coverage: Internet 
77          2.0.hoi.Int                                     HOI: Internet 
5621        IT.NET.BNDW            International Internet bandwidth (Mbps)
5622     IT.NET.BNDW.PC  International Internet bandwidth (bits per per...
5623     IT.NET.CONN.CD  Fixed broadband Internet connection charge (cu...
5624     IT.NET.CONN.CN  Fixed broadband Internet connection charge (cu...
5625        IT.NET.SECR                            Secure Internet servers
5626     IT.NET.SECR.P6     Secure Internet servers (per 1 million people)
5627      IT.NET.SUB.CD  Fixed broadband Internet monthly subscription ...
5628      IT.NET.SUB.CN  Fixed broadband Internet monthly subscription ...
5629        IT.NET.USER                                     Internet users
5630  IT.NET.USER.FE.ZS                         Internet users, female (%)
5631  IT.NET.USER

# Q1

Download the following data from the World Bank "World Development Indicators" repository (http://databank.worldbank.org/data/home.aspx):

- inflation ('FP.CPI.TOTL.ZG')
- GDP per capita ('NY.GDP.PCAP.CD')
- Internet users ('IT.NET.USER.P2')

Use the pandas remote data access module (http://pandas.pydata.org/pandas-docs/version/0.16.2/remote_data.html#remote-data-wb).

In [3]:
# World Bank indicator code -> readable column name (used later for renaming).
ind = {
    'FP.CPI.TOTL.ZG': 'Inflation',
    'NY.GDP.PCAP.CD': 'GDP per capita',
    'IT.NET.USER.P2': 'Internet users',
}

# Fetch all three indicators for every country over 2000-2014.
codes = ind.keys()
downloaded = wb.download(country='all', indicator=codes, start=2000, end=2014)

print(downloaded.head())

- Print the shape of the dataset (number of rows and columns).
- Print the first three and last three rows in the dataset.
- Print basic descriptive statistics.

In [4]:
# Shape first, then both ends of the frame, then the summary statistics.
for summary in (downloaded.shape,
                downloaded.head(3),
                downloaded.tail(3),
                downloaded.describe()):
    print(summary)

(3735, 3)
                 FP.CPI.TOTL.ZG  IT.NET.USER.P2  NY.GDP.PCAP.CD
country    year                                                
Arab World 2014            2.79           34.52         7412.52
           2013            3.16           32.34         7540.46
           2012            4.25           29.95         7442.14
               FP.CPI.TOTL.ZG  IT.NET.USER.P2  NY.GDP.PCAP.CD
country  year                                                
Zimbabwe 2002             nan            3.99          499.72
         2001             nan            0.80          537.72
         2000             nan            0.40          535.20
       FP.CPI.TOTL.ZG  IT.NET.USER.P2  NY.GDP.PCAP.CD
count         3102.00         3480.00         3422.00
mean             6.17           26.31        12279.80
std             15.00           26.38        19444.36
min            -18.11            0.00          106.02
25%              2.17            3.77         1151.95
50%              3.99           16.1

- Rename columns to give them some meaningful names. Print first five rows of the dataset.

In [5]:
# rename() hands back a new frame, so `downloaded` keeps its raw indicator codes
# while `data` carries the readable column names from `ind`.
data = downloaded.rename(columns=ind)

print(data.head())

                 Inflation  Internet users  GDP per capita
country    year                                           
Arab World 2014       2.79           34.52         7412.52
           2013       3.16           32.34         7540.46
           2012       4.25           29.95         7442.14
           2011       4.64           26.53         6900.38
           2010       3.95           24.54         5949.41


Note that the index level 'year' is a `string`.

- Convert it to a `DatetimeIndex` and name it 'date'. Show the first three values.

In [6]:
# The 'year' index level holds year strings, so parse them with an explicit
# '%Y' format instead of asking pandas to guess the format; every value then
# maps to January 1st of its year.
years = pd.to_datetime(data.index.get_level_values('year'), format='%Y')

# Add the parsed dates as a new 'date' index level, discard the old string
# level, and sort so the rows run country by country in chronological order.
data = (data.assign(date=years)
            .set_index('date', append=True)
            .reset_index('year', drop=True)
            .sort_index())

print(data.head())
print(data.index.get_level_values('date')[:3])

                        Inflation  Internet users  GDP per capita
country     date                                                 
Afghanistan 2000-01-01        nan             nan             nan
            2001-01-01        nan            0.00          119.90
            2002-01-01        nan            0.00          192.15
            2003-01-01        nan            0.09          203.65
            2004-01-01        nan            0.11          224.91
DatetimeIndex(['2000-01-01', '2001-01-01', '2002-01-01'], dtype='datetime64[ns]', name='date', freq=None, tz=None)


- Compute average inflation and GDP per capita over time for each country. Drop missing values. Sort index by inflation. Print the first five rows.

In [7]:
# Average the two indicators over time for each country, then keep only the
# countries for which both averages are available.
country_means = data.groupby(level=['country'])[['Inflation', 'GDP per capita']].mean()
aggregated = country_means.dropna()
# Order rows from lowest to highest average inflation; the head()/tail()
# selections in later cells depend on this ordering.
aggregated = aggregated.sort_index(by='Inflation')

print(aggregated.head())

                      Inflation  GDP per capita
country                                        
Japan                     -0.03        37538.90
Brunei Darussalam          0.63        29596.88
Switzerland                0.64        61883.90
Hong Kong SAR, China       1.26        30344.85
Sweden                     1.31        46526.94


- Create a `DataFrame`, indexed by country, that contains only the five lowest and five highest average inflation values. Print it.

In [8]:
# `aggregated` is ordered by inflation, so its first and last rows hold the
# lowest and highest values respectively.
inflation = aggregated[['Inflation']]
table = pd.concat([inflation.head(5), inflation.tail(5)])

print(table)

                      Inflation
country                        
Japan                     -0.03
Brunei Darussalam          0.63
Switzerland                0.64
Hong Kong SAR, China       1.26
Sweden                     1.31
Serbia                    19.71
Venezuela, RB             34.21
Belarus                   35.27
Angola                    57.15
Congo, Dem. Rep.          73.39


- Categorize countries into 'high', 'medium', and 'low' groups according to the average value of 'GDP per capita'.
- Compute the average inflation for each group. Print the result.

In [9]:
labels = ['low', 'medium', 'high']
# Split countries into three quantile-based bins of average GDP per capita.
gdp_tercile = pd.qcut(aggregated['GDP per capita'], 3, labels=labels)
aggregated['GDP group'] = gdp_tercile

# Mean of the country-level average inflation within each GDP bin.
by_gdp_group = aggregated.groupby('GDP group')
table = by_gdp_group[['Inflation']].mean()

print(table)

           Inflation
GDP group           
low             8.60
medium          6.65
high            3.61


- Create a `DataFrame` with
    - two-level index, 'GDP group' and 'country',
    - two columns, 'Inflation' and 'GDP per capita'.
- Keep only the two smallest and two largest inflation values in each 'GDP group'. Print the result.

In [10]:
def _two_from_each_end(group):
    """Return the first two and the last two rows of ``group``, stacked."""
    return pd.concat([group.head(2), group.tail(2)])


# NOTE: this picks inflation extremes only because `aggregated` was sorted by
# inflation in an earlier cell; head()/tail() simply take rows by position.
table = aggregated.groupby('GDP group').apply(_two_from_each_end)
# The group label is already part of the index, so drop the duplicate column.
table = table.drop('GDP group', axis=1)

print(table)

                             Inflation  GDP per capita
GDP group country                                     
low       Senegal                 1.69          845.85
          Zimbabwe                2.15          583.54
          Guinea                 18.56          402.39
          Congo, Dem. Rep.       73.39          287.66
medium    Morocco                 1.63         2283.89
          Dominica                1.85         6088.47
          Belarus                35.27         4404.44
          Angola                 57.15         2885.26
high      Japan                  -0.03        37538.90
          Brunei Darussalam       0.63        29596.88
          Turkey                 18.37         7819.24
          Venezuela, RB          34.21         8675.05


- Compute log percentage growth rate of GDP per capita. Print first five rows.

In [11]:
# Work on an independent deep copy so that adding columns to `table` leaves `data` untouched.
table = data.copy(deep=True)

def growth(series):
    """Return the log percentage growth rate of ``series``.

    Each element is 100 times the difference between the natural log of the
    current value and the natural log of the previous value. The first element
    has no predecessor, so its growth rate is NaN.
    """
    log_level = np.log(series)
    return 100 * log_level.diff()

# Compute growth separately within each country so that a country's first
# observation is never differenced against another country's last one.
# transform() is used rather than apply() because it always returns a result
# aligned row-for-row with `table`, which keeps the column assignment safe
# even when apply() would add the group keys to the result's index.
table['GDP growth'] = table.groupby(level='country')['GDP per capita'].transform(growth)

print(table.head())

                        Inflation  Internet users  GDP per capita  GDP growth
country     date                                                             
Afghanistan 2000-01-01        nan             nan             nan         nan
            2001-01-01        nan            0.00          119.90         nan
            2002-01-01        nan            0.00          192.15       47.16
            2003-01-01        nan            0.09          203.65        5.81
            2004-01-01        nan            0.11          224.91        9.93


- Compute averages of all indicators over time for each country. Print first five rows.

In [12]:
# Collapse the time dimension: one row per country holding the mean of every column.
by_country = table.groupby(level='country')
table = by_country.mean()

print(table.head())

                Inflation  Internet users  GDP per capita  GDP growth
country                                                              
Afghanistan          8.13            2.68          407.55       13.11
Albania              2.64           24.40         3206.71        9.67
Algeria              3.71            8.66         3707.43        8.15
American Samoa        nan             nan             nan         nan
Andorra               nan           57.61        37094.66        5.32


- Categorize countries into 'high', 'medium', and 'low' groups according to the average value of 'Internet users' and 'Inflation' (separately).
- Compute the average GDP growth for each group.
- Print the result as a 3x3 table with inflation groups in rows and internet groups in columns.

In [13]:
labels = ['low', 'medium', 'high']
# Bin the countries into three quantile-based groups, once per indicator.
internet_tercile = pd.qcut(table['Internet users'], 3, labels=labels)
inflation_tercile = pd.qcut(table['Inflation'], 3, labels=labels)
table['Internet users group'] = internet_tercile
table['Inflation group'] = inflation_tercile

# Mean GDP growth for every (internet group, inflation group) pair, pivoted so
# that internet groups become the columns and inflation groups stay as rows.
cell_means = table.groupby(['Internet users group', 'Inflation group'])['GDP growth'].mean()
table = cell_means.unstack('Internet users group')

print(table)

Internet users group    low  medium   high
Inflation group                           
low                    5.72    6.87   5.07
medium                 7.48    7.96   7.29
high                   7.48    9.32   8.34
