In [16]:
import pandas as pd
import numpy as np

The file `gapminder_with_corrupt.tsv` contains the same gapminder data, but some of its numerical entries are corrupt.

In [17]:
# Read the tab-separated gapminder table into a DataFrame.
data = pd.read_csv(
    'data/gapminder_with_corrupt.tsv',
    sep='\t',
)

In [18]:
data.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.4453145
1,Afghanistan,Asia,1957,30.332,9240934,820.8530296nn
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.1971382
4,Afghanistan,Asia,1972,36.088,13079460,739.9811058


Calculate the average `lifeExp`, `pop`, and `gdpPercap`

You will need to detect the corrupt values and exclude them from the calculation

*HINT:* replace the corrupt values with `NaN`.

In [19]:
data.dtypes

country      object
continent    object
year          int64
lifeExp      object
pop          object
gdpPercap    object
dtype: object

### Solution 1: manual detection and conversion of corrupt values

In [20]:
def is_number(n):
    """Tell whether ``n`` can safely be passed to ``float()`` as a plain number.

    Parameters
    ----------
    n : object
        Usually a raw cell read from the file (a string). Values that are
        already numeric are accepted as well, so the check can be repeated on
        a column that has been converted before.

    Returns
    -------
    bool
        True for real numeric objects (``int``, ``float``, NumPy integer and
        floating scalars; ``bool`` is excluded) and for strings consisting of
        ASCII digits with at most one decimal point and at least one digit.
        False for everything else, including ``''``, ``'.'`` and ``'1.2.3'``.
    """
    if isinstance(n, bool):
        return False
    if isinstance(n, (int, float, np.integer, np.floating)):
        return True
    if not isinstance(n, str):
        return False
    # Split on the first '.' only: a second '.' stays inside `fraction` and is
    # then rejected by the digit check below.
    whole, _, fraction = n.partition('.')
    digits = whole + fraction
    # Compare against ASCII digits explicitly; str.isdigit() would also accept
    # characters such as superscripts that float() cannot parse.
    return digits != '' and all(c in '0123456789' for c in digits)

In [21]:
def convert_to_number(n):
    """Convert one cell to ``float``, mapping anything unusable to ``NaN``.

    Parameters
    ----------
    n : object
        A raw cell value. Strings are validated with ``is_number`` first;
        non-string values are handed directly to ``float()``.

    Returns
    -------
    float
        The parsed value, or ``np.nan`` when the cell is a string rejected by
        ``is_number`` or when ``float()`` cannot convert it.
    """
    try:
        if isinstance(n, str) and not is_number(n):
            return np.nan
        # Non-string cells (e.g. values that are already floats when the cell
        # is re-run) skip the character check and go straight to float().
        return float(n)
    except (TypeError, ValueError):
        # float() refused the value (e.g. '' or None): treat it as corrupt
        # instead of aborting the whole column conversion.
        return np.nan

In [22]:
# Element-wise conversion; cells that fail the check become NaN.
data['lifeExp'] = data['lifeExp'].map(convert_to_number)

In [23]:
# Element-wise conversion; cells that fail the check become NaN.
data['pop'] = data['pop'].map(convert_to_number)

In [24]:
# Element-wise conversion; cells that fail the check become NaN.
data['gdpPercap'] = data['gdpPercap'].map(convert_to_number)

In [25]:
data.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333.0,779.445314
1,Afghanistan,Asia,1957,30.332,9240934.0,
2,Afghanistan,Asia,1962,31.997,10267083.0,853.10071
3,Afghanistan,Asia,1967,34.02,11537966.0,836.197138
4,Afghanistan,Asia,1972,36.088,13079460.0,739.981106


In [26]:
data.dtypes

country       object
continent     object
year           int64
lifeExp      float64
pop          float64
gdpPercap    float64
dtype: object

In [27]:
data['lifeExp'].mean()

59.443401121049476

In [28]:
data['pop'].mean()

29361346.398798797

In [29]:
data['gdpPercap'].mean()

7186.243429043534

In [30]:
# Cross-check of .mean(): average the non-NaN entries by hand.
valid_gdp = data['gdpPercap'].dropna()
sum(valid_gdp) / len(valid_gdp)

7186.243429043534

### Solution 2: using the pandas `to_numeric` function

In [31]:
# Start again from the raw file: Solution 1 above has already turned these
# columns into float64, so without a reload there would be no corrupt strings
# left for `to_numeric` to handle.
data = pd.read_csv('data/gapminder_with_corrupt.tsv', sep='\t')

for column in ['lifeExp', 'pop', 'gdpPercap']:
    # errors='coerce' replaces every entry that cannot be parsed with NaN.
    data[column] = pd.to_numeric(data[column], errors='coerce')




In [32]:
data['lifeExp'].mean()

59.443401121049476

In [33]:
data['pop'].mean()

29361346.398798797

In [34]:
data['gdpPercap'].mean()

7186.243429043534