In [1]:
import pandas as pd
import numpy as np

The file `gapminder_with_missing.tsv` contains missing values, which are encoded as `-1`.

In [2]:
# Load the tab-separated gapminder file; the -1 placeholders are kept as-is here.
data = pd.read_csv(filepath_or_buffer='data/gapminder_with_missing.tsv', sep='\t')

In [10]:
# Peek at the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,-1.0
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


Calculate the average `lifeExp`, `pop`, and `gdpPercap`

In [5]:
# Mean life expectancy, with the -1 placeholders still counted as real values.
data.loc[:, 'lifeExp'].mean()

55.81203833333336

In [6]:
# Same mean computed by hand: total of the column divided by the number of rows.
sum(data.loc[:, 'lifeExp']) / len(data.index)

55.81203833333336

In [7]:
# Mean population, with the -1 placeholders still counted as real values.
data.loc[:, 'pop'].mean()

27992515.794014085

In [8]:
# Mean GDP per capita, with the -1 placeholders still counted as real values.
data.loc[:, 'gdpPercap'].mean()

6833.852739445185

The values you calculated are not accurate because the `-1` entries were included in the calculation, even though `-1` is only a placeholder that the file uses to mark a missing entry. Let us deal with that.

Replace all `-1` with `NaN`. *Hint:* use the `replace` function, i.e., `data.replace(...)`.

In [28]:
# Build a new frame in which every -1 placeholder becomes NaN; `data` is left untouched.
data2 = data.replace(to_replace=-1, value=np.nan)

Calculate the average `lifeExp`, `pop`, and `gdpPercap` again

In [14]:
# Mean life expectancy after the placeholders were turned into NaN.
data2.loc[:, 'lifeExp'].mean()

59.391586600124796

In [15]:
# Mean population after the placeholders were turned into NaN.
data2.loc[:, 'pop'].mean()

29774810.870786518

In [16]:
# Mean GDP per capita after the placeholders were turned into NaN.
data2.loc[:, 'gdpPercap'].mean()

7201.590641938526

In [31]:
# Manual mean over the non-missing life-expectancy values only.
# The NaN-free column is built once and reused for both the total and the count,
# instead of calling dropna() twice on the same column.
life_exp_present = data2['lifeExp'].dropna()
sum(life_exp_present) / len(life_exp_present)

59.391586600124796

In [34]:
# Drop the missing entries of this one column first, then average what is left.
data2.loc[:, 'lifeExp'].dropna().mean()

59.391586600124796

In [37]:
# NOTE: not the same calculation as the cells above. DataFrame.dropna() removes
# every row that has a NaN in *any* column (e.g. a missing gdpPercap), so this
# mean would be taken over fewer lifeExp values.
#data2.dropna()['lifeExp'].mean()

Which rows contain missing values?
- Hint: compare the index of the `NaN`-replaced DataFrame with the index of its `dropna()` result.

In [40]:
# solution 1
# Brute force: look at every single cell and remember where a missing value occurs.
rm = set()  # row labels that hold at least one missing value
cm = set()  # column names that hold at least one missing value
for i in data2.index:
    for c in data2.columns:
        v = data2.loc[i, c]
        if not pd.isnull(v):
            # Cell is populated; nothing to record.
            continue
        rm.add(i)
        cm.add(c)

In [44]:
# Would display the rows collected in `rm`.
# NOTE(review): `rm` is a set, and newer pandas versions reject set indexers in
# .loc -- presumably why this is disabled; a list such as sorted(rm) should work.
#data2.loc[rm]

In [45]:
cm

{'gdpPercap', 'lifeExp', 'pop'}

Which columns contain missing values?

In [46]:
# solution 2

In [50]:
# Row labels that exist in data2 but disappear once incomplete rows are dropped.
data2.index.difference(data2.dropna(axis=0, how='any').index)

Int64Index([   2,    7,   29,   32,   39,   42,   45,   58,   61,   68,
            ...
            1648, 1649, 1655, 1662, 1663, 1666, 1673, 1677, 1689, 1691],
           dtype='int64', length=279)

In [52]:
# Column names that exist in data2 but disappear once incomplete columns are dropped.
data2.columns.difference(data2.dropna(axis='columns', how='any').columns)

Index(['gdpPercap', 'lifeExp', 'pop'], dtype='object')

In [53]:
cm

{'gdpPercap', 'lifeExp', 'pop'}

In [54]:
# solution 3

In [55]:
# Boolean mask with the same shape as data2: True wherever a value is missing.
data2.isna()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,True
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
1699,False,False,False,False,False,False
1700,False,False,False,False,False,False
1701,False,False,False,False,False,False
1702,False,False,False,False,False,False


In [66]:
# One flag per column: does the column hold at least one missing value?
s = data2.isna().any(axis=0)
# Keep only the flagged entries and read off their labels (the column names).
s.loc[s].index

Index(['lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [68]:
# One flag per row: does the row hold at least one missing value?
s = data2.isna().any(axis='columns')
# Keep only the flagged entries and read off their labels (the row index).
s.loc[s].index

Int64Index([   2,    7,   29,   32,   39,   42,   45,   58,   61,   68,
            ...
            1648, 1649, 1655, 1662, 1663, 1666, 1673, 1677, 1689, 1691],
           dtype='int64', length=279)

In [56]:
# Built-in any(): True as soon as at least one element is truthy.
any([True, False, False])

True

In [57]:
# any() is False only when no element is truthy.
any([False, False, False])

False

In [59]:
# Built-in all(): True only when every element is truthy.
all([True, True, True])

True