**Objetivos**

Este capítulo abordará:
1. O que é um valor ausente.
2. Como os valores ausentes são criados.
3. Como recodificar e fazer cálculos com valores ausentes.

In [28]:
# NumPy 2.0 removed the `NaN` and `NAN` aliases from its namespace; only the
# lowercase `nan` can be imported on every NumPy version.
from numpy import nan
import numpy as np
import pandas as pd

# Keep the alternative spellings used by the cells below pointing at the
# same float object.
NaN = NAN = nan

In [2]:
# NaN is not equal to anything, because the value is missing:
# comparing it with True evaluates to False.
NaN == True

False

In [3]:
# Comparing NaN with False also evaluates to False.
NaN == False

False

In [4]:
# NaN is not equal to zero either.
NaN == 0

False

In [5]:
# NaN is not equal to the empty string.
NaN == ""

False

In [6]:
# NaN is not even equal to itself, so `==` cannot be used to detect it.
NaN == NaN

False

In [8]:
# Test whether a value is missing (isna is the same function as isnull)
pd.isna(NaN)

True

In [9]:
# Test whether a value is NOT missing (notna is the same function as notnull)
pd.notna(NaN)

False

In [11]:
# read_csv has three parameters that control how missing values are read:
# 1. na_values: additional strings to recognize as missing (NaN);
# 2. keep_default_na: boolean, whether pandas' default set of missing-value
#    markers is used when parsing;
# 3. na_filter: boolean, whether missing-value markers are detected at all
#    (when False, no value is read as missing).
visited_file = pd.read_csv("survey_visited.csv")
visited_file.head(n=8)

Unnamed: 0,ident,site,dated
0,619,DR-1,1927-02-08
1,622,DR-1,1927-02-10
2,734,DR-3,1939-01-07
3,735,DR-3,1930-01-12
4,751,DR-3,1930-02-26
5,752,DR-3,
6,837,MSK-4,1932-01-14
7,844,DR-1,1932-03-22


In [12]:
# Same file, but with pandas' default missing-value markers switched off.
visited_file_2 = pd.read_csv(
    "survey_visited.csv",
    keep_default_na=False,
)
visited_file_2.head(n=8)

Unnamed: 0,ident,site,dated
0,619,DR-1,1927-02-08
1,622,DR-1,1927-02-10
2,734,DR-3,1939-01-07
3,735,DR-3,1930-01-12
4,751,DR-3,1930-02-26
5,752,DR-3,
6,837,MSK-4,1932-01-14
7,844,DR-1,1932-03-22


In [13]:
# Defaults switched off, with the empty string given explicitly as the
# only missing-value marker.
missing_markers = [""]
visited_file_3 = pd.read_csv(
    "survey_visited.csv",
    na_values=missing_markers,
    keep_default_na=False,
)
visited_file_3.head(n=8)

Unnamed: 0,ident,site,dated
0,619,DR-1,1927-02-08
1,622,DR-1,1927-02-10
2,734,DR-3,1939-01-07
3,735,DR-3,1930-01-12
4,751,DR-3,1930-02-26
5,752,DR-3,
6,837,MSK-4,1932-01-14
7,844,DR-1,1932-03-22


In [14]:
# Load the visits table with read_csv's default settings and display it.
visited = pd.read_csv("survey_visited.csv")
visited

Unnamed: 0,ident,site,dated
0,619,DR-1,1927-02-08
1,622,DR-1,1927-02-10
2,734,DR-3,1939-01-07
3,735,DR-3,1930-01-12
4,751,DR-3,1930-02-26
5,752,DR-3,
6,837,MSK-4,1932-01-14
7,844,DR-1,1932-03-22


In [15]:
# Load the survey readings table with read_csv's default settings and display it.
survey = pd.read_csv("survey_survey.csv")
survey

Unnamed: 0,taken,person,quant,reading
0,619,dyer,rad,9.82
1,619,dyer,sal,0.13
2,622,dyer,rad,7.8
3,622,dyer,sal,0.09
4,734,pb,rad,8.41
5,734,lake,sal,0.05
6,734,pb,temp,-21.5
7,735,pb,rad,7.22
8,735,,sal,0.06
9,735,,temp,-26.0


In [16]:
# Join each visit to its readings: visited.ident matches survey.taken.
vs = pd.merge(visited, survey, left_on="ident", right_on="taken")
vs

Unnamed: 0,ident,site,dated,taken,person,quant,reading
0,619,DR-1,1927-02-08,619,dyer,rad,9.82
1,619,DR-1,1927-02-08,619,dyer,sal,0.13
2,622,DR-1,1927-02-10,622,dyer,rad,7.8
3,622,DR-1,1927-02-10,622,dyer,sal,0.09
4,734,DR-3,1939-01-07,734,pb,rad,8.41
5,734,DR-3,1939-01-07,734,lake,sal,0.05
6,734,DR-3,1939-01-07,734,pb,temp,-21.5
7,735,DR-3,1930-01-12,735,pb,rad,7.22
8,735,DR-3,1930-01-12,735,,sal,0.06
9,735,DR-3,1930-01-12,735,,temp,-26.0


In [17]:
# Missing values in a Series
num_legs = pd.Series([4, nan], index=["goat", "amoeba"])
num_legs

goat      4.0
amoeba    NaN
dtype: float64

In [18]:
# Missing values in a DataFrame: one row per scientist, with a last
# column that holds only missing values.
scientists = pd.DataFrame(
    data=[
        ["Rosaline Franklin", "Chemist", "1920-07-25", "1958-04-16", NaN],
        ["William Gosset", "Statistician", "1876-06-13", "1937-10-16", nan],
    ],
    columns=["Name", "Occupation", "Born", "Died", "missing"],
)

scientists

Unnamed: 0,Name,Occupation,Born,Died,missing
0,Rosaline Franklin,Chemist,1920-07-25,1958-04-16,
1,William Gosset,Statistician,1876-06-13,1937-10-16,


In [19]:
# Reindexing: start from the average life expectancy per year
gapminder = pd.read_csv("gapminder.tsv", sep="\t")

life_exp_by_year = gapminder.groupby("year")["lifeExp"]
life_exp = life_exp_by_year.mean()
life_exp

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [21]:
# Take the subset of years after 2000
after_2000 = life_exp.index > 2000
y2000 = life_exp.loc[after_2000]
y2000

year
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [22]:
# Reindex over every year from 2000 to 2009; years absent from y2000
# come back as NaN.
y2000.reindex(index=range(2000, 2010))

year
2000          NaN
2001          NaN
2002    65.694923
2003          NaN
2004          NaN
2005          NaN
2006          NaN
2007    67.007423
2008          NaN
2009          NaN
Name: lifeExp, dtype: float64

In [24]:
ebola = pd.read_csv("country_timeseries.csv")
# Number of non-missing values in each column
ebola.notnull().sum()

Date                   122
Day                    122
Cases_Guinea            93
Cases_Liberia           83
Cases_SierraLeone       87
Cases_Nigeria           38
Cases_Senegal           25
Cases_UnitedStates      18
Cases_Spain             16
Cases_Mali              12
Deaths_Guinea           92
Deaths_Liberia          81
Deaths_SierraLeone      87
Deaths_Nigeria          38
Deaths_Senegal          22
Deaths_UnitedStates     18
Deaths_Spain            16
Deaths_Mali             12
dtype: int64

In [26]:
# Missing values per column = total rows minus non-missing values
num_rows = len(ebola)
non_missing = ebola.count()
num_missing = num_rows - non_missing
num_missing

Date                     0
Day                      0
Cases_Guinea            29
Cases_Liberia           39
Cases_SierraLeone       35
Cases_Nigeria           84
Cases_Senegal           97
Cases_UnitedStates     104
Cases_Spain            106
Cases_Mali             110
Deaths_Guinea           30
Deaths_Liberia          41
Deaths_SierraLeone      35
Deaths_Nigeria          84
Deaths_Senegal         100
Deaths_UnitedStates    104
Deaths_Spain           106
Deaths_Mali            110
dtype: int64

In [29]:
# Total number of missing values: count the True cells of the boolean mask
missing_mask = ebola.isnull()
np.count_nonzero(missing_mask)

1214

In [30]:
# Number of missing values in one particular column
guinea_missing = ebola["Cases_Guinea"].isnull()
np.count_nonzero(guinea_missing)

29

In [32]:
# fillna recodes missing values; here every missing value becomes 0
zero_filled = ebola.fillna(value=0)
print(zero_filled.iloc[:10, :5])

         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0    1/5/2015  289        2776.0            0.0            10030.0
1    1/4/2015  288        2775.0            0.0             9780.0
2    1/3/2015  287        2769.0         8166.0             9722.0
3    1/2/2015  286           0.0         8157.0                0.0
4  12/31/2014  284        2730.0         8115.0             9633.0
5  12/28/2014  281        2706.0         8018.0             9446.0
6  12/27/2014  280        2695.0            0.0             9409.0
7  12/24/2014  277        2630.0         7977.0             9203.0
8  12/21/2014  273        2597.0            0.0             9004.0
9  12/20/2014  272        2571.0         7862.0             8939.0


In [33]:
# ffill = forward fill: each missing value takes the last known value
# above it in the same column; leading gaps have nothing to copy and
# stay NaN.
# DataFrame.ffill() replaces fillna(method="ffill"), whose `method`
# argument is deprecated since pandas 2.1.
print(ebola.ffill().iloc[0:10, 0:5])

         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0    1/5/2015  289        2776.0            NaN            10030.0
1    1/4/2015  288        2775.0            NaN             9780.0
2    1/3/2015  287        2769.0         8166.0             9722.0
3    1/2/2015  286        2769.0         8157.0             9722.0
4  12/31/2014  284        2730.0         8115.0             9633.0
5  12/28/2014  281        2706.0         8018.0             9446.0
6  12/27/2014  280        2695.0         8018.0             9409.0
7  12/24/2014  277        2630.0         7977.0             9203.0
8  12/21/2014  273        2597.0         7977.0             9004.0
9  12/20/2014  272        2571.0         7862.0             8939.0


In [34]:
# bfill = backward fill: each missing value takes the next known value
# below it in the same column; trailing gaps have nothing to copy and
# stay NaN.
# DataFrame.bfill() replaces fillna(method="bfill"), whose `method`
# argument is deprecated since pandas 2.1.
print(ebola.bfill().iloc[:, 0:5].tail())

          Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
117  3/27/2014    5         103.0            8.0                6.0
118  3/26/2014    4          86.0            NaN                NaN
119  3/25/2014    3          86.0            NaN                NaN
120  3/24/2014    2          86.0            NaN                NaN
121  3/22/2014    0          49.0            NaN                NaN


In [35]:
# Fill gaps by interpolating between the neighbouring known values
# (interpolate() is linear by default).
interpolated = ebola.interpolate()
print(interpolated.iloc[:10, :5])

         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
0    1/5/2015  289        2776.0            NaN            10030.0
1    1/4/2015  288        2775.0            NaN             9780.0
2    1/3/2015  287        2769.0         8166.0             9722.0
3    1/2/2015  286        2749.5         8157.0             9677.5
4  12/31/2014  284        2730.0         8115.0             9633.0
5  12/28/2014  281        2706.0         8018.0             9446.0
6  12/27/2014  280        2695.0         7997.5             9409.0
7  12/24/2014  277        2630.0         7977.0             9203.0
8  12/21/2014  273        2597.0         7919.5             9004.0
9  12/20/2014  272        2571.0         7862.0             8939.0


In [36]:
# Dropping missing values: compare the shape before and after removing
# every row that contains at least one missing value.
print(ebola.shape)
complete_rows = ebola.dropna()
print(complete_rows.shape)

(122, 18)
(1, 18)


In [39]:
# Calculations with missing values: sum() skips NaN unless told otherwise
guinea_cases = ebola["Cases_Guinea"]
print(guinea_cases.sum(skipna=False))
print(guinea_cases.sum(skipna=True))  # default
print(guinea_cases.sum())

nan
84729.0
84729.0
