# Missing values

### Importação de bibliotecas

In [None]:
import pandas as pd
import numpy as np

In [None]:
import warnings
# Suppress every warning category for the remainder of the session.
# NOTE(review): this blanket filter also hides deprecation notices raised by
# pandas/NumPy calls further down -- consider narrowing it.
warnings.filterwarnings('ignore')

### Trabalhando com os dados

In [None]:
# Base frame: 5 rows x 3 columns of standard-normal noise. The row labels
# deliberately skip letters so that a later reindex introduces missing rows.
df = pd.DataFrame(
    data=np.random.randn(5, 3),
    index=list("acefh"),
    columns=["one", "two", "three"],
)

In [None]:
# Add a constant string column; the scalar is broadcast to every row.
df["four"] = "bar"

In [None]:
# Boolean column flagging the rows whose "one" value is positive.
df["five"] = df["one"] > 0
df

In [None]:
# Reindex onto the full a..h label range; labels absent from df (b, d, g)
# come back as rows filled with missing values.
df2 = df.reindex(list("abcdefgh"))
df2

In [None]:
# Column "one" after the reindex: the newly introduced labels hold NaN.
df2["one"]

In [None]:
# Element-wise missing-value mask for column "one" (True where missing).
df2["one"].isna()

In [None]:
# Element-wise mask that is True where column "four" holds a value.
pd.notna(df2["four"])

In [None]:
# Missing-value mask for the whole frame, one boolean per cell.
pd.isna(df2)

In [None]:
# Demonstration only: NaN never compares equal to anything, itself included,
# so this mask is False everywhere -- even on the missing rows. Use
# isna()/notna() to detect missing values instead of ==.
df2["one"] == np.nan

### Integer dtypes and missing data

In [None]:
# Nullable integer dtype: the values stay integers and the gap is stored as
# <NA> rather than forcing an upcast to float.
pd.Series([1, 2, np.nan, 4], dtype="Int64")

### Datetimes

In [None]:
# Work on an independent copy so the edits below leave df untouched.
df2 = df.copy()

In [None]:
# Add a datetime column; the single Timestamp is broadcast to every row.
df2["timestamp"] = pd.Timestamp("20120101")
df2

In [None]:
# Blank out rows a, c and h in one float column and one datetime column;
# pandas shows the missing datetimes as NaT.
df2.loc[["a", "c", "h"], ["one", "timestamp"]] = np.nan
df2

In [None]:
# Count how many columns share each dtype.
df2.dtypes.value_counts()

### Inserting missing data

In [None]:
# Small integer Series used to show how inserting a missing value changes it.
s = pd.Series([1, 2, 3])

In [None]:
# Assigning None into a numeric Series stores NaN, which upcasts the
# integer data to float.
s.loc[0] = None
s

In [None]:
# String Series for the same experiment on non-numeric data.
s = pd.Series(["a", "b", "c"])

In [None]:
# Insert both flavours of missing marker. With object dtype each is kept as
# given. NOTE(review): the result may differ under pandas' dedicated string
# dtype -- confirm on the target version.
s.loc[0] = None
s.loc[1] = np.nan
s

### Calculations with missing data

In [None]:
# Show the frame (with its NaN/NaT entries) used by the calculations below.
df2

In [None]:
# Reductions skip missing values by default, so the NaN rows add nothing.
df2["one"].sum()

In [None]:
# Row-wise mean, skipping NaN. df2 also holds string and datetime columns,
# which cannot be averaged with floats: restrict the reduction to
# numeric/boolean columns (pandas >= 2.0 no longer drops the others silently)
# and name the axis instead of passing it positionally.
df2.mean(axis=1, numeric_only=True)

In [None]:
# Cumulative sum of the three float columns. They are selected *before* the
# reduction so cumsum never touches the string/datetime columns, which do not
# support it. Missing entries are skipped and stay NaN in place.
df2.iloc[:, 0:3].cumsum()

In [None]:
# Same cumulative sum with skipna=False: once a NaN is met, every later value
# in that column is NaN too. The float columns are selected first so the
# string/datetime columns are never passed to cumsum.
df2.iloc[:, 0:3].cumsum(skipna=False)

### Sum/prod of empties/nans

##### The sum of an empty or all-NA Series or column of a DataFrame is 0.

In [None]:
# Sum of an all-NaN Series: the NaN is skipped, leaving the empty sum.
pd.Series([np.nan]).sum()

In [None]:
# Sum of an empty Series (dtype given explicitly, as there is no data to
# infer it from).
pd.Series([], dtype="float64").sum()

##### The product of an empty or all-NA Series or column of a DataFrame is 1.

In [None]:
# Product of an all-NaN Series: the NaN is skipped, leaving the empty product.
pd.Series([np.nan]).prod()

In [None]:
# Product of an empty Series (dtype given explicitly).
pd.Series([], dtype="float64").prod()

### NA values in GroupBy

In [None]:
# Show the frame that is grouped below; note the NaN entries in "one".
df2

In [None]:
# Group by "one": rows whose key is NaN are excluded from the groups by
# default. numeric_only=True keeps the string and datetime columns out of the
# mean, which pandas >= 2.0 would otherwise reject with a TypeError.
df2.groupby("one").mean(numeric_only=True)

### Filling missing values: fillna

In [None]:
# Replace every missing value in the frame with the scalar 0 (returns a new
# frame; df2 itself is unchanged).
df2.fillna(0)

In [None]:
# Fill a float column with a string placeholder; the result mixes floats and
# strings, so it can no longer be a float column.
df2["one"].fillna("missing")

In [None]:
# df2 is unchanged: the fillna calls above returned new objects.
df2

In [None]:
# Forward-fill: propagate the last valid observation down each column.
# ffill() replaces the deprecated fillna(method="pad") spelling. A gap in the
# first row stays missing because nothing precedes it.
df2.ffill()

In [None]:
# Forward-fill at most one consecutive missing value per gap.
# ffill(limit=1) replaces the deprecated fillna(method="pad", limit=1).
df2.ffill(limit=1)

### Filling with a PandasObject

In [None]:
# 10 x 3 frame of standard-normal noise with columns A, B, C.
dff = pd.DataFrame(data=np.random.randn(10, 3), columns=["A", "B", "C"])

In [None]:
# Punch staggered holes, one run per column (iloc slices exclude the end):
# rows 3-4 of A, rows 4-5 of B, rows 5-7 of C.
dff.iloc[3:5, 0] = np.nan
dff.iloc[4:6, 1] = np.nan
dff.iloc[5:8, 2] = np.nan
dff

In [None]:
# Fill each column's gaps with that column's mean; the Series of means is
# aligned to the frame by column label.
dff.fillna(dff.mean())

In [None]:
# Fill only columns B and C (label slices include both ends); column A has no
# entry in the fill Series, so its gaps remain.
dff.fillna(dff.mean()["B":"C"])

In [None]:
# Equivalent spelling via where(): keep each value that is present, otherwise
# take the column mean, aligning the means Series along the columns.
dff.where(pd.notna(dff), dff.mean(), axis="columns")

### Dropping axis labels with missing data: dropna

In [None]:
# Overwrite the whole first column with NaN to set up the dropna examples.
df.iloc[:,0] = np.nan
df

In [None]:
# Drop rows containing any missing value. Every row has NaN in the first
# column, so nothing survives.
df.dropna(axis=0)

In [None]:
# Drop columns containing any missing value; only the all-NaN first column
# is removed.
df.dropna(axis=1)

### Interpolation

In [None]:
# Two short numeric columns with interior gaps, used to compare
# interpolation methods.
df = pd.DataFrame(
    dict(
        A=[1, 2.1, np.nan, 4.7, 5.6, 6.8],
        B=[0.25, np.nan, np.nan, 4, 12.2, 14.4],
    )
)

In [None]:
# Show the frame with its gaps before interpolating.
df

In [None]:
# Default interpolation is linear, treating the values as equally spaced.
df.interpolate()

In [None]:
# Barycentric interpolation (delegated to SciPy, which must be installed).
df.interpolate(method="barycentric")

In [None]:
# PCHIP: piecewise cubic Hermite interpolation (requires SciPy).
df.interpolate(method="pchip")

In [None]:
# Akima interpolation (requires SciPy).
df.interpolate(method="akima")

In [None]:
# Spline interpolation; `order` is mandatory for this method (requires SciPy).
df.interpolate(method="spline", order=2)

In [None]:
# Polynomial interpolation; `order` is mandatory for this method (requires
# SciPy).
df.interpolate(method="polynomial", order=2)

##### Methods comparison

In [None]:
import matplotlib.pyplot as plt

# Fix the seed so the noisy curve, and therefore the plot, is reproducible.
np.random.seed(2)
# Noisy quadratic: np.arange(1, 10.1, 0.25) yields 37 points, matching randn(37).
ser = pd.Series(np.arange(1, 10.1, 0.25) ** 2 + np.random.randn(37))
# Knock out isolated points and one long run (13-18) to stress the methods.
missing = np.array([4, 13, 14, 15, 16, 17, 18, 20, 29])
ser[missing] = np.nan
methods = ["linear", "quadratic", "cubic"]
# One column per method so the filled curves can be overlaid in one plot.
# "quadratic" and "cubic" are delegated to SciPy.
df = pd.DataFrame({m: ser.interpolate(method=m) for m in methods})
df.plot()
plt.show()

In [None]:
# Interpolate at new positions: extend the integer index with fractional
# labels, reindex (the new labels start as NaN), then fill them with PCHIP.
ser = pd.Series(np.sort(np.random.uniform(size=100)))
new_index = ser.index.union(pd.Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]))
interp_s = ser.reindex(new_index).interpolate(method="pchip")
# The index is now float, so slice by label explicitly with .loc; plain
# obj[i:j] with integer bounds on a float index is deprecated and ambiguous.
interp_s.loc[49:51]

##### Interpolation limits

In [None]:
# Series with a leading gap (2 NaN), an interior gap (3 NaN) and a trailing
# gap (2 NaN) around the valid values 5 and 13.
ser = pd.Series([np.nan] * 2 + [5] + [np.nan] * 3 + [13] + [np.nan] * 2)
ser

In [None]:
# Fill all consecutive values in a forward direction; the leading NaNs stay
# because nothing precedes them.
ser.interpolate()

In [None]:
 ser.interpolate(limit=1) # fill one consecutive value in a forward direction

In [None]:
# Fill at most one consecutive value per gap, working backwards from the
# valid value that follows it.
ser.interpolate(limit=1, limit_direction="backward")

In [None]:
# Fill at most one consecutive value on each side of every gap (both
# directions).
ser.interpolate(limit=1, limit_direction="both")

In [None]:
# Fill all consecutive values in both directions, leaving no NaN.
ser.interpolate(limit_direction="both")

In [None]:
# Fill one consecutive value in both directions, but only inside gaps that
# are surrounded by valid values; leading/trailing NaNs are left alone.
ser.interpolate(limit_direction="both", limit_area="inside", limit=1)

In [None]:
# Fill all consecutive outside values backward: only NaNs outside the valid
# range that have a valid value after them (the leading gap) are filled.
ser.interpolate(limit_direction="backward", limit_area="outside")

In [None]:
# Fill all consecutive outside values in both directions; the interior gap is
# left untouched.
ser.interpolate(limit_direction="both", limit_area="outside")

### Replacing generic values

In [None]:
# Float Series 0.0 .. 4.0 used for the value-replacement examples.
ser = pd.Series(np.arange(5, dtype="float64"))

In [None]:
# Replace a single value with another.
ser.replace(0, 5)

In [None]:
# Replace a list of values position by position (0 -> 4, 1 -> 3, ...).
ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])

In [None]:
# Replace through an {old: new} mapping.
ser.replace({0: 10, 1: 100})

In [None]:
# Two integer columns: a = 0..4 and b = 5..9.
df = pd.DataFrame({"a": list(range(5)), "b": list(range(5, 10))})

In [None]:
# Per-column targets: replace 0 in column "a" and 5 in column "b" with 100.
df.replace({"a": 0, "b": 5}, 100)

In [None]:
# Replace 1, 2 and 3 with the previous remaining value: mask them out, then
# forward-fill. This replaces the deprecated replace(..., method="pad")
# spelling. The two match here because ser has no NaN of its own, which
# ffill would fill as well.
ser.mask(ser.isin([1, 2, 3])).ffill()

### String/regular expression replacement

In [None]:
# Raw columns for the regex-replacement examples: integers, strings holding
# literal dots, and strings with one missing entry.
d = dict(
    a=[0, 1, 2, 3],
    b=["a", "b", ".", "."],
    c=["a", "b", np.nan, "d"],
)

In [None]:
# Build the frame used by the string/regex replacement examples.
df = pd.DataFrame(d)
df

In [None]:
# Replace a literal dot, with optional surrounding whitespace, by NaN.
df.replace(r"\s*\.\s*", np.nan, regex=True)

In [None]:
# Literal (non-regex) list replacement: "a" -> "b" and "." -> NaN.
df.replace(["a", "."], ["b", np.nan])

In [None]:
# Regex list replacement: a dot becomes "dot"; "a" is captured and re-emitted
# with "stuff" appended through the \1 backreference.
df.replace([r"\.", r"(a)"], ["dot", r"\1stuff"], regex=True)

In [None]:
# Restrict the search to column "b": the literal "." becomes NaN there.
df.replace({"b": "."}, {"b": np.nan})

In [None]:
# Same as the literal version, but the column "b" pattern is a regex.
df.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, regex=True)

In [None]:
# Nested dict {column: {pattern: replacement}}: in column "b", replace
# matches of "b" with the empty string.
df.replace({"b": {"b": r""}}, regex=True)

In [None]:
# The nested dict can also be passed through the `regex` keyword.
df.replace(regex={"b": {r"\s*\.\s*": np.nan}})

In [None]:
# Reuse the captured dot in the replacement: "." becomes ".ty" in column "b".
df.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, regex=True)

In [None]:
# Several patterns sharing one replacement: dots and "a"/"b" become NaN.
df.replace([r"\s*\.\s*", r"a|b"], np.nan, regex=True)

In [None]:
# Keyword form of the previous call: patterns via `regex`, replacement via
# `value`.
df.replace(regex=[r"\s*\.\s*", r"a|b"], value=np.nan)

### Numeric replacement

In [None]:
# 10 x 2 frame of standard-normal noise for the numeric replacement examples.
df = pd.DataFrame(np.random.randn(10, 2))
df

In [None]:
# Overwrite a random subset of rows (one random draw per row, kept when
# > 0.5) with the sentinel value 1.5 in every column.
df[np.random.rand(df.shape[0]) > 0.5] = 1.5
df

In [None]:
# Turn the sentinel into NaN (returns a new frame; df is unchanged).
df.replace(1.5, np.nan)

In [None]:
# Remember the top-left value so it can be targeted by the next replace.
df00 = df.iloc[0, 0]

In [None]:
# Replace several values at once: the sentinel becomes NaN and the
# remembered top-left value becomes the string "a".
df.replace([1.5, df00], [np.nan, "a"])

In [None]:
 df.replace(1.5, np.nan, inplace=True)

### Missing data casting rules and indexing

In [None]:
# Five random values on a gappy integer index (1, 3 and 5 are absent).
s = pd.Series(data=np.random.randn(5), index=[0, 2, 4, 6, 7])

In [None]:
# Boolean mask of the positive entries.
s > 0

In [None]:
# Reindex the mask onto 0..7; labels absent from s (1, 3, 5) become missing.
crit = (s > 0).reindex(list(range(8)))
crit

In [None]:
# A plain bool column cannot hold NaN, so the reindexed mask is no longer
# bool dtype.
crit.dtype

In [None]:
# Values on the full 0..7 index, with 0 for the labels that were missing.
reindexed = s.reindex(list(range(8))).fillna(0)

In [None]:
# A mask with missing entries cannot be used for indexing as-is; treating
# missing as False excludes those rows.
reindexed[crit.fillna(False)]

In [None]:
# Treating missing as True instead includes those rows.
reindexed[crit.fillna(True)]