# Imputation

In [None]:
import numpy as np
import pandas as pd

### Diabetes dataset

In [None]:
# Load the diabetes data from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="diabetes.csv")
df.head(n=5)

In [None]:
# Column names, dtypes and non-null counts for the loaded DataFrame.
df.info()

## Explore the bad values

In [None]:
# Note the min values: a minimum of 0 in a measurement column is a hint that
# zeros are standing in for missing readings (this notebook treats the
# zeros in "insulin" that way).

df.describe()

In [None]:
# Summary statistics for the raw insulin column, zeros still included.
df["insulin"].describe()

In [None]:
# First 10 rows of the raw insulin column, to eyeball the values before cleaning

df["insulin"].head(10)

---

# The benefit of setting bad values to NaN for imputation

## Set bad "insulin" values to NaN

In [None]:
# Replace the zeros with np.nan (Not a Number).
# The column is rebuilt with Series.replace rather than written through .loc:
# if "insulin" was read as an integer column (likely, TODO confirm with
# df.info()), assigning NaN into it in place is an incompatible-dtype write
# that recent pandas versions warn about. replace() returns a new float
# column, so the int -> float upcast is explicit and warning-free.

df["insulin"] = df["insulin"].replace(0, np.nan)

df["insulin"].head(10)

In [None]:
# describe() skips NaN entries, so count/mean/min now reflect only the
# readings that were not flagged as bad.
df["insulin"].describe()

# Impute the bad values

In [None]:
# Fill every missing (NaN) insulin reading with the column mean.
# mean() skips NaN, so the average is taken over the valid readings only.

df["insulin"] = df["insulin"].fillna(df["insulin"].mean())

In [None]:
# First 10 rows again: the former NaN entries now hold the column mean.
df["insulin"].head(10)

---

# More Refined Imputation

#### Reset the data

In [None]:
# Reload the CSV so the mean imputation applied earlier is discarded.
df = pd.read_csv("diabetes.csv")

## Set missing values to NaN

In [None]:
# Replace the zeros with np.nan (Not a Number).
# Series.replace returns a new float column; writing NaN through .loc into an
# integer column (likely the dtype here, TODO confirm with df.info()) is an
# incompatible-dtype assignment that recent pandas versions warn about.

df["insulin"] = df["insulin"].replace(0, np.nan)

# First 10 rows
df["insulin"].head(10)

## Evaluate age

In [None]:
# Approximately 75% of individuals are under 40 years old
# (read this off the 75% row of the output; it motivates the age-40 split used below)

df['age'].describe()

## Obtain mean insulin value by age range

In [None]:
# Average insulin for people aged 40 or younger (NaN readings are skipped)

df.loc[df["age"].le(40), "insulin"].mean()

In [None]:
# Average insulin for people older than 40 (NaN readings are skipped)

df.loc[df["age"].gt(40), "insulin"].mean()

## Impute the bad values

In [None]:
# Fill missing insulin values for people aged 40 or younger with that age
# group's own mean. The right-hand side is evaluated first, and mean() skips
# the NaN entries, so the fill value comes from valid readings only.

df.loc[df["age"].le(40) & df["insulin"].isna(), "insulin"] = (
    df.loc[df["age"].le(40), "insulin"].mean()
)

In [None]:
# Fill missing insulin values for people older than 40 with that age group's
# own mean. The right-hand side is evaluated first, and mean() skips the NaN
# entries, so the fill value comes from valid readings only.

df.loc[df["age"].gt(40) & df["insulin"].isna(), "insulin"] = (
    df.loc[df["age"].gt(40), "insulin"].mean()
)

#### View the imputation

In [None]:
# Show age next to insulin so the group-specific fill values can be compared.
df[["age", "insulin"]].head(10)

---

---

# Imputation using sklearn's SimpleImputer

In [None]:
from sklearn.impute import SimpleImputer

# Start again from the raw file: the manual age-group imputation earlier in
# the notebook has already filled the NaN insulin values, so without a reset
# the imputer below would have nothing left to replace.
df = pd.read_csv("diabetes.csv")
df["insulin"] = df["insulin"].replace(0, np.nan)

# Set what determines a bad/missing value, set how to replace that value
fill_nan = SimpleImputer(missing_values=np.nan, strategy="mean")

In [None]:
# See default values: evaluating the bare class name displays it, and in
# Jupyter the Shift+Tab tooltip on the name shows its signature and defaults.

SimpleImputer # use shift+tab

### Impute missing insulin values for samples under 41 years old

In [None]:
# "fit" calculates the replacement value, "transform" replaces the bad values with the replacement value.
# fit_transform needs 2-D input (hence the reshape) and returns a 2-D
# (rows x 1) array, so flatten it with ravel() before writing it back into
# the 1-D "insulin" column selection; the shapes then match explicitly.

df.loc[df["age"] <= 40, "insulin"] = fill_nan.fit_transform(
    df.loc[df["age"] <= 40, "insulin"].values.reshape(-1, 1)
).ravel()

### Impute missing insulin values for samples over 40 years old

In [None]:
# Refit on the over-40 group so its own mean is used as the fill value.
# fit_transform returns a 2-D (rows x 1) array; ravel() flattens it so it
# matches the 1-D "insulin" column selection being assigned.
df.loc[df["age"] > 40, "insulin"] = fill_nan.fit_transform(
    df.loc[df["age"] > 40, "insulin"].values.reshape(-1, 1)
).ravel()

#### View the imputation

In [None]:
# Show age next to insulin to inspect the values after the SimpleImputer step.
df[["age", "insulin"]].head(10)

---

## Explanation of the imputation's reshape()

In [None]:
# Returns a Series (the insulin values of the over-40 rows, with their index labels)

df.loc[df["age"] > 40, "insulin"]

In [None]:
# Returns an array of numbers (1-dimensional): .values drops the index and
# leaves just the underlying NumPy array

df.loc[df["age"] > 40, "insulin"].values

In [None]:
# Reshaped to be 2-dimensional (rows x columns): -1 lets NumPy infer the
# number of rows and 1 fixes a single column, the layout that
# SimpleImputer.fit_transform is given in the imputation cells

df.loc[df["age"] > 40, "insulin"].values.reshape(-1,1)

---