# 02d - Preprocess the `development` dataset - Imputations

__Goal__: Replace the previously detected outliers with missing values, then impute both these and the original missing values with suitable methods.

1. Read the `development` dataset `weather_dataset_raw_development_columns.pkl`;
2. Read the outliers `outlier_series_dict.pkl` of the numerical variables of the `development` dataset;
3. Replace the relevant outliers only, by missing values;
4. Impute all original and outlier-generated missing values with suitable methods:
- Impute numerical_variables with the `linear` interpolation method
- Impute `Weather` with the `nearest` interpolation method
5. Save the fully preprocessed `development` dataset as `clean_weather_dataset.pkl`, and remove `weather_dataset_raw_development_columns.pkl`;
6. Test the interpolation methods

### Import

In [None]:
import numpy as np
import os
import pandas as pd
import pickle
import seaborn as sns
from pathlib import Path

In [None]:
# Directory holding intermediate artifacts (the pickled outlier series is read from here).
artifacts_dir = Path('./artifacts')
# Directory holding the datasets that this notebook reads, writes and removes.
data_dir = Path('../data')

### Utilities

In [None]:
def convert_column_datatype_into_int(df, column_name):
    """Cast one column of ``df`` to ``int``, modifying ``df`` in place.

    Args:
        df: DataFrame whose column is converted.
        column_name: Label of the column to cast.

    Returns:
        None; the converted column is written back into ``df``.
    """
    as_integers = df[column_name].astype(int)
    df[column_name] = as_integers

In [None]:
def display_the_number_of_missing_values_per_variable(df):
    """Print, for every column of ``df``, how many of its values are missing."""
    missing_per_column = df.isna().sum()
    print(missing_per_column)

def display_the_overall_number_of_missing_values(df):
    """Print the total number of missing values found in ``df``."""
    missing_per_column = df.isna().sum()
    # Summing the per-column counts gives the count for the whole frame.
    total_missing = missing_per_column.sum()
    print(total_missing)

In [None]:
def display_the_rows_with_at_least_one_missing_value(df):
    """Display the rows of ``df`` in which one or more values are missing."""
    row_has_missing = df.isnull().any(axis=1)
    incomplete_labels = df.index[row_has_missing]
    display(df.loc[incomplete_labels])

def display_the_rows_with_fully_missing_values(df):
    """Display the rows of ``df`` in which every value is missing."""
    row_is_empty = df.isnull().all(axis=1)
    empty_labels = df.index[row_is_empty]
    display(df.loc[empty_labels])

In [None]:
def display_outlier_series(outlier_series_dict):
    """Print each entry of ``outlier_series_dict`` under an underlined title.

    Args:
        outlier_series_dict: Mapping from a variable name to the object
            holding that variable's outliers; each value is printed as-is.
    """
    for variable_name, outliers in outlier_series_dict.items():
        title = f'Timestamps of the outliers of "{variable_name}":'
        underline = "=" * len(title)
        print(title)
        print(underline + "\n")
        print(outliers, "\n")

In [None]:
def replace_outliers_by_NaNs(df, selected_outlier_series_keys, outlier_series_dict):
    """Return a copy of ``df`` whose selected outliers are set to ``NaN``.

    Args:
        df: Source DataFrame; it is left untouched.
        selected_outlier_series_keys: Column labels whose outliers are blanked.
            Each label must also be a key of ``outlier_series_dict``.
        outlier_series_dict: Mapping from a column label to an object whose
            ``.index`` holds the row labels of that column's outliers.

    Returns:
        A new DataFrame, equal to ``df`` except for the blanked cells.
    """
    masked_df = df.copy()
    for column in selected_outlier_series_keys:
        outlier_labels = outlier_series_dict[column].index
        masked_df.loc[outlier_labels, column] = np.nan
    return masked_df

In [None]:
def display_the_rows_with_missing_value_at_column(column_name, outlier_series_dictionary_of_df, df):
    """Display the rows of ``df`` located at the outlier labels of one variable.

    Args:
        column_name: Key of ``outlier_series_dictionary_of_df`` to look up.
        outlier_series_dictionary_of_df: Mapping from a variable name to an
            object whose ``.index`` holds the row labels of its outliers.
        df: DataFrame from which the rows are selected.
    """
    outlier_labels = outlier_series_dictionary_of_df[column_name].index
    display(df.loc[outlier_labels])

In [None]:
def impute_missing_numerical_variables_with_linear_interpolation(df, numerical_variables):
    """Return a copy of ``df`` with the given columns linearly interpolated.

    Args:
        df: Source DataFrame; it is left untouched.
        numerical_variables: Labels of the columns to interpolate.

    Returns:
        A new DataFrame in which each listed column has been replaced by its
        ``interpolate(method="linear")`` result. The listed columns end up
        after the remaining ones, in the order they were listed.
    """
    imputed_df = df.copy()
    for name in numerical_variables:
        # Pop then re-insert: the interpolated column is appended at the end
        # of the frame, which keeps the resulting column order.
        raw_column = imputed_df.pop(name)
        imputed_df[name] = raw_column.interpolate(method="linear")
    return imputed_df

In [None]:
def impute_missing_categorical_variables_with_nearest_interpolation(df, categorical_variables):
    """Return a copy of ``df`` with the given columns filled from nearest values.

    Args:
        df: Source DataFrame; it is left untouched.
        categorical_variables: Labels of the columns to interpolate.

    Returns:
        A new DataFrame in which each listed column has been replaced by its
        ``interpolate(method="nearest")`` result. The listed columns end up
        after the remaining ones, in the order they were listed.
    """
    imputed_df = df.copy()
    for name in categorical_variables:
        # Pop then re-insert: the interpolated column is appended at the end
        # of the frame, which keeps the resulting column order.
        raw_column = imputed_df.pop(name)
        imputed_df[name] = raw_column.interpolate(method="nearest")
    return imputed_df

# 1. Read the `development` dataset

In [None]:
# Load the raw `development` dataset; `df` is kept unmodified from here on.
df = pd.read_pickle(data_dir/'weather_dataset_raw_development_columns.pkl')
# Preview the first rows as the cell output.
df.head()

# 2. Read the outliers of the numerical variables of the `development` dataset 

In [None]:
# Load the pickled dictionary of outliers, keyed by variable name.
# NOTE(review): pickle.load can execute arbitrary code - only load artifacts
# that were produced by this project.
with open(artifacts_dir/'outlier_series_dict.pkl', 'rb') as f:
    outlier_series_dictionary = pickle.load(f)
    
display_outlier_series(outlier_series_dictionary)

# 3. Replace the relevant outliers only, by `NaN`s

In [None]:
# Only the outliers of these variables are replaced; those of the other
# variables are left in place.
relevant_outlier_series_keys = ["Humidity", "Pressure"] 
# `no_outlier_df` is a copy of `df` with the selected outliers set to NaN.
no_outlier_df = replace_outliers_by_NaNs(df, relevant_outlier_series_keys, outlier_series_dictionary)

In [None]:
# Visual check: rows at the "Humidity" outlier labels, after replacement by NaN.
display_the_rows_with_missing_value_at_column("Humidity", outlier_series_dictionary, no_outlier_df)

In [None]:
# Visual check: rows at the "Pressure" outlier labels, after replacement by NaN.
display_the_rows_with_missing_value_at_column("Pressure", outlier_series_dictionary, no_outlier_df)

# 4. Impute all original and outlier-generated `NaN`s with suitable methods

## A. Display the rows with fully missing values

In [None]:
# Rows of the outlier-free frame in which every variable is missing.
display_the_rows_with_fully_missing_values(no_outlier_df)

## B. Display the rows with at least one missing value

In [None]:
# Rows of the outlier-free frame in which at least one variable is missing.
display_the_rows_with_at_least_one_missing_value(no_outlier_df)

## C. Display the number of missing values per variable

In [None]:
# Count the NaNs on the frame that is about to be imputed, i.e. after the
# outliers were replaced, so that outlier-generated NaNs are included.
display_the_number_of_missing_values_per_variable(no_outlier_df)

## D. Display the overall number of missing values

In [None]:
# Total NaN count on the frame that is about to be imputed, i.e. after the
# outliers were replaced, so that outlier-generated NaNs are included.
display_the_overall_number_of_missing_values(no_outlier_df)

## E. Impute `numerical_variables` with the `linear` interpolation method

In [None]:
# Every column except the categorical "Weather" is treated as numerical.
numerical_variables = list(no_outlier_df.columns)
numerical_variables.remove("Weather")

# Impute the frame in which the outliers were replaced by NaNs. Imputing the
# raw `df` instead would leave the outliers in the saved dataset.
num_clean_df = impute_missing_numerical_variables_with_linear_interpolation(no_outlier_df, numerical_variables)
display_the_number_of_missing_values_per_variable(num_clean_df)

## F. Impute `Weather` with the `nearest` interpolation method

In [None]:
# "Weather" is the only variable imputed with the `nearest` method.
categorical_variables = ["Weather"]

# Start from the numerically imputed frame so that `clean_df` carries both imputations.
clean_df = impute_missing_categorical_variables_with_nearest_interpolation(num_clean_df, categorical_variables)
display_the_number_of_missing_values_per_variable(clean_df)

## G. Check the `NaN` absence in the `development` dataset

In [None]:
# Prints the total NaN count of the imputed frame; 0 means no missing value is left.
display_the_overall_number_of_missing_values(clean_df)

# 5. Save the fully preprocessed `development` dataset

In [None]:
# Persist the imputed dataset next to the raw one in `data_dir`.
clean_df.to_pickle(data_dir / 'clean_weather_dataset.pkl')

In [None]:
# Delete the input file read in step 1. NOTE: after this cell has run, the
# notebook cannot be re-run until that file is regenerated.
os.remove(data_dir/'weather_dataset_raw_development_columns.pkl')