# 02d - Preprocess the `development` dataset - Imputations

__Goal__: Replace the outliers detected previously by missing values. Impute them and the original ones with suited methods.

1. Read the `development` dataset `weather_dataset_raw_development_columns.pkl`;
2. Read the outliers `outlier_indices_dict.pkl` of the numerical variables of the `development` dataset;
3. Replace the relevant outliers only, by missing values;
4. Impute all original and  outlier-generated missing values with suited methods:
- Impute numerical_variables with the `linear` interpolation method
- Impute `Weather` with the `nearest` interpolation method
6. Save the fully preprocessed `develpment` dataset as `weather_dataset_development.pkl`, and remove `weather_dataset_raw_development_columns.pkl`; 
7. Test the interpolation methods

### Import

In [1]:
import numpy as np
import os
import pandas as pd
import pickle
import seaborn as sns
from pathlib import Path

In [2]:
artifacts_dir = Path('../artifacts')
data_dir = Path('../data')

### Utilities

In [3]:
def convert_column_datatype_into_int(df, column_name):
    df[column_name] = df[column_name].astype(int)

In [4]:
def display_the_number_of_missing_values_per_variable(df):
    print(df.isna().sum())

def display_the_overall_number_of_missing_values(df):
    print(df.isna().sum().sum())

In [5]:
def display_the_rows_with_at_least_one_missing_value(df):
    at_least_one_nan_index = df.index[df.isnull().any(axis=1)]
    display(df.loc[at_least_one_nan_index])

def display_the_rows_with_fully_missing_values(df):
    all_nan_index = df.index[df.isnull().all(axis=1)]
    display(df.loc[all_nan_index])

In [6]:
def display_outlier_series(outlier_series_dict):
    for variable_name in outlier_series_dict.keys():
        print(f'Timestamps of the outliers of "{variable_name}":')
        print("="*len(f'Timestamps of the outliers of "{variable_name}":')+"\n")  
        print(outlier_series_dict[variable_name], "\n")       

In [7]:
def replace_outliers_by_NaNs(df, selected_outlier_series_keys, outlier_series_dict):
    no_outlier_df = df.copy()
    for key in selected_outlier_series_keys:
        no_outlier_df.loc[outlier_series_dict[key].index, key] = np.nan
    return no_outlier_df

In [8]:
def display_the_rows_with_missing_value_at_column(column_name, outlier_series_dictionary_of_df, df):
    display(df.loc[outlier_series_dictionary_of_df[column_name].index])

In [9]:
def impute_missing_numerical_variables_with_linear_interpolation(df, numerical_variables):
    clean_df = df.copy()
    for variable in numerical_variables:
        clean_df[variable+"_linear"] = clean_df[variable].interpolate(method="linear")
    for variable in numerical_variables:
        clean_df.drop([variable], axis=1, inplace=True)
        clean_df.rename(columns={variable+"_linear": variable}, inplace=True)
    return clean_df

In [10]:
def impute_missing_categorical_variables_with_nearest_interpolation(df, categorical_variables):
    clean_df = df.copy()
    for variable in categorical_variables:
        clean_df[variable+"_nearest"] = clean_df[variable].interpolate(method="nearest")
    for variable in categorical_variables:
        clean_df.drop([variable], axis=1, inplace=True)
        clean_df.rename(columns={variable+"_nearest": variable}, inplace=True)
    return  clean_df

# 1. Read the `development` dataset

In [11]:
df = pd.read_pickle(data_dir/'weather_dataset_raw_development_columns.pkl')
df.head()

Unnamed: 0,Temperature,Humidity,Wind_speed,Wind_bearing,Visibility,Pressure,Weather
2006-01-01 00:00:00+00:00,1.161111,0.85,16.6152,139,9.9015,1016.15,0.0
2006-01-01 01:00:00+00:00,1.666667,0.82,20.2538,140,9.9015,1015.87,0.0
2006-01-01 02:00:00+00:00,1.711111,0.82,14.49,140,9.9015,1015.56,0.0
2006-01-01 03:00:00+00:00,1.183333,0.86,13.9426,134,9.9015,1014.98,0.0
2006-01-01 04:00:00+00:00,1.205556,0.85,15.9068,149,9.982,1014.08,0.0


# 2. Read the outliers of the numerical variables of the `development` dataset 

In [12]:
with open(artifacts_dir/'outlier_series_dict.pkl', 'rb') as f:
    outlier_series_dictionary = pickle.load(f)
    
display_outlier_series(outlier_series_dictionary)

Timestamps of the outliers of "Temperature":

Series([], Freq: H, Name: Temperature, dtype: float64) 

Timestamps of the outliers of "Humidity":

2008-02-17 13:00:00+00:00    0.0
2009-01-25 10:00:00+00:00    0.0
2009-12-20 20:00:00+00:00    0.0
2009-12-20 22:00:00+00:00    0.0
2009-12-21 05:00:00+00:00    0.0
Name: Humidity, dtype: float64 

Timestamps of the outliers of "Wind_speed":

2006-01-22 16:00:00+00:00    33.2304
2006-01-22 18:00:00+00:00    33.4880
2006-01-22 21:00:00+00:00    32.6025
2006-01-22 23:00:00+00:00    31.5721
2006-03-04 04:00:00+00:00    38.7044
                              ...   
2010-12-10 03:00:00+00:00    38.0121
2010-12-10 08:00:00+00:00    34.1481
2010-12-10 09:00:00+00:00    32.8923
2010-12-10 11:00:00+00:00    32.8440
2010-12-10 12:00:00+00:00    31.7653
Name: Wind_speed, Length: 635, dtype: float64 

Timestamps of the outliers of "Wind_bearing":

Series([], Freq: H, Name: Wind_bearing, dtype: int64) 

Timestamps of the outliers of "Visibility":

2006-01-

# 3. Replace the relevant outliers only, by `NaN`s

In [13]:
relevant_outlier_series_keys = ["Humidity", "Pressure"] 
no_outlier_df = replace_outliers_by_NaNs(df, relevant_outlier_series_keys, outlier_series_dictionary)

In [14]:
display_the_rows_with_missing_value_at_column("Humidity", outlier_series_dictionary, no_outlier_df)

Unnamed: 0,Temperature,Humidity,Wind_speed,Wind_bearing,Visibility,Pressure,Weather
2008-02-17 13:00:00+00:00,-1.111111,,4.4275,12,9.982,,1.0
2009-01-25 10:00:00+00:00,1.111111,,20.93,350,0.161,1000.1,0.0
2009-12-20 20:00:00+00:00,-15.0,,3.22,250,9.982,1015.1,1.0
2009-12-20 22:00:00+00:00,-15.555556,,6.44,160,9.982,1015.9,1.0
2009-12-21 05:00:00+00:00,-13.888889,,22.54,160,9.982,1016.8,1.0


In [15]:
display_the_rows_with_missing_value_at_column("Pressure", outlier_series_dictionary, no_outlier_df)

Unnamed: 0,Temperature,Humidity,Wind_speed,Wind_bearing,Visibility,Pressure,Weather
2006-01-04 10:00:00+00:00,2.250000,0.92,12.2038,13,11.1251,,0.0
2006-01-05 17:00:00+00:00,3.938889,0.92,7.9373,324,6.1985,,0.0
2006-01-05 19:00:00+00:00,2.827778,1.00,8.1305,344,4.2021,,0.0
2006-01-10 13:00:00+00:00,2.927778,0.60,7.8085,301,9.9820,,0.0
2006-01-10 14:00:00+00:00,2.877778,0.61,4.6046,320,9.9820,,0.0
...,...,...,...,...,...,...,...
2010-12-13 10:00:00+00:00,1.111111,0.78,22.3146,320,11.2700,,0.0
2010-12-13 11:00:00+00:00,1.111111,0.75,17.6134,329,11.2700,,0.0
2010-12-13 13:00:00+00:00,0.050000,0.91,7.8246,35,11.2700,,0.0
2010-12-13 14:00:00+00:00,0.000000,0.91,7.4543,54,11.2056,,1.0


# 4. Impute all original and  outlier-generated `NaN`s with suited methods

## A. Display the rows with fully missing values

In [16]:
display_the_rows_with_fully_missing_values(no_outlier_df)

Unnamed: 0,Temperature,Humidity,Wind_speed,Wind_bearing,Visibility,Pressure,Weather


## B. Display the rows with at least one missing value

In [17]:
display_the_rows_with_at_least_one_missing_value(no_outlier_df)

Unnamed: 0,Temperature,Humidity,Wind_speed,Wind_bearing,Visibility,Pressure,Weather
2006-01-04 10:00:00+00:00,2.250000,0.92,12.2038,13,11.1251,,0.0
2006-01-05 17:00:00+00:00,3.938889,0.92,7.9373,324,6.1985,,0.0
2006-01-05 19:00:00+00:00,2.827778,1.00,8.1305,344,4.2021,,0.0
2006-01-10 13:00:00+00:00,2.927778,0.60,7.8085,301,9.9820,,0.0
2006-01-10 14:00:00+00:00,2.877778,0.61,4.6046,320,9.9820,,0.0
...,...,...,...,...,...,...,...
2010-12-13 10:00:00+00:00,1.111111,0.78,22.3146,320,11.2700,,0.0
2010-12-13 11:00:00+00:00,1.111111,0.75,17.6134,329,11.2700,,0.0
2010-12-13 13:00:00+00:00,0.050000,0.91,7.8246,35,11.2700,,0.0
2010-12-13 14:00:00+00:00,0.000000,0.91,7.4543,54,11.2056,,1.0


## C. Display the number of missing values per variable

In [18]:
display_the_number_of_missing_values_per_variable(df)

Temperature     0
Humidity        0
Wind_speed      0
Wind_bearing    0
Visibility      0
Pressure        0
Weather         5
dtype: int64


## D. Display the overall number of missing  values

In [19]:
display_the_overall_number_of_missing_values(df)

5


## E. Impute `numerical_variables` with the `linear` interpolation method

In [20]:
numerical_variables = list(df.columns); numerical_variables.remove("Weather")

num_clean_df = impute_missing_numerical_variables_with_linear_interpolation(df, numerical_variables)
display_the_number_of_missing_values_per_variable(num_clean_df)

Weather         5
Temperature     0
Humidity        0
Wind_speed      0
Wind_bearing    0
Visibility      0
Pressure        0
dtype: int64


## F. Impute `Weather` with the `nearest` interpolation method

In [21]:
categorical_variables = ["Weather"]

clean_df = impute_missing_categorical_variables_with_nearest_interpolation(num_clean_df, categorical_variables)
display_the_number_of_missing_values_per_variable(clean_df)

Temperature     0
Humidity        0
Wind_speed      0
Wind_bearing    0
Visibility      0
Pressure        0
Weather         0
dtype: int64


## G. Check the `NaN` absence in the `development` dataset

In [22]:
display_the_overall_number_of_missing_values(clean_df)

0


# 5. Save the fully preprocessed `develpment` dataset

In [23]:
clean_df.to_pickle(data_dir / 'clean_weather_dataset.pkl')

In [24]:
os.remove(data_dir/'weather_dataset_raw_development_columns.pkl')