In [None]:
# Version:06.09.2023
import os
import requests
import datetime
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import mahalanobis
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# This HouseZero_Data_Processing_Code.py file was generated on 2023-06-09 by Jung Min Han of CGBC

# Modify the following directory path for your local drive
FILEPATH = r"C:\Users\ln\Desktop\HouseZeroData"
# Build the CSV path with os.path.join so a separator is inserted between the
# directory and the file name; plain string concatenation would produce
# "...HouseZeroDataSlab_temp_Year1.csv", which does not exist.
df_outliers = pd.read_csv(os.path.join(FILEPATH, "Slab_temp_Year1.csv"))

def calculate_covariance_matrix(data):
    """
    Computes the covariance matrix of a set of observations.

    Args:
        data (array-like): The input data. It is passed to ``np.cov`` with ``rowvar=False``, so each column is
                           treated as a variable and each row as an observation.

    Returns:
        cov_matrix (ndarray): The covariance matrix of the data. For 2-D input with N columns the matrix has
                              shape (N, N); element (i, j) is the covariance between column i and column j.
    """
    cov_matrix = np.cov(data, rowvar=False)
    return cov_matrix

def calculate_inverse_covariance_matrix(cov_matrix):
    """
    Inverts a covariance matrix.

    Args:
        cov_matrix (ndarray): The square covariance matrix to invert. It must be invertible;
                              ``np.linalg.inv`` raises ``numpy.linalg.LinAlgError`` for a singular matrix.

    Returns:
        inverse_cov_matrix (ndarray): The matrix inverse of ``cov_matrix``, with the same shape as the input.
    """
    inverse_cov_matrix = np.linalg.inv(cov_matrix)
    return inverse_cov_matrix

def calculate_mahalanobis_distance(x, mean, inv_cov):
    """
    Calculates the Mahalanobis distance between a data point and a distribution with given mean and inverse covariance matrix.

    Components of ``x - mean`` that are NaN or infinite are replaced by the average of the finite components
    before the distance is evaluated. If no component is finite, the result is NaN.

    Args:
        x (array-like): The data point for which to calculate the Mahalanobis distance.
        mean (array-like): The mean of the distribution.
        inv_cov (ndarray): The inverse covariance matrix of the distribution.

    Returns:
        distance (float): The Mahalanobis distance between the data point and the distribution.
    """
    # Work in float so the fill value is not truncated when the inputs are integer-typed.
    x_minus_mean = np.asarray(x - mean, dtype=float)
    finite = np.isfinite(x_minus_mean)
    # Average over finite entries only: a mean that still includes +/-inf would itself be
    # infinite (or NaN) and the "repaired" entries would stay invalid.
    fill_value = x_minus_mean[finite].mean() if finite.any() else np.nan
    x_minus_mean = np.where(finite, x_minus_mean, fill_value)
    return np.sqrt(np.dot(np.dot(x_minus_mean, inv_cov), x_minus_mean))

def calculate_z_score(x, data):
    """
    Computes how many standard deviations a value lies from the mean of a data set.

    Args:
        x (float or array-like): The value(s) to standardize.
        data (array-like): The data whose mean (``np.mean``) and standard deviation (``np.std``) define the scale.

    Returns:
        z_score (float or array-like): ``(x - mean) / std`` for the given value(s).
    """
    center = np.mean(data)
    spread = np.std(data)
    return (x - center) / spread

def filter_outliers_z_score(data, threshold=3):
    """
    Removes values whose Z-score magnitude exceeds a threshold.

    Args:
        data (array-like): The input data; it must support boolean-mask indexing (e.g. a NumPy array or pandas Series).
        threshold (float): Values with an absolute Z-score greater than this are treated as outliers.

    Returns:
        filtered_data (array-like): The entries of ``data`` whose absolute Z-score is at most ``threshold``.
    """
    # Standardize every value against the data set's own mean and standard deviation.
    keep = abs(calculate_z_score(data, data)) <= threshold
    return data[keep]

def filter_outliers_diff(data, res=3):
    """
    Filters outliers from the data based on the absolute difference between consecutive rows.

    Args:
        data (pandas.Series or pandas.DataFrame): The input data.
        res (float): The threshold value to define outliers. Rows whose absolute difference from the previous row
                     is greater than res (in any column, for a DataFrame) are considered outliers.

    Returns:
        filtered_data (pandas.Series or pandas.DataFrame): The data with outlier rows removed and any remaining
                                                           missing values linearly interpolated.
    """
    # True wherever a value jumps by more than res relative to the previous row.
    jumps = data.diff().abs() > res
    if jumps.ndim > 1:
        # Indexing a DataFrame with a boolean DataFrame masks values instead of selecting rows,
        # so collapse the mask to one flag per row: a row is an outlier if any column jumps.
        jumps = jumps.any(axis=1)
    # Select by boolean mask rather than dropping by label, so duplicate index labels are safe.
    filtered_data = data[~jumps].interpolate()
    return filtered_data

def filter_outliers_mahalanobis_distance(data, mean, inv_cov, threshold=3):
    """
    Filters outliers from the data based on Mahalanobis distance.

    Args:
        data (pandas.DataFrame): The input data; each row is one observation.
        mean (array-like): The mean of the distribution.
        inv_cov (ndarray): The inverse covariance matrix of the distribution.
        threshold (float): The threshold value to define outliers. Data points with Mahalanobis distance greater than the threshold are considered outliers.

    Returns:
        filtered_data (pandas.DataFrame): The rows of ``data`` whose Mahalanobis distance is at most ``threshold``.
    """
    # Collect the distances in an ndarray: comparing a plain Python list with a number
    # raises TypeError, whereas an array comparison yields the row mask we need.
    dist = np.asarray(
        [calculate_mahalanobis_distance(row, mean, inv_cov) for _, row in data.iterrows()],
        dtype=float,
    )
    filtered_data = data[dist <= threshold]
    return filtered_data

def plot_slab_temp_with_outliers(data, zone_col, threshold=3):
    """
    Plots the slab temperature data for a specific zone column and highlights its outliers.

    A point is flagged as an outlier when it lies more than ``threshold`` standard deviations from the column
    mean. For a single variable this is the Mahalanobis distance, ``|x - mean| / std``, so the function needs
    only the column itself and no externally supplied covariance matrix.

    Args:
        data (pandas.DataFrame): The input data.
        zone_col (str): The column representing the zone for which to plot the slab temperature.
        threshold (float): Distance (in standard deviations) above which a point is marked as an outlier.

    Returns:
        None
    """
    values = data[zone_col]
    mean = np.mean(values)
    std = np.std(values)

    # One-dimensional Mahalanobis distance of every point from the column mean.
    if std > 0:
        dist = np.abs(np.asarray(values, dtype=float) - mean) / std
    else:
        # Constant (or entirely missing) column: there is no spread, hence no outliers.
        dist = np.zeros(len(values))
    # Positional indices of the outliers; NaN distances compare False and are never flagged.
    outliers = np.where(dist > threshold)[0]

    plt.figure(figsize=(15, 8))
    # Plot the slab temperature data
    plt.plot(data.index, values, 'o', markersize=2, label="Data")

    # Plot the outliers
    if len(outliers) > 0:
        plt.plot(data.index[outliers], values.iloc[outliers], 'ro', markersize=4, label="Outliers")

    plt.xlabel("Index")
    # NOTE(review): "Title" looks like a placeholder y-axis label - confirm the intended text/units.
    plt.ylabel("Title")
    plt.title(zone_col + " Slab Temperature")
    plt.legend()

    plt.show()

def impute_missing_values_knn(data, k=3):
    """
    Imputes missing values in the data using KNN imputation.

    Args:
        data (pandas.DataFrame): The input data with missing values.
        k (int): The number of nearest neighbors to consider when imputing missing values.

    Returns:
        filled_data (pandas.DataFrame): The data with missing values imputed using KNN imputation, carrying the
                                        same columns and index as ``data``.
    """
    imputer = KNNImputer(n_neighbors=k)
    filled_values = imputer.fit_transform(data)
    # fit_transform returns a bare array; reattach the original index as well as the column
    # names so the result stays aligned with the input (e.g. after rows were filtered out).
    filled_data = pd.DataFrame(filled_values, columns=data.columns, index=data.index)
    return filled_data

def impute_missing_values(data):
    """
    Fills missing values by linear interpolation.

    Args:
        data (pandas.DataFrame): The input data with missing values.

    Returns:
        filled_data (pandas.DataFrame): A new object in which missing values are filled by pandas'
                                        linear interpolation.
    """
    # "linear" is the pandas default, spelled out here for clarity.
    return data.interpolate(method="linear")

def impute_missing_values_rf(data, target_col):
    """
    Imputes missing values in the data using Random Forest Regression with grid search.

    Rows where ``target_col`` is present are used to train the model (all other columns are the features);
    the fitted model then predicts ``target_col`` for the rows where it is missing.

    Args:
        data (pandas.DataFrame): The input data with missing values.
        target_col (str): The column representing the target variable with missing values.

    Returns:
        imputed_data (pandas.DataFrame): A copy of ``data``, in the original row order, with the missing values of
                                         ``target_col`` imputed using Random Forest Regression.

    Raises:
        ValueError: If every value of ``target_col`` is missing, so there is nothing to train on.
    """
    missing_mask = data[target_col].isnull()
    # Work on a copy so the caller's frame is untouched and no slice of it is assigned to.
    imputed_data = data.copy()

    # Nothing to impute: skip the (expensive) grid search entirely.
    if not missing_mask.any():
        return imputed_data
    if missing_mask.all():
        raise ValueError("Column {!r} has no observed values to train on.".format(target_col))

    # Prepare the training data and target variable
    features = data.drop(target_col, axis=1)
    X_train = features[~missing_mask]
    y_train = data.loc[~missing_mask, target_col]

    # Define the parameter grid for grid search
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Search the grid with 5-fold cross-validation and keep the best estimator
    grid_search = GridSearchCV(RandomForestRegressor(), param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    best_rf_model = grid_search.best_estimator_

    # Write the predictions back in place so the rows keep their original order.
    imputed_data.loc[missing_mask, target_col] = best_rf_model.predict(features[missing_mask])

    return imputed_data

    
def main():
    """
    Runs the example workflow on the slab temperature data loaded into ``df_outliers``:
    covariance statistics, outlier plots per zone, Z-score filtering, and Random Forest imputation.
    """
    # Publish the inverse covariance matrix at module level so that code which looks it up
    # as a global name can resolve it; as a plain local of main() it would be invisible.
    global inv_cov_matrix

    df = df_outliers  # Assuming df_outliers is defined and contains the necessary data

    # Calculate the covariance matrix on complete rows only: a single NaN would turn
    # every entry of the covariance matrix (and of its inverse) into NaN.
    slab_temps = df[["Z1_slab_temp", "Z2_slab_temp"]].dropna()
    cov_matrix = calculate_covariance_matrix(slab_temps)
    # Calculate the inverse covariance matrix
    inv_cov_matrix = calculate_inverse_covariance_matrix(cov_matrix)
    # Plot slab temperature data with outliers for Z1_slab_temp
    plot_slab_temp_with_outliers(df, "Z1_slab_temp")
    # Plot slab temperature data with outliers for Z2_slab_temp
    plot_slab_temp_with_outliers(df, "Z2_slab_temp")

    # Example usage of Z-score outlier filtering
    data = df["Z1_slab_temp"]  # Example data
    filtered_data = filter_outliers_z_score(data, threshold=3)
    print("Filtered data:", filtered_data)

    # Impute missing values using Random Forest Regression with grid search
    imputed_df = impute_missing_values_rf(df, "Z1_slab_temp")
    print("Imputed data:", imputed_df)

    
# Run the example workflow only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()