This is an example of using the EM algorithm to impute missing data. <br> In this case we use air pollution data, and the goal is to perform regression analysis to predict the value of PM2.5.<br>
To simulate a dataset with missing values, we pick random rows and replace some of their features with NaN.<br>
In the end we compare the R-squared value obtained with the EM algorithm, the original data, and a baseline method (KNN) as a metric.<br>
**Note:** the code cells below load the Gas Sensor Drift Dataset and report classification accuracy and F1 (not R-squared), and additionally compare MissForest, fuzzy c-means and mean imputation.

In [1]:
# Work around relative-import problems: put the parent of the current
# working directory on sys.path so packages located there can be imported.
import os
import sys
import time

parent_dir = os.path.dirname(os.getcwd())
print(parent_dir)
sys.path.append(parent_dir)

# Required packages
import numpy as np
import pandas as pd
import Imputers.utils as utils
import Imputers.em as em
import Imputers.MissForest as MissForest
import Imputers.fcm_impute as Fcm_imputer
import DataQuality.continuous as continuous
from sklearn.impute import KNNImputer, SimpleImputer
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

D:\大學\EDASH\EDASH


Read the missing data you generated.

In [2]:
# Missing-data percentages to evaluate (15 % to 75 % in steps of 5); each one
# is used below to build a "miss_<rate>" data folder path.
config = dict(miss_rate=list(range(15, 80, 5)))

# One list per imputation method; the loops below append one
# [accuracy, f1, elapsed time] entry per missing rate.
result = {method: [] for method in ('em', 'knn', 'miss', 'fcm', 'mean')}

In [3]:
# Impute each missing-rate variant with EM and with KNN, save the imputed
# train/test splits, and record [accuracy, f1, elapsed time] for each method.

# Start from empty lists so re-running this cell (e.g. after an interrupt)
# cannot leave duplicated entries that shift the per-rate indices.
result['em'] = []
result['knn'] = []

for miss_rate in config['miss_rate']:
    data_dir = f"./data/Gas Sensor Drift Dataset/miss_{miss_rate}"
    m_X_train = pd.read_csv(f"{data_dir}/X_train.csv")
    y_train = pd.read_csv(f"{data_dir}/y_train.csv")
    m_X_test = pd.read_csv(f"{data_dir}/X_test.csv")
    y_test = pd.read_csv(f"{data_dir}/y_test.csv")

    # Shift labels down by one (original note: "For XGB fix").
    # NOTE(review): presumably the classifier needs 0-based class labels - confirm.
    y_train = y_train - 1
    y_test = y_test - 1

    # Train and test are imputed together and split back afterwards. The split
    # point is taken from the data instead of the hard-coded 11823.
    n_train = len(m_X_train)
    missing_df = pd.concat([m_X_train, m_X_test], axis=0, ignore_index=True)

    # EM imputer: the returned dict provides the imputed frame ('X_imputed')
    # and the elapsed time ('time').
    result_imputed = em.impute_em(missing_df, 40, 0.03, True, eps_form='relative')
    x_train = result_imputed['X_imputed'][:n_train]
    x_test = result_imputed['X_imputed'][n_train:]

    x_train.to_csv(f"{data_dir}/em_x_train.csv", index=False)
    x_test.to_csv(f"{data_dir}/em_x_test.csv", index=False)
    accuracy, f1 = utils.generate_stack_prediction(x_train, y_train, x_test, y_test)
    result['em'].append([accuracy, f1, result_imputed['time']])

    # KNN imputer (baseline); elapsed time is stored as a timedelta.
    t_start = datetime.now()
    knn = KNNImputer(n_neighbors=3)
    knn_X = knn.fit_transform(missing_df)
    time_s = datetime.now() - t_start
    # fit_transform returns a plain array, so restore the column names.
    knn_df = pd.DataFrame(knn_X, columns=missing_df.columns)

    x_train = knn_df[:n_train]
    x_test = knn_df[n_train:]

    x_train.to_csv(f"{data_dir}/knn_x_train.csv", index=False)
    x_test.to_csv(f"{data_dir}/knn_x_test.csv", index=False)
    accuracy, f1 = utils.generate_stack_prediction(x_train, y_train, x_test, y_test)
    result['knn'].append([accuracy, f1, time_s])

Iteration 1/40
Convergence Check: Mu:166.3210 | S:546957821.2864
Iteration 2/40
Convergence Check: Mu:40.7342 | S:2143010.1291
Iteration 3/40
Convergence Check: Mu:18.0303 | S:673982.4541
Iteration 4/40
Convergence Check: Mu:8.3069 | S:651068.1308
Iteration 5/40
Convergence Check: Mu:4.1512 | S:660872.2283
Iteration 6/40
Convergence Check: Mu:2.2834 | S:626803.3824
Iteration 7/40
Convergence Check: Mu:1.4691 | S:578312.2664
Iteration 8/40
Convergence Check: Mu:1.1439 | S:529298.9426
Iteration 9/40
Convergence Check: Mu:1.0221 | S:483694.2434
Iteration 10/40
Convergence Check: Mu:0.9726 | S:441966.1484
Iteration 11/40
Convergence Check: Mu:0.9440 | S:403714.8666
Iteration 12/40
Convergence Check: Mu:0.9165 | S:368491.8148
Iteration 13/40
Convergence Check: Mu:0.8821 | S:335998.7667
Iteration 14/40
Convergence Check: Mu:0.8379 | S:306091.7533
Iteration 15/40
Convergence Check: Mu:0.7837 | S:278722.4256
Iteration 16/40
Convergence Check: Mu:0.7216 | S:253862.7439
Iteration 17/40
Convergen


KeyboardInterrupt



## Classification on Original data

In [None]:
# Reference run on the complete (no missing values) feature files.
# NOTE(review): y_train / y_test are not loaded here; they are whatever the
# imputation loop above left behind after its last iteration - confirm.
c_x_train, c_x_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/Gas Sensor Drift Dataset/complete/X_train.csv",
        "./data/Gas Sensor Drift Dataset/complete/X_test.csv",
    )
)
utils.generate_stack_prediction(c_x_train, y_train, c_x_test, y_test)

## Classification on imputed data (MissForest)

In [None]:
# Impute each missing-rate variant with MissForest, save the imputed splits,
# and record [accuracy, f1, elapsed time].
# NOTE(review): y_train / y_test are not reloaded here; they are whatever the
# EM/KNN cell left behind after its last iteration - confirm the labels are
# identical across miss rates.

# Reset so that re-running this cell does not append duplicate entries.
result['miss'] = []

for miss_rate in config['miss_rate']:
    data_dir = f"./data/Gas Sensor Drift Dataset/miss_{miss_rate}"
    m_X_train = pd.read_csv(f"{data_dir}/X_train.csv")
    m_X_test = pd.read_csv(f"{data_dir}/X_test.csv")

    # Train and test are imputed together and split back afterwards. The split
    # point is taken from the data instead of the hard-coded 11823.
    n_train = len(m_X_train)
    missing_df = pd.concat([m_X_train, m_X_test], axis=0, ignore_index=True)

    mf_imputer = MissForest.MissForest(max_iter=15)
    start_time = datetime.now()
    mf_df = mf_imputer.fit_transform(missing_df, verbose=False)
    # Measure once, and do not call this variable `time`: that would shadow the
    # `time` module imported at the top and break later `time.time()` calls.
    duration = datetime.now() - start_time
    print(f"Execution time: {duration}")

    x_train = mf_df[:n_train]
    x_test = mf_df[n_train:]

    x_train.to_csv(f"{data_dir}/mf_x_train.csv", index=False)
    x_test.to_csv(f"{data_dir}/mf_x_test.csv", index=False)
    accuracy, f1 = utils.generate_stack_prediction(x_train, y_train, x_test, y_test)

    result['miss'].append([accuracy, f1, duration])

In [None]:
# Sanity check: show the (rows, columns) shape of `missing_df`, i.e. the
# combined train+test frame left over from the last loop iteration above.
display(missing_df.shape)

## Classification on imputed data (fuzzy)

In [None]:
# Impute each missing-rate variant with the fuzzy c-means imputer, save the
# imputed splits, and record [accuracy, f1, elapsed seconds].
# NOTE(review): y_train / y_test are not reloaded here; they are whatever the
# EM/KNN cell left behind after its last iteration - confirm the labels are
# identical across miss rates.

# Reset so that re-running this cell does not append duplicate entries.
result['fcm'] = []

for miss_rate in config['miss_rate']:
    data_dir = f"./data/Gas Sensor Drift Dataset/miss_{miss_rate}"
    m_X_train = pd.read_csv(f"{data_dir}/X_train.csv")
    m_X_test = pd.read_csv(f"{data_dir}/X_test.csv")

    # Train and test are imputed together and split back afterwards. The split
    # point is taken from the data instead of the hard-coded 11823.
    n_train = len(m_X_train)
    missing_df = pd.concat([m_X_train, m_X_test], axis=0, ignore_index=True)

    fcmImputer = Fcm_imputer.FCMImputer(data=missing_df, num_clusters=3)
    # Only the impute() call is timed; `time` must still be the time module here.
    s_time = time.time()
    fuzzy_X = fcmImputer.impute()
    duration = time.time() - s_time

    x_train = fuzzy_X[:n_train]
    x_test = fuzzy_X[n_train:]

    x_train.to_csv(f"{data_dir}/fcm_x_train.csv", index=False)
    x_test.to_csv(f"{data_dir}/fcm_x_test.csv", index=False)
    accuracy, f1 = utils.generate_stack_prediction(x_train, y_train, x_test, y_test)

    result['fcm'].append([accuracy, f1, duration])

## Mean Impute

In [None]:
# Impute each missing-rate variant with the column mean, save the imputed
# splits, and record [accuracy, f1, elapsed seconds].
# NOTE(review): y_train / y_test are not reloaded here; they are whatever the
# EM/KNN cell left behind after its last iteration - confirm the labels are
# identical across miss rates.

# Reset so that re-running this cell does not append duplicate entries.
result['mean'] = []

# Use the same missing rates as the other methods so that result['mean'][i]
# refers to the same rate as result['em'][i], result['knn'][i], etc.
for miss_rate in config['miss_rate']:
    data_dir = f"./data/Gas Sensor Drift Dataset/miss_{miss_rate}"
    m_X_train = pd.read_csv(f"{data_dir}/X_train.csv")
    m_X_test = pd.read_csv(f"{data_dir}/X_test.csv")

    # Train and test are imputed together and split back afterwards. The split
    # point is taken from the data instead of the hard-coded 11823.
    n_train = len(m_X_train)
    missing_df = pd.concat([m_X_train, m_X_test], axis=0, ignore_index=True)

    s_time = time.time()
    mean_imputer = SimpleImputer(strategy='mean')
    # fit_transform already fits; a separate fit() beforehand only doubled the
    # measured time.
    mean_x = mean_imputer.fit_transform(missing_df)
    duration = time.time() - s_time

    # fit_transform returns a NumPy array, which has no .to_csv(); wrap it in a
    # DataFrame with the original column names (same approach as the KNN cell).
    mean_df = pd.DataFrame(mean_x, columns=missing_df.columns)

    x_train = mean_df[:n_train]
    x_test = mean_df[n_train:]

    x_train.to_csv(f"{data_dir}/mean_x_train.csv", index=False)
    x_test.to_csv(f"{data_dir}/mean_x_test.csv", index=False)
    accuracy, f1 = utils.generate_stack_prediction(x_train, y_train, x_test, y_test)
    result['mean'].append([accuracy, f1, duration])

In [None]:
# For the first three recorded runs, print the F1 score (element 1 of each
# [accuracy, f1, time] entry) of every method, in the order
# MissForest, EM, KNN, FCM, mean.
for run_idx in range(3):
    for method in ('miss', 'em', 'knn', 'fcm', 'mean'):
        print(result[method][run_idx][1])

In [None]:
# con = continuous()
# frame = con.comparison(knn_X_df, result_imputed['X_imputed'])