In [23]:
import pandas as pd
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np



In [3]:
def load_features_from_directory(directory):
    all_features = []
    for file in tqdm(os.listdir(directory), desc="Loading feature files"):
        if file.endswith('_features.csv'):
            file_path = os.path.join(directory, file)
            features_df = pd.read_csv(file_path)
            all_features.append(features_df)
    return pd.concat(all_features, ignore_index=True)

In [4]:
features_train_dir = '/run/media/viblab/Markov2/Haykal/AnakKrakatauEWS/data/features/train'
features_test_dir = '/run/media/viblab/Markov2/Haykal/AnakKrakatauEWS/data/features/test'

train_features = load_features_from_directory(features_train_dir)

Loading feature files:   0%|          | 0/221033 [00:00<?, ?it/s]

In [5]:
test_features = load_features_from_directory(features_test_dir)

Loading feature files:   0%|          | 0/535 [00:00<?, ?it/s]

In [22]:
def impute_missing_values(X):
    imputer = SimpleImputer(strategy='mean')  # Replace 'mean' with 'median' or 'most_frequent' if preferred
    return imputer.fit_transform(X)

In [24]:
def scale_with_progress(X, scaler):
    # Create an array to store the scaled data
    X_scaled = np.zeros_like(X)
    
    # Scale each feature (column) and update progress bar
    with tqdm(total=X.shape[1], desc="Scaling Features") as pbar:
        for i in range(X.shape[1]):
            X_scaled[:, i] = scaler.fit_transform(X[:, [i]].reshape(-1, 1)).ravel()
            pbar.update(1)
    return X_scaled

In [25]:
def apply_pca_with_progress(X, n_components):
    tqdm.write("Applying PCA...")
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    tqdm.write("PCA completed.")
    return X_pca

In [26]:
X_train_imputed = impute_missing_values(train_features)
X_test_imputed = impute_missing_values(test_features)

In [29]:
scaler = StandardScaler()
X_train_scaled = scale_with_progress(X_train_imputed, scaler)
X_test_scaled = scale_with_progress(X_test_imputed, scaler)

Scaling Features:   0%|          | 0/115 [00:00<?, ?it/s]

Scaling Features:   0%|          | 0/115 [00:00<?, ?it/s]

In [30]:
X_train_pca = apply_pca_with_progress(X_train_scaled, n_components=0.95)

Applying PCA...
PCA completed.


In [31]:
X_test_pca = apply_pca_with_progress(X_test_scaled, n_components=0.95)

Applying PCA...
PCA completed.
