## Exploratory Data Analysis

In [3]:
import sys
sys.path.append('/run/media/viblab/Markov2/Haykal/AnakKrakatauEWS/')

import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display
from scipy.fft import fft
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import re

import warnings
warnings.filterwarnings('ignore')



### Load Data

Load the dataset from the 7 stations for the entire period. During preprocessing, a min-max scaler is applied to each snippet individually.

In [9]:
# Function to process files
def process_file(file_path, scaler):
    """Read one CSV file and return its sensor columns rescaled by ``scaler``.

    The first column (assumed to be the timestamp) is always discarded, and
    the 'I06H5' column is discarded as well when the file contains it. The
    remaining columns are passed through ``scaler.fit_transform``, so the
    scaler is re-fitted on this file alone.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    scaler : object
        Any object exposing ``fit_transform(frame)`` that returns an array
        with one column per input column.

    Returns
    -------
    pandas.DataFrame or None
        The scaled sensor data under the original column names, or ``None``
        when anything goes wrong (the error is printed, not raised).
    """
    try:
        raw = pd.read_csv(file_path)

        # Leading column is taken to be the timestamp; 'I06H5' is optional.
        unwanted = [raw.columns[0]] + (['I06H5'] if 'I06H5' in raw.columns else [])
        sensors = raw.drop(columns=unwanted)

        # Re-fit and apply the scaler, then restore the column labels.
        scaled_values = scaler.fit_transform(sensors)
        return pd.DataFrame(scaled_values, columns=sensors.columns)
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

In [13]:
def extract_detection_number(filename):
    """Return the integer that follows 'IM_det_' in ``filename``.

    The digits must be immediately followed by an underscore. ``None`` is
    returned when the name does not contain that pattern.
    """
    found = re.search(r'IM_det_(\d+)_', filename)
    if found is None:
        return None
    return int(found.group(1))

In [14]:
data_dir = '/run/media/viblab/Markov2/Haykal/AnakKrakatauEWS/data/processed/I06AU_FULL'
# A single scaler object is shared; process_file re-fits it on every file
# through fit_transform, so each file is normalized independently.
scaler = MinMaxScaler()

# Pair every CSV in data_dir with the detection number parsed from its name.
file_info = [(f, extract_detection_number(f)) for f in os.listdir(data_dir) if f.endswith('.csv')]

# A name that does not match the detection pattern yields None, which can
# neither be sorted against integers nor compared with the split threshold.
# Report such files and leave them out instead of failing with a TypeError.
unnumbered_files = [f for f, count in file_info if count is None]
if unnumbered_files:
    print(f"Skipping {len(unnumbered_files)} CSV file(s) without a detection number: {unnumbered_files}")
file_info = [(f, count) for f, count in file_info if count is not None]
file_info.sort(key=lambda x: x[1])

# Detections numbered below this value form the training set; the rest form
# the test set.
split_detection_count = 636

train_files = [f for f, count in file_info if count < split_detection_count]
test_files = [f for f, count in file_info if count >= split_detection_count]

train_data = [process_file(os.path.join(data_dir, f), scaler) for f in train_files]
test_data = [process_file(os.path.join(data_dir, f), scaler) for f in test_files]

# process_file returns None when a file could not be processed. Drop those
# entries from the file lists AND the data lists so both stay index-aligned;
# they are zipped together later, and filtering only the data would attach
# frames to the wrong file names.
train_files = [f for f, df in zip(train_files, train_data) if df is not None]
test_files = [f for f, df in zip(test_files, test_data) if df is not None]
train_data = [df for df in train_data if df is not None]
test_data = [df for df in test_data if df is not None]


In [15]:
# Print summary statistics for every normalized training file.
for i, df in enumerate(train_data):
    print(f"Descriptive Statistics for Train File {i}:", df.describe(), sep="\n")

Descriptive Statistics for Train File 0:
            I06H1       I06H2       I06H3       I06H4       I06H6       I06H7  \
count  101.000000  101.000000  101.000000  101.000000  101.000000  101.000000   
mean     0.543509    0.530231    0.512076    0.495305    0.501829    0.504417   
std      0.201744    0.187111    0.222996    0.249626    0.204417    0.225825   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      0.403875    0.423552    0.355237    0.294858    0.358238    0.357565   
50%      0.546244    0.519381    0.496840    0.473575    0.521849    0.465339   
75%      0.697720    0.654789    0.684979    0.684127    0.651522    0.660782   
max      1.000000    1.000000    1.000000    1.000000    1.000000    1.000000   

            I06H8  
count  101.000000  
mean     0.530064  
std      0.202608  
min      0.000000  
25%      0.414658  
50%      0.543248  
75%      0.659789  
max      1.000000  
Descriptive Statistics for Train File 1:
        

            I06H1       I06H2       I06H3       I06H4       I06H6       I06H7  \
count  101.000000  101.000000  101.000000  101.000000  101.000000  101.000000   
mean     0.465296    0.416199    0.372762    0.416829    0.481716    0.594629   
std      0.184094    0.184398    0.236613    0.183386    0.223087    0.201830   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      0.349622    0.289358    0.211346    0.295566    0.322804    0.477105   
50%      0.465784    0.408190    0.337039    0.427492    0.461405    0.619197   
75%      0.574537    0.510244    0.521387    0.514895    0.621069    0.730267   
max      1.000000    1.000000    1.000000    1.000000    1.000000    1.000000   

            I06H8  
count  101.000000  
mean     0.515363  
std      0.174089  
min      0.000000  
25%      0.408157  
50%      0.502660  
75%      0.609715  
max      1.000000  
Descriptive Statistics for Train File 35:
            I06H1       I06H2       I06H3       

In [16]:
# Print summary statistics for every normalized test file.
for i, df in enumerate(test_data):
    print(f"Descriptive Statistics for Test File {i}:", df.describe(), sep="\n")

Descriptive Statistics for Test File 0:
            I06H1       I06H2       I06H3       I06H4       I06H6       I06H7  \
count  101.000000  101.000000  101.000000  101.000000  101.000000  101.000000   
mean     0.510171    0.543136    0.485369    0.486018    0.563567    0.553204   
std      0.198332    0.195578    0.225974    0.203103    0.206523    0.216053   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      0.397018    0.470301    0.339692    0.371148    0.441000    0.465684   
50%      0.513141    0.561812    0.504680    0.499468    0.551568    0.555762   
75%      0.645451    0.631896    0.616506    0.590269    0.683045    0.674142   
max      1.000000    1.000000    1.000000    1.000000    1.000000    1.000000   

            I06H8  
count  101.000000  
mean     0.505581  
std      0.184563  
min      0.000000  
25%      0.414388  
50%      0.473009  
75%      0.583307  
max      1.000000  
Descriptive Statistics for Test File 1:
          

            I06H1       I06H2       I06H3       I06H4       I06H6       I06H7  \
count  101.000000  101.000000  101.000000  101.000000  101.000000  101.000000   
mean     0.518507    0.461203    0.524304    0.589716    0.499892    0.577092   
std      0.262240    0.293596    0.209556    0.218876    0.261716    0.219578   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      0.314226    0.180670    0.384621    0.448888    0.270986    0.448815   
50%      0.524368    0.473818    0.507853    0.602827    0.552334    0.628434   
75%      0.702371    0.720892    0.614374    0.761613    0.680352    0.724212   
max      1.000000    1.000000    1.000000    1.000000    1.000000    1.000000   

            I06H8  
count  101.000000  
mean     0.540971  
std      0.197418  
min      0.000000  
25%      0.407397  
50%      0.524853  
75%      0.667669  
max      1.000000  
Descriptive Statistics for Test File 18:
            I06H1       I06H2       I06H3       I

### Data Windowing

In [26]:
def windowing(data, window_size, overlap_size):
    """Cut ``data`` into consecutive fixed-length windows of rows.

    Windows start every ``window_size - overlap_size`` rows. Only complete
    windows are returned; a trailing remainder shorter than ``window_size``
    is dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to slice positionally (via ``.iloc``).
    window_size : int
        Number of rows in each window; must be at least 1.
    overlap_size : int
        Number of rows shared by two consecutive windows; must be smaller
        than ``window_size`` so that the stride is positive.

    Returns
    -------
    list of pandas.DataFrame
        The windows in order of their start row. Empty when ``data`` has
        fewer than ``window_size`` rows.

    Raises
    ------
    ValueError
        If ``window_size`` is not positive or ``overlap_size`` is not
        smaller than ``window_size``.
    """
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}")

    step = window_size - overlap_size
    if step <= 0:
        # A zero stride makes range() fail with an unrelated message and a
        # negative stride would silently produce no windows at all.
        raise ValueError(
            f"overlap_size ({overlap_size}) must be smaller than window_size ({window_size})"
        )

    # Stop at the last start position that still leaves room for a full window.
    last_start = len(data) - window_size
    return [data.iloc[start:start + window_size] for start in range(0, last_start + 1, step)]


In [27]:
def save_windows_to_csv(windowed_data, directory, file_prefix):
    """Write every window of every source file to its own CSV in ``directory``.

    Parameters
    ----------
    windowed_data : dict
        Maps a source file name to the list of windows cut from it.
    directory : str
        Output folder; created if it does not exist yet.
    file_prefix : str
        Accepted but not used: output names are built only from the
        detection number and the window index.

    Each window is stored as ``Det_<detection number>_<window index>.csv``
    without the index column. The detection number comes from
    ``extract_detection_number`` applied to the source file name.
    """
    os.makedirs(directory, exist_ok=True)
    for source_name in windowed_data:
        detection_number = extract_detection_number(source_name)
        for i, frame in enumerate(windowed_data[source_name]):
            target_path = os.path.join(directory, f'Det_{detection_number}_{i}.csv')
            frame.to_csv(target_path, index=False)


In [29]:
# Windowing configuration
window_size = 100  # rows per window
overlap_size = 0  # rows shared by consecutive windows

# Map each training file name to the list of windows cut from its data.
windowed_train_data = dict(zip(
    train_files,
    (windowing(frame, window_size, overlap_size) for frame in train_data),
))

# Same mapping for the test files.
windowed_test_data = dict(zip(
    test_files,
    (windowing(frame, window_size, overlap_size) for frame in test_data),
))


In [30]:
# Output folders for the windowed train and test CSV files
windowed_train_dir = '/run/media/viblab/Markov2/Haykal/AnakKrakatauEWS/data/train'
windowed_test_dir = '/run/media/viblab/Markov2/Haykal/AnakKrakatauEWS/data/test'
for output_dir in (windowed_train_dir, windowed_test_dir):
    os.makedirs(output_dir, exist_ok=True)

# Write one CSV per window: train set first, then test set.
save_windows_to_csv(
    windowed_data=windowed_train_data,
    directory=windowed_train_dir,
    file_prefix='windowed_train',
)
save_windows_to_csv(
    windowed_data=windowed_test_data,
    directory=windowed_test_dir,
    file_prefix='windowed_test',
)


### Visualization