In [9]:
import pandas as pd
import numpy as np
from scipy.io import arff
import pprint

import os
import sys
from pathlib import Path

project_root = Path().resolve().parents[1]
sys.path.append(str(project_root / "src"))

from paths import RAW_DATA_DIR, SEISMIC_DATA

In [4]:
data, meta = arff.loadarff(SEISMIC_DATA)

In [8]:
seis = pd.DataFrame(data = data, columns = meta.names())
seis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2584 entries, 0 to 2583
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   seismic         2584 non-null   object 
 1   seismoacoustic  2584 non-null   object 
 2   shift           2584 non-null   object 
 3   genergy         2584 non-null   float64
 4   gpuls           2584 non-null   float64
 5   gdenergy        2584 non-null   float64
 6   gdpuls          2584 non-null   float64
 7   ghazard         2584 non-null   object 
 8   nbumps          2584 non-null   float64
 9   nbumps2         2584 non-null   float64
 10  nbumps3         2584 non-null   float64
 11  nbumps4         2584 non-null   float64
 12  nbumps5         2584 non-null   float64
 13  nbumps6         2584 non-null   float64
 14  nbumps7         2584 non-null   float64
 15  nbumps89        2584 non-null   float64
 16  energy          2584 non-null   float64
 17  maxenergy       2584 non-null   f

In [7]:
seis.head()

Unnamed: 0,seismic,seismoacoustic,shift,genergy,gpuls,gdenergy,gdpuls,ghazard,nbumps,nbumps2,nbumps3,nbumps4,nbumps5,nbumps6,nbumps7,nbumps89,energy,maxenergy,class
0,b'a',b'a',b'N',15180.0,48.0,-72.0,-72.0,b'a',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'0'
1,b'a',b'a',b'N',14720.0,33.0,-70.0,-79.0,b'a',1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2000.0,2000.0,b'0'
2,b'a',b'a',b'N',8050.0,30.0,-81.0,-78.0,b'a',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'0'
3,b'a',b'a',b'N',28820.0,171.0,-23.0,40.0,b'a',1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3000.0,3000.0,b'0'
4,b'a',b'a',b'N',12640.0,57.0,-63.0,-52.0,b'a',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'0'


In [14]:
def describe_column(df, col):
    """
    Analyze the data type and summarize values for a given column in a DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame containing the column to describe.
        col (str): The name of the column to describe.

    Returns:
        dict: A dictionary containing the column name as key and a nested dictionary of:
            - dtype (str): Data type of the column
            - For numeric columns:
                - min (float): Minimum value
                - max (float): Maximum value
                - mean (float): Mean value
                - std (float): Standard deviation
                - n_unique (int): Number of unique values
            - For categorical columns:
                - n_unique (int): Number of unique categories
                - most_common (object): Most frequent category
                - most_common_freq (int): Frequency of the most common category
                - unique_values (list): List of all unique values
    """
    col_data = df[col]
    dtype = col_data.dtype

    if isinstance(col_data.iloc[0], (int, float)):
        desc = {
            "dtype": str(dtype),
            "min": col_data.min(),
            "max": col_data.max(),
            "mean": col_data.mean(),
            "std": col_data.std(),
            "n_unique": col_data.nunique()
        }
    elif isinstance(col_data.iloc[0], (bytes, str)):
        desc = {
            "dtype": str(dtype),
            "n_unique": col_data.nunique(),
            "most_common": col_data.value_counts().idxmax(),
            "most_common_freq": col_data.value_counts().max(),
            "unique_values": col_data.unique().tolist()
        }
    else:
        desc = {
            "dtype": str(dtype),
            "summary": "Unsupported or unknown type"
        }

    return {col: desc}

In [19]:
# Apply to all columns
descriptions = {}
for column in seis.columns:
    descriptions.update(describe_column(seis, column))
pprint.pprint(descriptions)

{'class': {'dtype': 'object',
           'most_common': b'0',
           'most_common_freq': 2414,
           'n_unique': 2,
           'unique_values': [b'0', b'1']},
 'energy': {'dtype': 'float64',
            'max': 402000.0,
            'mean': 4975.270897832817,
            'min': 0.0,
            'n_unique': 242,
            'std': 20450.833222273333},
 'gdenergy': {'dtype': 'float64',
              'max': 1245.0,
              'mean': 12.37577399380805,
              'min': -96.0,
              'n_unique': 334,
              'std': 80.31905050221906},
 'gdpuls': {'dtype': 'float64',
            'max': 838.0,
            'mean': 4.50890092879257,
            'min': -96.0,
            'n_unique': 292,
            'std': 63.1665564838311},
 'genergy': {'dtype': 'float64',
             'max': 2595650.0,
             'mean': 90242.52321981425,
             'min': 100.0,
             'n_unique': 2212,
             'std': 229200.50889447622},
 'ghazard': {'dtype': 'object',
           

In [20]:
def decode_byte_columns(df):
    """
    Decode all byte string values in categorical (object) columns of a DataFrame.

    Args:
        df (pd.DataFrame): Input DataFrame with potential byte-string values.

    Returns:
        pd.DataFrame: A new DataFrame with byte strings decoded to UTF-8 strings where applicable.
    """
    df_clean = df.copy()

    for col in df_clean.select_dtypes(include=['object']).columns:
        if isinstance(df_clean[col].iloc[0], bytes):
            df_clean[col] = df_clean[col].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

    return df_clean

In [21]:
# Apply decoding to the dataset
seis_decoded = decode_byte_columns(df)
seis_decoded.head()

Unnamed: 0,seismic,seismoacoustic,shift,genergy,gpuls,gdenergy,gdpuls,ghazard,nbumps,nbumps2,nbumps3,nbumps4,nbumps5,nbumps6,nbumps7,nbumps89,energy,maxenergy,class
0,a,a,N,15180.0,48.0,-72.0,-72.0,a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,a,a,N,14720.0,33.0,-70.0,-79.0,a,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2000.0,2000.0,0
2,a,a,N,8050.0,30.0,-81.0,-78.0,a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,a,a,N,28820.0,171.0,-23.0,40.0,a,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3000.0,3000.0,0
4,a,a,N,12640.0,57.0,-63.0,-52.0,a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [37]:
def class_to_int(class_col):
    """
    Convert a string-based binary class column to integer values 0 and 1.

    Args:
        class_col (Iterable): A column or list-like object with values '0' and '1' as strings.

    Returns:
        list: A list of integers where '0' becomes 0 and all other values become 1.
    """
    return [0 if x == '0' else 1 for x in class_col]

In [40]:
categorical_cols = ['seismic', 'seismoacoustic', 'shift', 'ghazard']
seis_decoded = pd.get_dummies(seis_decoded, columns=categorical_cols, drop_first=True)

In [41]:
seis_decoded

Unnamed: 0,genergy,gpuls,gdenergy,gdpuls,nbumps,nbumps2,nbumps3,nbumps4,nbumps5,nbumps6,...,nbumps89,energy,maxenergy,class,seismic_b,seismoacoustic_b,seismoacoustic_c,shift_W,ghazard_b,ghazard_c
0,15180.0,48.0,-72.0,-72.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,False,False,False,False,False,False
1,14720.0,33.0,-70.0,-79.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,2000.0,2000.0,0,False,False,False,False,False,False
2,8050.0,30.0,-81.0,-78.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,False,False,False,False,False,False
3,28820.0,171.0,-23.0,40.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,3000.0,3000.0,0,False,False,False,False,False,False
4,12640.0,57.0,-63.0,-52.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2579,81410.0,785.0,432.0,151.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,True,False,False,True,True,False
2580,42110.0,555.0,213.0,118.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,True,False,False,True,False,False
2581,26960.0,540.0,101.0,112.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,True,False,False,True,False,False
2582,16130.0,322.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,False,False,False,True,False,False
