In [1]:
import pandas as pd
import numpy as np
from scipy.io import arff
import pprint

import os
import sys
from pathlib import Path

from sklearn.model_selection import train_test_split

# Make the project's src/ directory importable from this notebook: take the parent of
# the resolved current working directory and append <parent>/src to sys.path.
# NOTE(review): assumes the notebook is run from a folder directly below the project root — confirm.
project_root = Path().resolve().parents[0]
sys.path.append(str(project_root / "src"))

from paths import RAW_DATA_DIR, SEISMIC_DATA, CLEAN_DATA_DIR, SEED

In [2]:
# Read the ARFF file at SEISMIC_DATA; loadarff returns the records together with their metadata
data, meta = arff.loadarff(SEISMIC_DATA)

In [3]:
# Wrap the loaded records in a DataFrame whose columns are named after the ARFF attributes,
# then print the column dtypes and non-null counts
seis = pd.DataFrame(data, columns=meta.names())
seis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2584 entries, 0 to 2583
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   seismic         2584 non-null   object 
 1   seismoacoustic  2584 non-null   object 
 2   shift           2584 non-null   object 
 3   genergy         2584 non-null   float64
 4   gpuls           2584 non-null   float64
 5   gdenergy        2584 non-null   float64
 6   gdpuls          2584 non-null   float64
 7   ghazard         2584 non-null   object 
 8   nbumps          2584 non-null   float64
 9   nbumps2         2584 non-null   float64
 10  nbumps3         2584 non-null   float64
 11  nbumps4         2584 non-null   float64
 12  nbumps5         2584 non-null   float64
 13  nbumps6         2584 non-null   float64
 14  nbumps7         2584 non-null   float64
 15  nbumps89        2584 non-null   float64
 16  energy          2584 non-null   float64
 17  maxenergy       2584 non-null   f

In [4]:
# Preview the first five rows; the categorical values are still raw byte strings (b'a', b'N', ...)
seis.head()

Unnamed: 0,seismic,seismoacoustic,shift,genergy,gpuls,gdenergy,gdpuls,ghazard,nbumps,nbumps2,nbumps3,nbumps4,nbumps5,nbumps6,nbumps7,nbumps89,energy,maxenergy,class
0,b'a',b'a',b'N',15180.0,48.0,-72.0,-72.0,b'a',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'0'
1,b'a',b'a',b'N',14720.0,33.0,-70.0,-79.0,b'a',1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2000.0,2000.0,b'0'
2,b'a',b'a',b'N',8050.0,30.0,-81.0,-78.0,b'a',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'0'
3,b'a',b'a',b'N',28820.0,171.0,-23.0,40.0,b'a',1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3000.0,3000.0,b'0'
4,b'a',b'a',b'N',12640.0,57.0,-63.0,-52.0,b'a',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'0'


In [5]:
def describe_column(df, col):
    """
    Analyze the data type and summarize values for a given column in a DataFrame.

    The summary branch is chosen from the column dtype first, so every numeric
    dtype (including integer dtypes such as int64) gets numeric statistics.
    For non-numeric dtypes the first non-null value decides: Python numbers
    give a numeric summary, bytes/str give a categorical summary, and anything
    else (including an empty or all-null non-numeric column) is reported as
    unsupported.

    Args:
        df (pd.DataFrame): The input DataFrame containing the column to describe.
        col (str): The name of the column to describe.

    Returns:
        dict: A dictionary containing the column name as key and a nested dictionary of:
            - dtype (str): Data type of the column
            - For numeric columns:
                - min (float): Minimum value
                - max (float): Maximum value
                - mean (float): Mean value
                - std (float): Standard deviation
                - n_unique (int): Number of unique values
            - For categorical columns:
                - n_unique (int): Number of unique categories
                - most_common (object): Most frequent category
                - most_common_freq (int): Frequency of the most common category
                - unique_values (list): List of all unique values
            - Otherwise:
                - summary (str): "Unsupported or unknown type"
    """
    col_data = df[col]
    dtype = col_data.dtype

    # Look at the first non-null value rather than iloc[0]: a leading NaN/None must
    # not decide the branch, and an empty column must not raise IndexError.
    non_null = col_data.dropna()
    sample = non_null.iloc[0] if len(non_null) > 0 else None

    # Decide from the dtype where possible: numpy integer scalars are not instances
    # of Python int, so a value-based isinstance check alone misses int64 columns.
    # Boolean dtypes are excluded so they are not summarized with mean/std.
    numeric_dtype = (
        pd.api.types.is_numeric_dtype(dtype)
        and not pd.api.types.is_bool_dtype(dtype)
    )

    if numeric_dtype or isinstance(sample, (int, float)):
        desc = {
            "dtype": str(dtype),
            "min": col_data.min(),
            "max": col_data.max(),
            "mean": col_data.mean(),
            "std": col_data.std(),
            "n_unique": col_data.nunique()
        }
    elif isinstance(sample, (bytes, str)):
        # Count once and reuse for both the most common value and its frequency
        counts = col_data.value_counts()
        desc = {
            "dtype": str(dtype),
            "n_unique": col_data.nunique(),
            "most_common": counts.idxmax(),
            "most_common_freq": int(counts.max()),
            "unique_values": col_data.unique().tolist()
        }
    else:
        desc = {
            "dtype": str(dtype),
            "summary": "Unsupported or unknown type"
        }

    return {col: desc}

In [6]:
# Summarize every column of the raw frame and pretty-print the combined result
descriptions = {}
for column in seis.columns:
    # describe_column returns a one-entry dict keyed by the column name
    descriptions[column] = describe_column(seis, column)[column]
pprint.pprint(descriptions)

{'class': {'dtype': 'object',
           'most_common': b'0',
           'most_common_freq': 2414,
           'n_unique': 2,
           'unique_values': [b'0', b'1']},
 'energy': {'dtype': 'float64',
            'max': 402000.0,
            'mean': 4975.270897832817,
            'min': 0.0,
            'n_unique': 242,
            'std': 20450.833222273333},
 'gdenergy': {'dtype': 'float64',
              'max': 1245.0,
              'mean': 12.37577399380805,
              'min': -96.0,
              'n_unique': 334,
              'std': 80.31905050221906},
 'gdpuls': {'dtype': 'float64',
            'max': 838.0,
            'mean': 4.50890092879257,
            'min': -96.0,
            'n_unique': 292,
            'std': 63.1665564838311},
 'genergy': {'dtype': 'float64',
             'max': 2595650.0,
             'mean': 90242.52321981425,
             'min': 100.0,
             'n_unique': 2212,
             'std': 229200.50889447622},
 'ghazard': {'dtype': 'object',
           

In [7]:
def decode_byte_columns(df):
    """
    Decode all byte string values in categorical (object) columns of a DataFrame.

    Every object-dtype column is scanned in full, so a column is decoded even
    when its first value is missing or is not a byte string. Columns without
    any byte strings (including empty ones) are left untouched.

    Args:
        df (pd.DataFrame): Input DataFrame with potential byte-string values.

    Returns:
        pd.DataFrame: A new DataFrame with byte strings decoded to UTF-8 strings where applicable.
            The input DataFrame is not modified.
    """
    df_clean = df.copy()

    for col in df_clean.select_dtypes(include=['object']).columns:
        values = df_clean[col]

        # Check the whole column instead of only iloc[0]: a leading None or non-bytes
        # value must not cause the column to be skipped, and an empty column must not raise.
        contains_bytes = values.map(lambda x: isinstance(x, bytes))
        if contains_bytes.any():
            df_clean[col] = values.map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

    return df_clean

In [8]:
# Apply decoding to the dataset; `seis` itself is left unchanged because
# decode_byte_columns works on a copy of its input
seis_decoded = decode_byte_columns(seis)
seis_decoded.head()

Unnamed: 0,seismic,seismoacoustic,shift,genergy,gpuls,gdenergy,gdpuls,ghazard,nbumps,nbumps2,nbumps3,nbumps4,nbumps5,nbumps6,nbumps7,nbumps89,energy,maxenergy,class
0,a,a,N,15180.0,48.0,-72.0,-72.0,a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,a,a,N,14720.0,33.0,-70.0,-79.0,a,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2000.0,2000.0,0
2,a,a,N,8050.0,30.0,-81.0,-78.0,a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,a,a,N,28820.0,171.0,-23.0,40.0,a,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3000.0,3000.0,0
4,a,a,N,12640.0,57.0,-63.0,-52.0,a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [9]:
def class_to_int(class_col):
    """
    Convert a binary class column to integer values 0 and 1.

    The zero label is recognised as the string '0', the byte string b'0', or a
    numeric zero. Accepting numeric zero makes the conversion idempotent:
    applying it to an already-converted column returns the same labels instead
    of turning every row into 1.

    Args:
        class_col (Iterable): A column or list-like object with values '0' and '1'
            (as strings, byte strings, or numbers).

    Returns:
        list: A list of integers where the zero label becomes 0 and all other values become 1.
    """
    def _is_zero_label(value):
        # Compare each kind of value only against its own type so that no
        # cross-type comparison (str vs bytes, number vs str) is performed.
        if isinstance(value, str):
            return value == '0'
        if isinstance(value, bytes):
            return value == b'0'
        if isinstance(value, (int, float, np.integer, np.floating)):
            return value == 0
        return False

    return [0 if _is_zero_label(x) else 1 for x in class_col]

In [10]:
# Convert the target to 0/1 integers. Casting to str first keeps this cell safe to re-run:
# once the column holds integers, comparing them with the string '0' would label every row as 1.
seis_decoded['class'] = class_to_int(seis_decoded['class'].astype(str))

# Share of rows in the positive class (class == 1), as a percentage.
# Rounding after scaling avoids floating-point artifacts from multiplying a rounded value.
round(seis_decoded['class'].mean() * 100, 2)

6.58

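The classes are imbalanced: only about 6.6% of the rows belong to the positive class (class 1). I expect this to be manageable, but it should be kept in mind when splitting the data and when choosing evaluation metrics, since accuracy alone will look good even for a model that never predicts the positive class.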

In [11]:
# One-hot encode the categorical features; drop_first=True omits the first level of each
# feature so the remaining dummy columns are not redundant with one another
categorical_cols = ["seismic", "seismoacoustic", "shift", "ghazard"]
seis_decoded = pd.get_dummies(
    data=seis_decoded,
    columns=categorical_cols,
    drop_first=True,
)

In [12]:
# Confirm the encoded frame: the categorical columns are replaced by boolean dummy columns
seis_decoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2584 entries, 0 to 2583
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   genergy           2584 non-null   float64
 1   gpuls             2584 non-null   float64
 2   gdenergy          2584 non-null   float64
 3   gdpuls            2584 non-null   float64
 4   nbumps            2584 non-null   float64
 5   nbumps2           2584 non-null   float64
 6   nbumps3           2584 non-null   float64
 7   nbumps4           2584 non-null   float64
 8   nbumps5           2584 non-null   float64
 9   nbumps6           2584 non-null   float64
 10  nbumps7           2584 non-null   float64
 11  nbumps89          2584 non-null   float64
 12  energy            2584 non-null   float64
 13  maxenergy         2584 non-null   float64
 14  class             2584 non-null   int64  
 15  seismic_b         2584 non-null   bool   
 16  seismoacoustic_b  2584 non-null   bool   


### Put it all together
Let's write this into one cleaning + modeling prep function that we can call from a script.

In [13]:
def clean_prep_modeling(df, seed, test_size=0.20, verbose=False):
    """
    Clean and prepare a dataset for modeling, including:
    - Column summaries (printed only when verbose is True)
    - Byte decoding
    - Class conversion
    - One-hot encoding
    - Train/test splitting

    The input DataFrame is not modified; all transformations are applied to a copy.

    Args:
        df (pd.DataFrame): Input raw dataset. Must contain a 'class' column and the
            categorical columns 'seismic', 'seismoacoustic', 'shift' and 'ghazard'.
        seed (int): Random seed for reproducibility
        test_size (float): Fraction of rows held out for the test set. Defaults to 0.20,
            i.e. 80% of the rows are used for training.
        verbose (bool): When True, pretty-print a per-column summary of the raw data
            to the console. Defaults to False.

    Returns:
        dict: Dictionary containing Xtrain, Xtest, yTrain, ytest
    """

    ## get info about columns and print it to console; suppress unless debug is on
    if verbose:
        descriptions = {}
        for column in df.columns:
            descriptions.update(describe_column(df, column))
        # pprint.pprint has no `verbose` keyword; passing one raises TypeError
        pprint.pprint(descriptions)

    ### remove the byte encoding (returns a copy, so the caller's frame is untouched)
    df = decode_byte_columns(df)

    ### set the class as an int
    df['class'] = class_to_int(df['class'])

    ### one-hot encode the data
    categorical_cols = ['seismic', 'seismoacoustic', 'shift', 'ghazard']
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    ### train test split the data
    # NOTE(review): the positive class is rare (~6.6% in the seismic data); consider
    # passing stratify=df['class'] so both splits keep the same class ratio.
    train, test = train_test_split(df, test_size=test_size, random_state=seed)

    ## assemble the output dict: targets first, then the feature matrices
    output = {
        'yTrain': train['class'],
        'ytest': test['class'],
        'Xtrain': train.drop('class', axis=1),
        'Xtest': test.drop('class', axis=1),
    }

    return output


This function will live in `data_cleaning.py`, which we can execute from the command line to run the whole cleaning and preparation step in one go.

The resulting dictionary of train/test splits will be saved as a pickle file, because pickle is a convenient way to store a Python dictionary of DataFrames.