# Cleaning data

In [1]:
import pandas as pd

In [2]:
def read_soo(file_obj: str) -> pd.DataFrame:
    """
    Load one tab-separated soo list text file into a DataFrame.

    The first row of the file is treated as a header and discarded; the
    retained columns are renamed to the fixed names listed below.

    Parameters
    ----------
    file_obj : str
        Path (or other object accepted by ``pandas.read_table``) of the
        text file to load.

    Returns
    -------
    pd.DataFrame
        Data indexed by ``("line", "station")`` with latitude and longitude
        rounded to three decimal places.
    """
    column_names = [
        "line",
        "station",
        "time",
        "latitude",
        "longitude",
        "depth",
        "temperature",
        "temperature_qc",
        "salinity",
        "salinity_qc",
    ]
    # File columns at positions 0 and 6 are not loaded.
    column_positions = [*range(1, 6), *range(7, 12)]

    frame = pd.read_table(
        file_obj,
        sep="\t",
        header=0,
        names=column_names,
        index_col=["line", "station"],
        usecols=column_positions,
    )
    # Only the coordinate columns are rounded; all others stay untouched.
    return frame.round({"latitude": 3, "longitude": 3})

In [3]:
def target(df: pd.DataFrame, line: int, station: list) -> pd.DataFrame:
    """
    Extract the rows of one line that belong to the requested stations.

    Parameters
    ----------
    df : pd.DataFrame
        Input data whose index has a level named ``"line"``; the remaining
        index after selecting the line is matched against ``station``.
    line : int
        Value of the ``"line"`` index level to select.
    station : list
        Station numbers to keep.

    Returns
    -------
    pd.DataFrame
        Matching rows with a fresh 0-based integer index; the previous
        index (the station numbers) is dropped, not kept as a column.
    """
    # Cross-section on the "line" level removes that level from the index.
    line_rows = df.xs(line, level="line")
    wanted = line_rows.index.isin(station)
    selected = line_rows.loc[wanted]
    return selected.reset_index(drop=True)

In [4]:
def validate(df: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only rows that pass the temperature and quality-control checks.

    A row is retained when its temperature is present and at least 2.0,
    and both ``temperature_qc`` and ``salinity_qc`` equal 1
    (presumably the "good data" flag -- confirm against the data source).

    Parameters
    ----------
    df : pd.DataFrame
        Data containing ``temperature``, ``temperature_qc`` and
        ``salinity_qc`` columns.

    Returns
    -------
    pd.DataFrame
        The retained rows, restricted to the first five columns by
        position.
    """
    temperature = df["temperature"]

    has_temperature = temperature.notna()
    above_minimum = temperature >= 2.0
    temperature_flag_ok = df["temperature_qc"] == 1
    salinity_flag_ok = df["salinity_qc"] == 1

    keep = has_temperature & above_minimum & temperature_flag_ok & salinity_flag_ok

    # Column selection is positional, so it depends on the input column order.
    return df.loc[keep].iloc[:, :5]

In [5]:
if __name__ == "__main__":
    # One raw text file per year, 1969 through 2024 (range end is exclusive).
    file_list = [f"../data/raw/{year}.txt" for year in range(1969, 2025)]

    # Load every file, then concatenate once. Concatenating inside the loop
    # re-copies the accumulated frame on each iteration (quadratic work).
    df = pd.concat([read_soo(file_obj) for file_obj in file_list])

    # Drop rows failing the temperature / quality-control checks.
    df_valid = validate(df)

    # Keep stations 1-4 of line 208 and write them out without the row index.
    line_208 = target(df_valid, 208, [1, 2, 3, 4])
    line_208.to_csv("../data/cleaned_line_208.csv", index=False, encoding="utf-8")