nacitanie dat, spojenie new_observation s new_station
rozdelenie na training data a testing data casti
vsetky atributy normalizovat aby boli numericke -> vhodne pre ML (pricom kategoricke features budu continenty, vynechavame staty a mesta)



In [41]:
import pandas as pd
import numpy as np
import dateparser

In [42]:
class ParseLocation:
    """Pipeline step that splits a combined location column in two.

    The text before the first ``/`` is written to ``continent`` and the rest
    to ``city``; the source column is then dropped.
    """

    def __init__(self, column='location'):
        """Remember the name of the column that holds the combined value."""
        self.column = column

    def transform(self, df):
        """Return a copy of ``df`` with ``continent`` and ``city`` columns.

        Rows without a ``/`` get their whole value as ``continent`` and a
        missing ``city``. Rows whose location is missing stay missing in both
        new columns instead of becoming the text ``'nan'`` / ``'None'``.

        Raises:
            ValueError: if the configured column is not present in ``df``.
        """
        if self.column not in df.columns:
            raise ValueError(f"Column '{self.column}' not found in DataFrame.")

        df = df.copy()
        source = df[self.column]
        missing = source.isna()

        parts = source.astype(str).str.split('/', n=1, expand=True)

        # expand=True only creates as many columns as the widest split, so
        # column 1 (or even column 0 for empty input) may not exist at all.
        placeholder = pd.Series(np.nan, index=df.index, dtype=object)
        continent = parts[0].str.strip() if 0 in parts.columns else placeholder
        city = parts[1].str.strip() if 1 in parts.columns else placeholder

        # astype(str) stringified missing values; blank those rows out again.
        df['continent'] = continent.mask(missing)
        df['city'] = city.mask(missing)

        return df.drop(columns=[self.column])

In [43]:
class Clean:
    """Pipeline step that normalises the text in every object-dtype column."""

    def transform(self, df):
        """Return a copy of ``df`` with cleaned text columns.

        Each object column is stripped, lower-cased, and reduced to the
        characters ``a-z``, ``0-9``, whitespace, ``,``, ``.`` and ``-``.
        Missing entries are left missing rather than being turned into the
        literal text ``'nan'`` / ``'none'`` by the string conversion.
        """
        df = df.copy()
        for col in df.select_dtypes('object').columns:
            original = df[col]
            cleaned = (
                original
                .astype(str)
                .str.strip()
                .str.lower()
                .str.replace(r'[^a-z0-9\s,.-]', '', regex=True)
            )
            # Restore the gaps so later null handling can still see them.
            df[col] = cleaned.mask(original.isna())
        return df


In [44]:
class DropFeatures:
    """Pipeline step that removes a configured set of columns."""

    def __init__(self, columns):
        """Keep the column labels that ``transform`` should remove."""
        self.columns = columns

    def transform(self, df):
        """Return ``df`` without the configured columns.

        Labels that are not present in ``df`` are skipped silently.
        """
        unwanted = self.columns
        trimmed = df.drop(columns=unwanted, errors='ignore')
        return trimmed


In [45]:
class RemoveDuplicates:
    """Pipeline step that discards rows repeated across all columns."""

    def __init__(self):
        """Nothing to configure."""

    def transform(self, df):
        """Return ``df`` keeping only the first occurrence of each row."""
        unique_rows = df.drop_duplicates(keep='first')
        return unique_rows

In [46]:
class AddFromTable:
    """Pipeline step that left-joins columns from a lookup table."""

    def __init__(self, data, condition, columns=None):
        """Configure the join.

        Args:
            data: DataFrame to take the extra columns from.
            condition: key column name, or an iterable of key column names,
                present in both frames.
            columns: optional column name(s) of ``data`` to add; names that
                ``data`` does not have are ignored.
        """
        self.data = data
        if isinstance(condition, str):
            self.condition = [condition]
        else:
            # list() lets tuples and other iterables work as well.
            self.condition = list(condition)
        self.columns = columns

    def transform(self, df):
        """Return ``df`` left-merged with the selected columns of ``data``."""
        keys = list(self.condition)

        if self.columns is None:
            requested = []
        elif isinstance(self.columns, str):
            requested = [self.columns]
        else:
            requested = list(self.columns)

        # Skip unknown names and anything already selected: selecting a key
        # column twice would produce duplicate labels and break the merge.
        selected = list(keys)
        for col in requested:
            if col in self.data.columns and col not in selected:
                selected.append(col)

        # Projecting onto a subset of columns can create identical rows that
        # would otherwise multiply the matching rows of ``df``.
        right = self.data[selected].drop_duplicates()
        return df.merge(right, on=keys, how='left')

In [47]:
class DealWithNull:
    """Pipeline step that drops or fills missing values."""

    def __init__(self, remove=None, replace=None, replace_method=None):
        """Configure the null handling.

        Args:
            remove: column name(s); rows missing a value in any of them are
                dropped.
            replace: column name(s) whose missing values are filled.
            replace_method: ``"mean"``, ``"median"`` or ``"mode"`` to fill
                with that statistic of the column; any other value is used
                directly as the fill value.
        """
        self.remove = remove
        self.replace = replace
        self.replace_method = replace_method

    @staticmethod
    def _as_list(columns):
        """Wrap a single column name in a list; pass other values through."""
        return [columns] if isinstance(columns, str) else columns

    def _fill_value(self, series):
        """Return the value used to fill ``series``, or None if there is none."""
        method = self.replace_method
        if method == "mean":
            return series.mean()
        if method == "median":
            return series.median()
        if method == "mode":
            modes = series.mode()
            # mode() is empty when the column has no observed values at all.
            return modes.iloc[0] if not modes.empty else None
        return method

    def transform(self, df):
        """Return a copy of ``df`` with nulls removed and/or filled.

        Columns listed in ``replace`` that are absent from ``df`` are
        skipped. A column with no observed values is left untouched when the
        fill statistic cannot be computed.
        """
        df = df.copy()
        if self.remove:
            df = df.dropna(subset=self._as_list(self.remove))

        if not self.replace or self.replace_method is None:
            return df

        for col in self._as_list(self.replace):
            if col not in df.columns:
                continue
            value = self._fill_value(df[col])
            if value is None:
                continue
            df[col] = df[col].fillna(value)

        return df


In [48]:
class DealWithOutliers:
    """Pipeline step that removes or caps extreme values."""

    def __init__(self, IQR=None, cap=None, cap_t=None):
        """Configure the outlier handling.

        Args:
            IQR: columns whose rows are dropped when the value lies outside
                ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``.
            cap: columns whose values are clipped to a percentile range.
            cap_t: tail percentile for capping; values are clipped to the
                ``cap_t``-th and ``(100 - cap_t)``-th percentiles.
        """
        self.IQR = IQR
        self.cap = cap
        self.cap_t = cap_t

    def transform(self, df):
        """Return a copy of ``df`` with outliers dropped and/or capped.

        Configured columns that are absent from ``df`` are skipped. Missing
        values are not outliers: they are neither dropped by the IQR filter
        nor changed by capping.
        """
        df = df.copy()

        for col in self.IQR or []:
            if col not in df.columns:
                continue

            q1 = df[col].quantile(0.25)
            q3 = df[col].quantile(0.75)
            spread = q3 - q1
            lower = q1 - 1.5 * spread
            upper = q3 + 1.5 * spread

            # Comparisons with NaN are False, so keep missing rows explicitly
            # instead of letting the range test discard them.
            keep = df[col].between(lower, upper) | df[col].isna()
            df = df[keep]

        if self.cap and self.cap_t:
            # Sorting keeps lower <= upper even if cap_t is given as e.g. 95.
            low_p, high_p = sorted((self.cap_t, 100 - self.cap_t))

            for col in self.cap:
                if col not in df.columns:
                    continue

                observed = df[col].dropna()
                if observed.empty:
                    # No values to derive percentiles from; nothing to cap.
                    continue

                lower, upper = np.percentile(observed, [low_p, high_p])
                df[col] = df[col].clip(lower=lower, upper=upper)

        return df


In [49]:
#class Preprocess:

In [50]:
#class ExtractFeatures:

In [51]:
#class EnforceSchema:

In [52]:
class Pipeline:
    """Runs a sequence of steps, each exposing ``transform(df)``, in order."""

    def __init__(self, steps):
        """Store the ordered list of step objects."""
        self.steps = steps

    def enforce_schema(self, df, schema):
        """Return a copy of ``df`` with columns converted per ``schema``.

        Args:
            df: input DataFrame.
            schema: mapping of column name to one of ``'int'``, ``'float'``,
                ``'numeric'``, ``'date'``, ``'string'``, ``'bool'`` or
                ``'category'``. Columns missing from ``df`` and unrecognised
                type names are skipped.

        Values that cannot be parsed as numbers become missing. For ``'int'``
        a column containing missing values is returned with the nullable
        ``Int64`` dtype, because a plain integer column cannot hold NaN.
        """
        df = df.copy()

        for col, col_type in schema.items():
            if col not in df.columns:
                continue

            if col_type == 'int':
                numeric = pd.to_numeric(df[col], errors='coerce')
                if numeric.isna().any():
                    # astype(int) raises on NaN; truncate first so fractional
                    # values are handled the same way astype(int) would.
                    df[col] = np.trunc(numeric).astype('Int64')
                else:
                    df[col] = numeric.astype(int)

            elif col_type == 'float':
                df[col] = pd.to_numeric(df[col], errors='coerce').astype(float)

            elif col_type == 'numeric':
                df[col] = pd.to_numeric(df[col], errors='coerce')

            elif col_type == 'date':
                df[col] = df[col].apply(
                    lambda x: dateparser.parse(str(x))
                    if pd.notnull(x) else pd.NaT
                )

            elif col_type == 'string':
                missing = df[col].isna()
                # Mask first so None / NA do not survive as the text
                # 'None' / '<NA>'; the literal 'nan' is still treated as
                # missing, as before.
                df[col] = (
                    df[col]
                    .astype(str)
                    .mask(missing)
                    .replace('nan', np.nan)
                )

            elif col_type == 'bool':
                df[col] = (
                    df[col]
                    .astype(str)
                    .str.lower()
                    .map({'true': True, 'false': False, '1': True, '0': False})
                )

            elif col_type == 'category':
                df[col] = df[col].astype('category')

        return df

    def run(self, data, schema=None):
        """Apply the optional schema, then every step in order.

        Returns the output of the last step, or the (schema-enforced) input
        when there are no steps.
        """
        if schema is not None:
            data = self.enforce_schema(data, schema)

        for step in self.steps:
            data = step.transform(data)
        return data

In [82]:
pipeline1_station = Pipeline([
    ParseLocation(),
    DropFeatures(columns=['revision']),
    Clean(),
    RemoveDuplicates(),
])

pipeline1 = Pipeline([
    AddFromTable(data=station_cleaned,condition=['latitude', 'longitude'],columns=['station','code','continent','city']),
    Clean(),
    RemoveDuplicates(),
    DealWithNull(remove=[
                        'oximetry'],
                 replace=[
                         'SpO₂', 'HR', 'PI', 'RR', 'EtCO₂', 'FiO₂',
                         'PRV', 'BP', 'Skin Temperature', 'Motion/Activity index',
                         'PVI', 'Hb level', 'SV', 'CO', 'Blood Flow Index','PPG waveform features',
                         'Signal Quality Index', 'Respiratory effort', 'O₂ extraction ratio', 'SNR'
                 ],
                 replace_method= "mean"),
    DealWithOutliers(IQR= [],
                     cap=[
                         'SpO₂', 'HR', 'PI', 'RR', 'EtCO₂', 'FiO₂',
                         'PRV', 'BP', 'Skin Temperature', 'Motion/Activity index',
                         'PVI', 'Hb level', 'SV', 'CO', 'Blood Flow Index','PPG waveform features',
                         'Signal Quality Index', 'Respiratory effort', 'O₂ extraction ratio', 'SNR'
                     ],
                     cap_t= 5),
    #Preprocess(encode= ,transform=, scale= ),
    DropFeatures(columns=['latitude', 'longitude']),
    #SelectFeatures(column=),
    RemoveDuplicates()            
])

pipeline2 = Pipeline([
    #Clean()
    #RemoveDuplicates(),
    #DealWithNull(remove= ,replace= ,replace_method= ),
    #DealWithOutliers(IQR= ,cap= ,cap_t= ),
    #Preprocess(encode= ,transform=, scale= ),
    #DropFeatures(columns=)
    #ExtractFeatures(column=),
    #RemoveDuplicates()            
])

In [83]:
station_schema = {
    'location':'string',
    'code':'string',
    'revision':'date',
    'station':'string',
    'latitude':'float',
    'longitude':'float',
}

In [84]:
observation_schema = {
    'SpO₂':'float',
    'HR':'float',
    'PI':'float',
    'RR':'float',
    'EtCO₂':'float',
    'FiO₂':'float',
    'PRV':'float',
    'BP':'float',
    'Skin Temperature':'float',
    'Motion/Activity index':'float',
    'PVI':'float',
    'Hb level':'float',
    'SV':'float',
    'CO':'float',
    'Blood Flow Index':'float',
    'PPG waveform features':'float',
    'Signal Quality Index':'float',
    'Respiratory effort':'float',
    'O₂ extraction ratio':'float',
    'SNR':'float',
    'oximetry':'int',
    'latitude':'float',
    'longitude':'float'
}

In [85]:
observation = pd.read_csv("dataset/observation.csv", sep='\t')
station = pd.read_csv("dataset/station.csv", sep='\t')

station_cleaned = pipeline1_station.run(station, station_schema)
df_cleaned = pipeline1.run(observation, observation_schema)



In [86]:
df_cleaned.head()

Unnamed: 0,SpO₂,HR,PI,RR,EtCO₂,FiO₂,PRV,BP,Skin Temperature,Motion/Activity index,...,PPG waveform features,Signal Quality Index,Respiratory effort,O₂ extraction ratio,SNR,oximetry,station,code,continent,city
0,96.511604,70.263434,14.451123,17.461063,41.262037,78.519798,126.965235,109.471152,35.650826,11.429092,...,36.60455,69.140438,57.097309,0.210117,33.584512,1,paracho de verduzco,mx,america,mexicocity
1,98.113516,72.8729,4.699563,17.231104,40.220086,64.283914,139.509502,100.943658,35.313317,11.188645,...,61.305805,50.733704,61.220158,0.293664,30.528645,1,lutz,us,america,newyork
2,98.623248,81.418306,12.056504,16.832868,39.953184,77.164206,104.396821,107.401302,36.017931,8.980842,...,49.432273,41.841466,57.554854,0.232518,22.357337,1,frankston south,au,australia,melbourne
3,96.821905,70.263434,11.04441,14.876013,38.765113,59.296747,147.612105,106.786082,35.433515,9.952747,...,68.710875,47.524447,48.971775,0.288125,25.88619,0,port richmond,us,america,newyork
4,98.523262,70.686313,5.963887,16.933547,41.470854,66.145767,111.525074,108.354216,35.258355,10.619401,...,33.993656,60.323832,54.807359,0.294852,20.970612,1,gua musang,my,asia,kualalumpur
