This Jupyter notebook is a guide to building an EDA / pre-processing workflow.

In [None]:
# Load the required libraries

# Data-handling libraries
import numpy as np
import pandas as pd
# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
# Machine-learning libraries
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import set_config


# Random seed reused by the train/test split so results are reproducible
RSEED = 42

In [None]:
# Splitting target and features

def split_target_features(df, target="target", test_size=.3):
    """Split a DataFrame into train/test features, targets and recombined frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the feature columns together with the target column.
    target : str, default "target"
        Name of the column to separate from the features.
    test_size : float, default 0.3
        Value forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, df_train, df_test)``; the last
        two are the features and target of each split concatenated
        column-wise.
    """
    X = df.drop(columns=target)
    y = df[target]

    # Shuffle with the notebook-wide seed so repeated runs give the same split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=True, random_state=RSEED
    )
    df_train = pd.concat([X_train, y_train], axis=1)
    df_test = pd.concat([X_test, y_test], axis=1)
    # `shape` is a tuple attribute, not a method: calling it raises TypeError.
    print(f'Your data_test shape is {df_test.shape}')
    print(f'Your data_train shape is {df_train.shape}')
    return  X_train, X_test, y_train, y_test, df_train, df_test


In [None]:
def floating_time(df_train):
    """Add an ``ordinal_date`` column computed from the ``Date`` column.

    Each date is converted with ``Timestamp.toordinal()`` so it can be used
    as a plain integer feature.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame with a ``Date`` column that ``pandas.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the new column is added in place.
    """
    # `astype("datetime64")` (no unit) is rejected by pandas 2.x; to_datetime
    # is the supported way to parse strings or pass through datetimes.
    dates = pd.to_datetime(df_train["Date"])
    df_train["ordinal_date"] = dates.apply(lambda ts: ts.toordinal())
    return df_train

In [None]:
def windspeed(df_train):
    """Add a ``wind_speed`` column: the magnitude of the 10 m wind vector.

    Computed as ``sqrt(u**2 + v**2)`` from the columns
    ``u_component_of_wind_10m_above_ground`` and
    ``v_component_of_wind_10m_above_ground``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame containing both wind-component columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the new column is added in place.
    """
    u = df_train["u_component_of_wind_10m_above_ground"]
    v = df_train["v_component_of_wind_10m_above_ground"]
    # Whole-column arithmetic: same formula as a row-wise apply, without the
    # per-row Python overhead, and it also works on an empty frame.
    df_train["wind_speed"] = np.sqrt(u**2 + v**2)
    return df_train

In [None]:
def reduce_df(df_train, df_test):
    """Return the names of the training columns that are kept.

    A column name is kept when it contains none of the excluded substrings
    listed below (matching is case-sensitive).  ``df_test`` is accepted but
    not used.

    Returns
    -------
    list
        Kept column names, in the order they appear in ``df_train``.
    """
    excluded = ("angle", "sensor", "ID", "Date", "target_", "water", "_of_wind", "CH4")
    cols_short = []
    for name in df_train.columns:
        # Skip the column as soon as one excluded fragment is found in its name.
        if any(fragment in name for fragment in excluded):
            continue
        cols_short.append(name)
    return cols_short

In [None]:
def impute_df(df_train,cols_short):
    """Fill missing values in the selected columns with each column's median.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training data.
    cols_short : list
        Names of the columns to impute.

    Returns
    -------
    tuple
        ``(imputed_data, fitted_imputer)``.
    """
    # Global scikit-learn setting: request pandas output from transformers.
    # It is not restored afterwards, so it stays active for later cells.
    set_config(transform_output="pandas")
    median_imputer = SimpleImputer(strategy="median")
    selected = df_train[cols_short]
    imputed = median_imputer.fit_transform(selected)
    return imputed, median_imputer

In [None]:
# Columns that `transform_df` log-transforms when no explicit list is given.
DEFAULT_LOG_COLS = (
    "L3_CLOUD_cloud_optical_depth",
    "L3_SO2_SO2_column_number_density_amf",
    "wind_speed",
    "L3_O3_O3_effective_temperature",
    "L3_NO2_NO2_slant_column_number_density",
    "L3_O3_O3_column_number_density",
    "L3_NO2_tropospheric_NO2_column_number_density",
    "L3_NO2_NO2_column_number_density",
    "L3_CO_H2O_column_number_density",
    "L3_CLOUD_cloud_base_height",
    "L3_CLOUD_surface_albedo",
    "L3_HCHO_tropospheric_HCHO_column_number_density_amf",
    "L3_CO_CO_column_number_density",
    "specific_humidity_2m_above_ground",
    "L3_CLOUD_cloud_top_height",
)


def transform_df(df_imp, log_cols=None):
    """Replace the selected columns with their natural logarithm.

    Parameters
    ----------
    df_imp : pandas.DataFrame
        Frame containing every column named in ``log_cols``.
    log_cols : iterable of str, optional
        Columns to transform.  Defaults to ``DEFAULT_LOG_COLS``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the columns are overwritten in
        place.
    """
    # A list is required: a tuple key would be read as a single column label.
    cols = list(DEFAULT_LOG_COLS if log_cols is None else log_cols)
    if cols:
        # Vectorised log instead of the deprecated per-element `applymap`.
        df_imp[cols] = np.log(df_imp[cols])
    return df_imp
# NOTE(review): no visible cell assigns a module-level `df_imp` (the name only
# appears as a parameter/local inside functions), so on a fresh kernel this
# call presumably raises NameError — confirm where `df_imp` is created.
# The return value is discarded; `transform_df` writes the logged columns
# back onto the frame it receives.
transform_df(df_imp)

In [16]:
class columnDropTransformer():
    """Pipeline step that removes columns whose names contain given substrings.

    Parameters
    ----------
    ex_str : iterable of str
        Substrings; a column is removed when its name contains any of them
        (case-sensitive match).
    """

    def __init__(self, ex_str):
        self.ex_str = ex_str

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` restricted to the columns that match no substring."""
        retained = []
        for name in X.columns:
            # Check the name against every excluded substring.
            hits = [fragment in name for fragment in self.ex_str]
            if not any(hits):
                retained.append(name)
        return X[retained]

In [17]:
class ordinalDateTransformer():
    """Pipeline step that adds an ``ordinal_date`` column derived from ``Date``.

    Each date is converted with ``Timestamp.toordinal()``.  The input frame
    is left untouched; ``transform`` returns a new DataFrame.
    """

    def __init__(self):
        pass

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the extra ``ordinal_date`` column."""
        dates = pd.to_datetime(X["Date"])
        # `assign` builds a new frame, so the caller's data is not modified
        # as a side effect of running the pipeline.
        return X.assign(ordinal_date=dates.apply(lambda ts: ts.toordinal()))

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

In [18]:
class windTransformer():
    """Pipeline step that replaces two wind components by their magnitude.

    Parameters
    ----------
    columns : list of str
        Column names; the first two entries are used as the wind components
        and every listed column is removed from the output.
    """

    def __init__(self, columns):
        self.columns=columns

    def transform(self, X, y=None):
        """Return a new frame with ``wind_speed`` added and ``columns`` dropped.

        ``wind_speed`` is ``sqrt(u**2 + v**2)`` of the first two configured
        columns.  The input frame is not modified.
        """
        u = X[self.columns[0]]
        v = X[self.columns[1]]
        # Whole-column arithmetic instead of a row-wise apply; `assign`
        # returns a new frame so the caller's data keeps its original columns.
        return X.assign(wind_speed=np.sqrt(u**2 + v**2)).drop(columns=self.columns)

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import numpy as np

In [23]:
# The two wind-component columns that `windTransformer` combines into
# `wind_speed` and then drops.
wind_cols = ["u_component_of_wind_10m_above_ground", "v_component_of_wind_10m_above_ground"]
# Substrings identifying columns removed by the `drop_cols` step.  Matching is
# case-sensitive, so the generated `ordinal_date` column is not hit by "Date".
ex_str = ["angle", "sensor", "ID", "Date", "target_", "water", "_of_wind", "CH4"]

# Pre-processing pipeline; steps run in the order listed.
preprocessor = Pipeline([
    # Adds `ordinal_date` from the `Date` column (must run before `Date` is dropped).
    ("ordinal_date", ordinalDateTransformer()),
    # Adds `wind_speed` and removes the two raw wind components.
    ("wind_transform", windTransformer(wind_cols)),
    # Removes every column whose name contains one of `ex_str`.
    ("drop_cols", columnDropTransformer(ex_str)),
    # Fills remaining missing values with the per-column median.
    ("impute", SimpleImputer(strategy="median")),
    # NOTE(review): despite the step name this applies np.log (natural log),
    # not a logit.  No column selection is given, so it is applied to the
    # whole imputer output; zero or negative values would become -inf / NaN.
    # `transform_df` logs only a chosen subset of columns — confirm that
    # logging everything here is intended.
    ("logit", FunctionTransformer(np.log)),
])