In [27]:
# Importing necessary libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [20]:
def convertDateTimeToComponents(data, column):
    """Expand a datetime column into calendar feature columns.

    Adds (or overwrites) the columns ``year``, ``month``, ``day``, ``hour``
    and ``season`` on ``data`` itself.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend. It is modified in place.
    column : str
        Name of a datetime-typed column in ``data`` (the ``.dt`` accessor is
        used on it).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenience.

    Notes
    -----
    ``season`` is an integer in 1..4 that groups months into three-month
    blocks: 1 = Dec/Jan/Feb, 2 = Mar/Apr/May, 3 = Jun/Jul/Aug,
    4 = Sep/Oct/Nov.
    """
    stamps = data[column].dt
    data["year"] = stamps.year
    data["month"] = stamps.month
    data["day"] = stamps.day
    data["hour"] = stamps.hour
    # `month % 12` maps December to 0 so it lands in the same block as
    # January and February. The previous `month & 12 + 3` parsed as
    # `month & 15` (a no-op for 1..12) and put December in a fifth season.
    data["season"] = (data["month"] % 12 + 3) // 3
    return data

def prepareData(data):
    """Clean a raw frame in place and replace its timestamp with calendar features.

    Steps: drop the ``id`` column, parse ``valid_time`` as datetime, derive
    the calendar columns via ``convertDateTimeToComponents``, then drop
    ``valid_time``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns ``id`` and ``valid_time``.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The prepared frame, so that ``df = prepareData(df)`` also works.
        Existing callers that ignore the return value still see the
        in-place changes.

    Raises
    ------
    KeyError
        If ``id`` or ``valid_time`` is missing, e.g. when the function is
        run a second time on an already-prepared frame.
    """
    data.drop(["id"], axis=1, inplace=True)
    # Convert 'valid_time' to datetime so the .dt accessor can be used on it.
    data["valid_time"] = pd.to_datetime(data["valid_time"])
    data = convertDateTimeToComponents(data, "valid_time")
    # The raw timestamp is no longer needed once its components exist.
    data.drop(["valid_time"], axis=1, inplace=True)
    return data
    
# Training split: read the raw CSV, then clean it in place.
era5_train = pd.read_csv("train.csv", sep=",")
prepareData(era5_train)

# Test split: identical treatment.
# prepareData is called for its in-place effect only; the trailing ';' keeps
# the notebook from echoing a value for the cell's last statement.
era5_test = pd.read_csv("test.csv", sep=",")
prepareData(era5_test);

In [25]:
# All column names of the prepared training frame.
attributes = era5_train.columns.tolist()

# Columns that will be one-hot encoded.
one_hot_attributes = ["year", "month", "day", "hour", "season", "ptype"]

# Everything else is scaled numerically, except "t2m", which encodeData
# separates out as the target.
numerical_attributes = [
    name
    for name in attributes
    if name != "t2m" and name not in one_hot_attributes
]

print(one_hot_attributes)
print(numerical_attributes)

['year', 'month', 'day', 'hour', 'season', 'ptype']
['latitude', 'longitude', 'tp', 'u10', 'v10', 'sp', 'u100', 'v100', 'tcc']


In [29]:
def encodeData(data, one_hot_attributes, numerical_attributes):
    """Split off the ``t2m`` target and encode the remaining features.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a ``t2m`` column plus every column named in the
        two attribute lists.
    one_hot_attributes : list of str
        Columns imputed with their most frequent value, then one-hot encoded.
    numerical_attributes : list of str
        Columns imputed with their median, then standardised.

    Returns
    -------
    tuple
        ``(x_prepared, y)``: the output of ``ColumnTransformer.fit_transform``
        on the features, and the ``t2m`` column of ``data``.

    Notes
    -----
    A new transformer is built and fitted on every call, so encoding a second
    frame learns its own categories, medians and scaling instead of reusing
    those of the first.
    """
    features = data.drop(columns="t2m")
    target = data["t2m"]

    # Categorical branch: fill gaps with the mode, then one-hot encode.
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder()),
    ]

    # Numerical branch: fill gaps with the median, then standardise.
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat-onehot", Pipeline(steps=categorical_steps), one_hot_attributes),
            ("numerical", Pipeline(steps=numeric_steps), numerical_attributes),
        ]
    )

    return preprocessor.fit_transform(features), target

# Encode the training split into a feature matrix and the "t2m" target.
x_train, y_train = encodeData(
    data=era5_train,
    one_hot_attributes=one_hot_attributes,
    numerical_attributes=numerical_attributes,
)
# NOTE(review): encoding of the test split is disabled. encodeData requires a
# "t2m" column and refits its transformer on whatever frame it receives.
#x_test, y_test = encodeData(era5_test, one_hot_attributes, numerical_attributes)