In [1]:
import pandas as pd
import numpy as np
import warnings
from math import sqrt
warnings.filterwarnings('ignore')
import pickle

from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('weather_dataset_raw.csv')



df.head()


Unnamed: 0,S_No,Timestamp,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Weather_conditions
0,0,2006-04-01 00:00:00+02:00,"Port of Turku, Finland",9.472222,7.388889,0.89,14.1197,251,15.8263,1015.13,rain
1,1,2006-04-01 01:00:00+02:00,"Port of Turku, Finland",9.355556,7.227778,0.86,14.2646,259,15.8263,1015.63,rain
2,2,2006-04-01 02:00:00+02:00,"Port of Turku, Finland",9.377778,9.377778,0.89,3.9284,204,14.9569,1015.94,rain
3,3,2006-04-01 03:00:00+02:00,"Port of Turku, Finland",8.288889,5.944444,0.83,14.1036,269,15.8263,1016.41,
4,4,2006-04-01 04:00:00+02:00,"Port of Turku, Finland",8.755556,6.977778,0.83,11.0446,259,15.8263,1016.51,rain


In [None]:
class DataFrameFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, target_column):
        self.target_column = target_column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        data = X.copy()
        col_name = "pass__"+self.target_column
        data_features = data.drop(columns=[col_name, 'pass__Timestamp'])
        y_target = data[col_name]
        return data_features, y_target
    
    

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import TransformedTargetRegressor
from sklearn.base import BaseEstimator, TransformerMixin

class OneHotEncoderDataFrame(BaseEstimator, TransformerMixin):
    def __init__(self, **kwargs):
        self.encoder = OneHotEncoder(**kwargs)
        self.column_names = None

    def fit(self, X, y=None):
        self.encoder.fit(X)
        self.column_names = self.encoder.get_feature_names_out(X.columns)
        return self

    def transform(self, X):
        return pd.DataFrame(self.encoder.transform(X).toarray(), columns=self.column_names)
    
    def get_feature_names_out(self):
        return self.column_names
    
class TransformerToDataFrame(BaseEstimator, TransformerMixin):
    def __init__(self, base_transformer):
        self.base_transformer = base_transformer
        self.column_names = None  # Initialize column names as None

    def fit(self, X, y=None):
        self.base_transformer.fit(X, y)
        # Capture the column names during fitting
        self.column_names = X.columns.tolist()
        return self

    def transform(self, X):
        # Apply the transformation
        X_transformed = self.base_transformer.transform(X)

        # Convert the transformed array back to a DataFrame
        return pd.DataFrame(X_transformed, columns=self.column_names, index=X.index)
    
    def get_feature_names_out(self):
        return self.column_names


    
    
class DataFrameColumnTransformer(ColumnTransformer):
    def transform(self, X):
        # Transform the data using the original ColumnTransformer
        X_array = super().transform(X)

        # Get the output feature names
        feature_names = self.get_feature_names_out()

        # Convert the array to a DataFrame
        return pd.DataFrame(X_array, columns=feature_names, index=X.index)

    def fit_transform(self, X, y=None):
        # Fit and transform the data and convert to DataFrame in one step
        X_array = super().fit_transform(X, y)

        # Get the output feature names
        feature_names = self.get_feature_names_out()

        # Convert the array to a DataFrame
        return pd.DataFrame(X_array, columns=feature_names, index=X.index)
    
class SimpleCustomPipeline(Pipeline):
    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation."""
        # Check if the pipeline has a final step that is a transformer
        if hasattr(self.steps[-1][1], 'get_feature_names_out'):
            # If the last step is a transformer with the method 'get_feature_names_out'
            return self.steps[-1][1].get_feature_names_out()
        else:
            raise AttributeError("The last step of the pipeline does not support 'get_feature_names_out'.")

            
def format_date(data):
    # Convert the 'Timestamp' column to datetime
    data['Timestamp'] = pd.to_datetime(data['Timestamp'], utc=True)

    # Extracting the relevant components directly from the datetime object
    data["Year"] = data['Timestamp'].dt.year
    data["Month"] = data['Timestamp'].dt.month
    data["Day"] = data['Timestamp'].dt.day
    data["Hour"] = data['Timestamp'].dt.hour  # Extracting just the hour

    return data
            
class DateTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        # This transformer does not need to learn anything from the data,
        # so the fit method just returns self.
        self.column_names = X.columns.tolist() + ["Year", "Month", "Day", "Hour"]
        return self

    def transform(self, X):
        data = X.copy()
        # Check if X is a DataFrame
        if not isinstance(data, pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame")

        # Ensure 'Timestamp' column is present
        if 'Timestamp' not in data.columns:
            raise ValueError("DataFrame must contain 'Timestamp' column for DateTransformer")

        # Convert 'Timestamp' to datetime and extract components
        data = format_date(data)

        return data
    
    def get_feature_names_out(self):
        return self.column_names
    
    
class StepTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, hours:int, feature_name:str, new_feature_name:str):
        self.hours = hours
        self.feature_name = feature_name
        self.new_feature_name =new_feature_name 

    def fit(self, X, y=None):
        # This transformer does not need to learn anything from the data,
        # so the fit method just returns self.
        return self

    def transform(self, X):
        data = X.copy()
        # Check if X is a DataFrame
        if not isinstance(data, pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame")
            
        time_stamp_name = 'Timestamp'
        feature_name = self.feature_name


        # Ensure 'Timestamp' column is present

        if time_stamp_name not in data.columns:
            raise ValueError(f"DataFrame must contain {time_stamp_name} column for StepTransformer")

        # Convert 'Timestamp' to datetime if not already
        data[time_stamp_name] = pd.to_datetime(data[time_stamp_name], utc=True)

        # Compare current timestamp with the one 'steps' ahead
        future = data.shift(-self.hours)
        return data[feature_name]

    
class WeatherConditionTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, feature_name, new_feature_name):
        self.label_encoder = LabelEncoder()
        self.feature_name = feature_name
        self.new_feature_name = new_feature_name
        self.no_rain_definition = {"snow": "no_rain", "clear": "no_rain"}

    def fit(self, X, y=None):
        data = X.copy()
        # Check if X is a DataFrame
        if not isinstance(data , pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame")

        # Ensure 'Weather_conditions' column is present
        if self.feature_name not in data.columns:
            raise ValueError(f"DataFrame must contain {self.feature_name} column")

        # Fitting the LabelEncoder
        data[self.feature_name].fillna(method='ffill', inplace=True)
        data[self.feature_name].replace(self.no_rain_definition, inplace=True)
        self.label_encoder.fit(data[self.feature_name])

        return self

    def transform(self, X):
        # Performing the transformation
        data = X.copy()
        data[self.feature_name].fillna(method='ffill', inplace=True)
        data[self.feature_name].replace(self.no_rain_definition, inplace=True)
        encoded_weather = self.label_encoder.transform(data[self.feature_name])
        data[self.new_feature_name] = encoded_weather
        data.drop([self.feature_name], axis=1, inplace=True)

        return data
    
class RemoveNaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X=None, y=None):
        return self

    def transform(self, X):
        # Performing the transformation
        data = X.copy()
        data.dropna(inplace=True)

        return data
    
class RemoveNoFuture(BaseEstimator, TransformerMixin):
    
    def __init__(self, hours:int):
        self.hours = hours
        
    def fit(self, X, y=None):
        # This transformer does not need to learn anything from the data,
        # so the fit method just returns self.
        return self
    
    def transform(self, X):
        data = X.copy()
        # Check if X is a DataFrame
        if not isinstance(data, pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame")
            
        time_stamp_name = 'Timestamp'


        # Ensure 'Timestamp' column is present
        if time_stamp_name not in data.columns:
            raise ValueError(f"DataFrame must contain {time_stamp_name} column")

        # Convert 'Timestamp' to datetime if not already
        data[time_stamp_name] = pd.to_datetime(data[time_stamp_name], utc=True)

        # Compare current timestamp with the one 'steps' ahead
        future = data.shift(-self.hours)
        
        is_future_exist = (future[time_stamp_name] - data[time_stamp_name]) == pd.Timedelta(hours=self.hours)
        data = data[is_future_exist]
        

        return data
    
    

In [4]:



categorical_cols = ['Current_weather_condition', 'Month']
numerical_cols = ['Temperature_C', 'Apparent_Temperature_C', 'Humidity', 'Wind_speed_kmph', 'Wind_bearing_degrees', 'Visibility_km', 'Pressure_millibars']

# For one-hot encoding of categorical columns
categorical_transformer = SimpleCustomPipeline([
    ('imputer', TransformerToDataFrame(SimpleImputer(strategy='most_frequent'))),  # Handle missing values if any
    ('onehot', OneHotEncoderDataFrame(handle_unknown='ignore'))
])

# For scaling numerical columns
numerical_transformer = SimpleCustomPipeline([
    ('imputer', TransformerToDataFrame(SimpleImputer(strategy='mean'))),  # Handle missing values if any
    ('scaler', TransformerToDataFrame(StandardScaler()))
])

date_transformer = DateTransformer()

target_transformer = Pipeline([
    ('weather', WeatherConditionTransformer('Weather_conditions', 'Current_weather_condition')),
    ('step', StepTransformer(4, 'Current_weather_condition', 'Future_weather_condition'))
])


preprocessor =  DataFrameColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols),
    ])

training_data_filter = Pipeline([
    ('remove_na', RemoveNaTransformer()),
    ('remove_no_futur',RemoveNoFuture(hours=4))
])


In [5]:
final_preprocessor = SimpleCustomPipeline([
        ('time', date_transformer),
        ('weather', WeatherConditionTransformer('Weather_conditions', 'Current_weather_condition')),
        ('basic', preprocessor),
    ])

In [6]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
models = {
    'RandomForest': {
        'model': RandomForestClassifier(),
        'param_grid': {
            'model__n_estimators': [5, 10],
            'model__max_depth': [None, 5, 10]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(),
        'param_grid': {
            'model__n_estimators': [50, 100],
            'model__learning_rate': [0.1, 0.01]
        }
    }
}

In [7]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'], utc=True)
df.sort_values(by='Timestamp', ascending=True, inplace=True)
df_training = df.iloc[:77160]
df_training = training_data_filter.fit_transform(df_training)


In [9]:
final_preprocessor

In [10]:

for model_name, model_config in models.items():
    model = model_config['model']
    param_grid = model_config['param_grid']
    
    y = target_transformer.fit_transform(df_training)
    
    
    # Create a pipeline for the current model
    model_pipeline = Pipeline([
        ('preprocessor', final_preprocessor),
        ('model', model)
    ])
    
    # Create a GridSearchCV object for the current model
    grid_search = GridSearchCV(model_pipeline, param_grid, cv=5)
    
    # Fit the GridSearchCV object to your training data
    grid_search.fit(df_training, y)
    print(grid_search.best_estimator_)

Pipeline(steps=[('preprocessor',
                 SimpleCustomPipeline(steps=[('time', DateTransformer()),
                                             ('weather',
                                              WeatherConditionTransformer(feature_name='Weather_conditions',
                                                                          new_feature_name='Current_weather_condition')),
                                             ('basic',
                                              DataFrameColumnTransformer(transformers=[('cat',
                                                                                        SimpleCustomPipeline(steps=[('imputer',
                                                                                                                     TransformerToDataFrame(base_transformer=SimpleI...
                                                                                        SimpleCustomPipeline(steps=[('imputer',
                               

In [None]:
param_grid = {
    'model__n_estimators': [100, 200],  # Example hyperparameters for RandomForestClassifier
    'model__max_depth': [None, 10, 20]
}

pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Apply transformations
    ('model', model)  # Apply the model to the preprocessed data
])

grid_search = GridSearchCV(pipeline, param_grid, cv=5)

grid_search.fit(X_train, y_train)

In [40]:
drop_feature_input =["Timestamp"]




df = formate_date(df)
df = df.drop(columns=drop_feature_input, axis=1)

# Spliting Pre-Processed data into Training and Validation datasets

In [41]:
# Validation set is used later to evaluate model performance post training. 

In [42]:
df_training = df.iloc[:77160]

In [43]:
df_training.shape

(77160, 15)

In [44]:
df_validation = df.drop(df_training.index)

In [45]:
df_validation.shape

(19289, 15)

# Registering Training and Validation data to the datastore on the workspace. 

In [9]:
!mkdir Data

Un sous-r‚pertoire ou un fichier Data existe d‚j….


In [10]:
df_training.to_csv('Data/training_data.csv', index=False)

In [11]:
df_validation.to_csv('Data/validation_data.csv', index=False)

# Data ingestion step - Training dataset

In [46]:
df_training.head()

Unnamed: 0,S_No,Location,Temperature_C,Apparent_Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Current_weather_condition,Future_weather_condition,Time,Year,Month,Day
4,4,"Port of Turku, Finland",8.755556,6.977778,0.83,11.0446,259,15.8263,1016.51,1,1,2,2006,4,1
5,5,"Port of Turku, Finland",9.222222,7.111111,0.85,13.9587,258,14.9569,1016.66,1,1,3,2006,4,1
6,6,"Port of Turku, Finland",7.733333,5.522222,0.95,12.3648,259,9.982,1016.72,1,1,4,2006,4,1
7,7,"Port of Turku, Finland",8.772222,6.527778,0.89,14.1519,260,9.982,1016.84,1,1,5,2006,4,1
8,8,"Port of Turku, Finland",10.822222,10.822222,0.82,11.3183,259,9.982,1017.37,1,1,6,2006,4,1


In [47]:
df_training.shape

(77160, 15)

#### Feature Selection and scaling

In [48]:
X = df_training[['Temperature_C', 'Humidity', 'Wind_speed_kmph', 'Wind_bearing_degrees', 'Visibility_km', 'Pressure_millibars', 'Current_weather_condition']].values
y = df_training['Future_weather_condition'].values
y

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [49]:
# Splitting the Training dataset into Train and Test set for ML training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [50]:
sc = StandardScaler()

In [51]:
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Model training and Testing Step

## 1. Support Vector Machine

In [61]:
parameters = {'kernel':['linear'], 'C':[1, 10]}

In [62]:
svc = svm.SVC()

In [63]:
svc_grid = GridSearchCV(svc, parameters)

In [64]:
%%time
svc_grid.fit(X_train, y_train)

CPU times: total: 4min 26s
Wall time: 4min 27s


In [65]:
svc_grid.get_params(deep=True)

{'cv': None,
 'error_score': nan,
 'estimator__C': 1.0,
 'estimator__break_ties': False,
 'estimator__cache_size': 200,
 'estimator__class_weight': None,
 'estimator__coef0': 0.0,
 'estimator__decision_function_shape': 'ovr',
 'estimator__degree': 3,
 'estimator__gamma': 'scale',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__probability': False,
 'estimator__random_state': None,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'estimator': SVC(),
 'n_jobs': None,
 'param_grid': {'kernel': ['linear'], 'C': [1, 10]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': None,
 'verbose': 0}

In [66]:
svc = SVC(C=svc_grid.get_params(deep=True)['estimator__C'], kernel=svc_grid.get_params(deep=True)['estimator__kernel'])

In [67]:
svc.fit(X_train, y_train)

In [68]:
predicted_svc = svc.predict(X_test)

In [69]:
acc = accuracy_score(y_test, predicted_svc)
fscore = f1_score(y_test, predicted_svc, average="macro")
precision = precision_score(y_test, predicted_svc, average="macro")
recall = recall_score(y_test, predicted_svc, average="macro")
print(f"Test accuracy :{acc}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-Score: {fscore}")

Test accuracy :0.9519180922757906
Precision: 0.8869828453699851
Recall: 0.8859050416892464
F-Score: 0.8864428755463128


## Random Forest classifier 

In [70]:
rf = RandomForestClassifier(max_depth=10, random_state=0, n_estimators=100)

In [71]:
%%time
rf.fit(X_train, y_train)

CPU times: total: 4.78 s
Wall time: 4.77 s


In [72]:
%%time
rf.fit(X_train, y_train)

CPU times: total: 4.73 s
Wall time: 4.75 s


In [73]:
predicted_rf = rf.predict(X_test)

In [74]:
acc = accuracy_score(y_test, predicted_rf)
fscore = f1_score(y_test, predicted_rf, average="macro")
precision = precision_score(y_test, predicted_rf, average="macro")
recall = recall_score(y_test, predicted_rf, average="macro")
print(f"Test accuracy: {acc}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-Score: {fscore}")

Test accuracy: 0.9548989113530326
Precision: 0.9018705246237031
Recall: 0.8804084310202218
F-Score: 0.8907272822498857


# Model Packaging Step

In [33]:
# Convert into SVC model into ONNX format file
initial_type = [('float_input', FloatTensorType([None, 6]))]
onx = convert_sklearn(svc, initial_types=initial_type)
with open("outputs/svc.onnx", "wb") as f:
    f.write(onx.SerializeToString())

In [34]:
# Convert into RF model into ONNX format file
initial_type = [('float_input', FloatTensorType([None, 6]))]
onx = convert_sklearn(rf, initial_types=initial_type)
with open("outputs/rf.onnx", "wb") as f:
    f.write(onx.SerializeToString())

# Save model artefacts

In [35]:
with open('./outputs/scaler.pkl', 'wb') as scaler_pkl:
    pickle.dump(sc, scaler_pkl)