In [11]:
import pandas as pd
weatherData = pd.read_csv('2020-2023/daily_data_combined_2020_to_2023.csv')
weatherData_2010_2019 = pd.read_csv('2010-2019/daily_data_combined_2010_to_2019.csv')

In [14]:
print(weatherData.columns)
print(weatherData_2010_2019.columns)
columns_not_in_weatherData = set(weatherData_2010_2019.columns) - set(weatherData.columns)
print(columns_not_in_weatherData)

Index(['city_name', 'datetime', 'weather_code', 'temperature_2m_max',
       'temperature_2m_min', 'temperature_2m_mean', 'apparent_temperature_max',
       'apparent_temperature_min', 'apparent_temperature_mean', 'sunrise',
       'sunset', 'daylight_duration', 'sunshine_duration', 'precipitation_sum',
       'rain_sum', 'snowfall_sum', 'precipitation_hours', 'wind_speed_10m_max',
       'wind_gusts_10m_max', 'wind_direction_10m_dominant',
       'shortwave_radiation_sum', 'et0_fao_evapotranspiration'],
      dtype='object')
Index(['city_name', 'latitude', 'longitude', 'datetime', 'weather_code',
       'temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean',
       'apparent_temperature_max', 'apparent_temperature_min',
       'apparent_temperature_mean', 'sunrise', 'sunset', 'daylight_duration',
       'sunshine_duration', 'precipitation_sum', 'rain_sum', 'snowfall_sum',
       'precipitation_hours', 'wind_speed_10m_max', 'wind_gusts_10m_max',
       'wind_direction_10m_do

In [16]:
cities = pd.read_csv('2020-2023/cities.csv')
# Merge the weatherData with cities to include latitude and longitude
weatherData = weatherData.merge(cities, on='city_name', how='left')



In [None]:
# Convert the datetime column to a datetime object
weatherData['datetime'] = pd.to_datetime(weatherData['datetime'])



In [18]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

# Function to convert time to minutes since midnight
def time_to_minutes(time_str):
    time_obj = pd.to_datetime(time_str, format='%Y-%m-%dT%H:%M', errors='coerce')
    return time_obj.dt.hour * 60 + time_obj.dt.minute

# Convert sunrise and sunset columns to numerical values
weatherData['sunrise'] = time_to_minutes(weatherData['sunrise'])
weatherData['sunset'] = time_to_minutes(weatherData['sunset'])

# Separate features and label
X = weatherData.drop(columns=['weather_code'])
y = weatherData['weather_code']

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Preprocessing for numerical data: impute missing values and scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: impute missing values and one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create a preprocessing and training pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier())
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

# Print data types per column
print("Data types per column:")
print(weatherData.dtypes)

              precision    recall  f1-score   support

         0.0       1.00      0.01      0.02       103
         1.0       0.64      0.44      0.52      1294
         2.0       0.50      0.05      0.09      1150
         3.0       0.70      0.72      0.71      1380
        51.0       0.76      0.96      0.84     10178
        53.0       0.68      0.29      0.41      6111
        55.0       0.00      0.00      0.00      2409
        61.0       0.34      0.36      0.35      6533
        63.0       0.57      0.84      0.68     11527
        65.0       0.73      0.52      0.61      2853

    accuracy                           0.61     43538
   macro avg       0.59      0.42      0.42     43538
weighted avg       0.58      0.61      0.57     43538

Data types per column:
city_name                       object
datetime                        object
weather_code                   float64
temperature_2m_max             float64
temperature_2m_min             float64
temperature_2m_mean    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import joblib
import numpy as np
import os
import matplotlib.pyplot as plt

# Load the models from pcaModels
# Load all models from the pcaModels folder

pcaModels = []
models_folder = 'pcaModels'
for model_file in os.listdir(models_folder):
    if model_file.endswith('.joblib'):
        model_path = os.path.join(models_folder, model_file)
        pcaModels.append(joblib.load(model_path))
        print(f"Loaded model from {model_path}")

for i, model in enumerate(pcaModels):
    if isinstance(model, Pipeline):
        preprocessor = model.named_steps['preprocessor']
        num_features = preprocessor.transformers_[0][2]
        cat_features = preprocessor.transformers_[1][2]
        all_features = list(num_features) + list(preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(cat_features))
        print(f"Features for model {i+1}: {all_features}")
    else:
        print(f"Model {i+1} is not a Pipeline.")

        # Since none of the models are pipelines, we will just print the model types
        for i, model in enumerate(pcaModels):
            print(f"Model {i+1} is of type {type(model)}")


Loaded model from pcaModels\GradientBoostingClassifier.joblib
Loaded model from pcaModels\KNeighborsClassifier.joblib
Loaded model from pcaModels\LinearSVC.joblib
Loaded model from pcaModels\pca.joblib
Loaded model from pcaModels\preprocessor.joblib
Loaded model from pcaModels\RandomForestClassifier.joblib
Loaded model from pcaModels\SGDClassifier.joblib
Loaded model from pcaModels\SVC.joblib
Model 1 is not a Pipeline.
Model 1 is of type <class 'sklearn.ensemble._gb.GradientBoostingClassifier'>
Model 2 is of type <class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Model 3 is of type <class 'sklearn.svm._classes.LinearSVC'>
Model 4 is of type <class 'sklearn.decomposition._pca.PCA'>
Model 5 is of type <class 'sklearn.compose._column_transformer.ColumnTransformer'>
Model 6 is of type <class 'sklearn.ensemble._forest.RandomForestClassifier'>
Model 7 is of type <class 'sklearn.linear_model._stochastic_gradient.SGDClassifier'>
Model 8 is of type <class 'sklearn.svm._classes.SVC

In [25]:
# Convert each model in pcaModels to a pipeline
pipelineModels = []
for model in pcaModels:
    if not isinstance(model, Pipeline):
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
        pipelineModels.append(pipeline)
    else:
        pipelineModels.append(model)

# Train each pipeline model
for pipeline in pipelineModels:
    pipeline.fit(X_train, y_train)

# Make predictions with each pipeline model
pipeline_predictions = [pipeline.predict(X_test.drop(columns=['city_name', 'datetime'])) for pipeline in pipelineModels]

# Calculate the deviations for each pipeline model
pipeline_deviations = [np.abs(pred - y_test.values) for pred in pipeline_predictions]

# Print the deviations
for i, deviation in enumerate(pipeline_deviations):
    print(f"Deviations for pipeline model {i+1}: {deviation}")

# Plot the deviations for each pipeline model
plt.figure(figsize=(10, 6))
for i, deviation in enumerate(pipeline_deviations):
    plt.plot(deviation, label=f'Pipeline Model {i+1}')

plt.xlabel('Sample Index')
plt.ylabel('Deviation')
plt.title('Deviations of Predictions from Actual Weather Code')
plt.legend()
plt.show()

ValueError: Specifying the columns using strings is only supported for dataframes.

In [None]:


# Extract the features for prediction (excluding the target variable 'weather_code')
columns_to_include = ['temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean', 'apparent_temperature_max', 
                      'apparent_temperature_min', 'apparent_temperature_mean', 'sunrise', 'sunset', 'daylight_duration', 
                      'sunshine_duration', 'precipitation_sum', 'rain_sum', 'snowfall_sum', 'precipitation_hours', 
                      'wind_speed_10m_max', 'wind_gusts_10m_max', 'wind_direction_10m_dominant', 'shortwave_radiation_sum', 
                      'et0_fao_evapotranspiration']

# Filter columns that exist in weatherData
columns_to_include = [col for col in columns_to_include if col in weatherData.columns]

# Fill missing values with 0
features = weatherData[columns_to_include].fillna(0)

# Predict the weather_code using each model in pcaModels
predictions = [model.predict(features) for model in pcaModels]

# Calculate the deviations for each model
deviations = [np.abs(pred - weatherData['weather_code'].values) for pred in predictions]

# Print the deviations
for i, deviation in enumerate(deviations):
    print(f"Deviations for model {i+1}: {deviation}")

# Plot the deviations for each model
plt.figure(figsize=(10, 6))
for i, deviation in enumerate(deviations):
    plt.plot(deviation, label=f'Model {i+1}')

plt.xlabel('Sample Index')
plt.ylabel('Deviation')
plt.title('Deviations of Predictions from Actual Weather Code')
plt.legend()
plt.show()

Loaded model from pcaModels\GradientBoostingClassifier.joblib
Loaded model from pcaModels\KNeighborsClassifier.joblib
Loaded model from pcaModels\LinearSVC.joblib
Loaded model from pcaModels\pca.joblib
Loaded model from pcaModels\preprocessor.joblib
Loaded model from pcaModels\RandomForestClassifier.joblib
Loaded model from pcaModels\SGDClassifier.joblib
Loaded model from pcaModels\SVC.joblib




ValueError: X has 19 features, but GradientBoostingClassifier is expecting 10 features as input.