In [2]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

data = pd.read_csv('preprocessed.csv')

In [3]:
# Define cities and temperature types
cities = ['DRESDEN', 'KASSEL', 'DUSSELDORF']
temp_types = ['temp_mean', 'temp_max', 'temp_min']

# Dictionary to store models
models = {}

In [9]:
for city in cities:
    for temp_type in temp_types:
        # Prepare features and target
        features = data.drop(columns=['DATE'] + [f'{city}_{t}' for t in temp_types])
        target = data[f'{city}_{temp_type}']
        
        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
        
        grid = [
            {'n_estimators': [1, 10, 100, 1000], 
            'learning_rate': [0.001, 0.01, 0.1], 
            'subsample_for_bin': [200, 2000, 20000, 200000],
            'num_leaves': [10, 15, 20, 30, 45]},
            ]
        # Create and train the LightGBM model
        model = lgb.LGBMRegressor(n_estimators = 1000, learning_rate=0.1, subsample_for_bin=200000, num_leaves=12)
        model.fit(X_train, y_train)
        
        # Store the model
        models[f'{city}_{temp_type}'] = model
        
        # Predictions and evaluations
        predictions = model.predict(X_test)
        
        # Calculate and print mean squared error
        mse = mean_squared_error(y_test, predictions)
        print(f'Mean Squared Error for {city} {temp_type}: {mse}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000790 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4031
[LightGBM] [Info] Number of data points in the train set: 2923, number of used features: 24
[LightGBM] [Info] Start training from score 9.821587
Mean Squared Error for DRESDEN temp_mean: 2.3945900848391295
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000571 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4031
[LightGBM] [Info] Number of data points in the train set: 2923, number of used features: 24
[LightGBM] [Info] Start training from score 13.692268
Mean Squared Error for DRESDEN temp_max: 3.4981556332106933
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000510 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4031


In [32]:
# Creating new data to test on custom input
new_data = {
    'DRESDEN_wind_speed': [3.5],
    'DRESDEN_wind_gust': [10.2],
    'DRESDEN_humidity': [0.85],
    'DRESDEN_global_radiation': [0.15],
    'DRESDEN_precipitation': [0.00],
    'DRESDEN_sunshine': [5.0],
    'DUSSELDORF_temp_mean': [10.0], 
    'DUSSELDORF_temp_max': [15.0],   
    'DUSSELDORF_temp_min': [5.0],   
    'DUSSELDORF_wind_speed': [4.0],
    'DUSSELDORF_wind_gust': [12.5],
    'DUSSELDORF_humidity': [0.80],
    'DUSSELDORF_global_radiation': [0.20],
    'DUSSELDORF_precipitation': [0.05],
    'DUSSELDORF_sunshine': [6.0],
    'KASSEL_temp_mean': [9.0],     
    'KASSEL_temp_max': [14.0],     
    'KASSEL_temp_min': [4.0],   
    'KASSEL_wind_speed': [2.8],
    'KASSEL_wind_gust': [11.0],
    'KASSEL_humidity': [0.88],
    'KASSEL_global_radiation': [0.18],
    'KASSEL_precipitation': [0.02],
    'KASSEL_sunshine': [4.5]
}


new_features = pd.DataFrame(new_data)

In [33]:
# Predict and print results for each city and temperature type
for city in cities:
    for temp_type in temp_types:
        model = models[f'{city}_{temp_type}']  # Retrieve the model
        predicted_temp = model.predict(new_features)
        print(f"Predicted {temp_type} for {city}: {predicted_temp[0]}")


Predicted temp_mean for DRESDEN: 9.024635003796599
Predicted temp_max for DRESDEN: 13.014866038809078
Predicted temp_min for DRESDEN: 6.059938809650521
Predicted temp_mean for KASSEL: 3.848618371312498
Predicted temp_max for KASSEL: 8.650193518588503
Predicted temp_min for KASSEL: 0.5373902165783453
Predicted temp_mean for DUSSELDORF: 10.751389483831108
Predicted temp_max for DUSSELDORF: 15.624842273030392
Predicted temp_min for DUSSELDORF: 5.26486236605847


test_size closer to 1 creates a larger MSE, which is not good. 

smaller subsample_for_data creates a larger MSE

changing n_estimators does not seem to drastically change the MSE

giving max_depth a limit like 3 makes MSE higher

changing boosting_type either gave an error or didn't change the MSE drastically.
