In [1]:
import pickle
from sklearn.base import is_regressor
import pandas as pd
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient
import os

In [2]:
# Azure Blob Storage configuration: the storage account, the container inside it,
# and the names of the CSV blobs that are loaded from that container.
STORAGE_ACCOUNT_NAME = "bdtscraper"
CONTAINER_NAME = "csv-files"
BLOB_FILES = ["student-mat.csv", "student-por.csv"]

# Connect to Azure Blob Storage

def get_blob_service_client():
    """Create a BlobServiceClient for the configured storage account.

    The client is built with a ``DefaultAzureCredential`` and the account URL
    derived from ``STORAGE_ACCOUNT_NAME``.

    Returns:
        The ``BlobServiceClient``, or ``None`` when any exception is raised
        while building it (the error is printed, not re-raised).
    """
    try:
        azure_credential = DefaultAzureCredential()
        endpoint = f"https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net"
        client = BlobServiceClient(account_url=endpoint, credential=azure_credential)
    except Exception as error:
        # Best effort: report the problem and let the caller decide what to do with None.
        print(f"Fout bij het maken van BlobServiceClient: {error}")
        return None
    return client

In [3]:
# Load a CSV file from Azure Blob Storage
def load_data_from_blob(blob_name):
    """Download one blob from ``CONTAINER_NAME`` and parse it as a CSV.

    Args:
        blob_name: Name of the blob inside the container.

    Returns:
        A DataFrame read from the blob content, which is decoded as UTF-8 and
        parsed with ``;`` as the field separator.

    Raises:
        RuntimeError: If ``get_blob_service_client`` returned ``None``.
    """
    service = get_blob_service_client()
    if service is None:
        raise RuntimeError("BlobServiceClient kon niet worden geïnitialiseerd.")

    downloader = service.get_blob_client(container=CONTAINER_NAME, blob=blob_name).download_blob()
    csv_text = downloader.readall().decode("utf-8")
    return pd.read_csv(StringIO(csv_text), sep=";")

# Load every configured blob; each frame is keyed by the part of the blob
# name that precedes the first "."
datasets = dict(
    (blob_file.partition(".")[0], load_data_from_blob(blob_file))
    for blob_file in BLOB_FILES
)

# Name of the column the model predicts
target = 'G3'

In [None]:
# Preprocess data
def prepare_data(df, reference_columns=None):
    """One-hot encode ``df`` and optionally conform it to a fixed column layout.

    Args:
        df: DataFrame to encode with ``pd.get_dummies(df, drop_first=True)``.
        reference_columns: Optional list of column labels. When given, every
            label that the encoded frame lacks is added as a column filled
            with 0, and the result contains exactly these columns in this
            order (columns not listed are dropped).

    Returns:
        The encoded DataFrame (a new frame produced by ``pd.get_dummies``).
    """
    encoded = pd.get_dummies(df, drop_first=True)

    # Without a reference layout the raw encoding is the result.
    if reference_columns is None:
        return encoded

    present = set(encoded.columns)
    absent = [label for label in reference_columns if label not in present]
    for label in absent:
        encoded[label] = 0

    # Selecting by the reference list both orders the columns and drops extras.
    return encoded[reference_columns]

# The first entry of `datasets` defines the reference column layout and is
# the data the train/test split is taken from.
first_dataset = datasets[next(iter(datasets))]
first_dummies = prepare_data(first_dataset)
reference_columns = first_dummies.columns.tolist()

# Features are all encoded columns except the target; 20% of the rows are
# held out, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    first_dummies.drop(labels=target, axis="columns"),
    first_dummies[target],
    random_state=42,
    test_size=0.2,
)

In [5]:
# Model persistence: saving / loading
class Readwritemodel():
    """Save a model to, and load a regression model from, a pickle file.

    Attributes:
        filenaam: Path of the pickle file; only names ending in ``.sav`` are
            accepted by ``opslaanmodel`` and ``inladenmodel``.
        model: The object that ``opslaanmodel`` writes to ``filenaam``.
    """

    def __init__(self, model=None, filenaam="regressie.sav"):
        self.filenaam = filenaam
        self.model = model

    def check_input(self, var):
        """Return True when ``var`` is a string that ends with ``.sav``."""
        return isinstance(var, str) and var.endswith(".sav")

    def opslaanmodel(self):
        """Pickle ``self.model`` to ``self.filenaam``.

        Nothing is written when the file name does not pass ``check_input``;
        that case is reported instead of being skipped without a trace.
        """
        if not self.check_input(self.filenaam):
            print(f"Model niet opgeslagen: ongeldige bestandsnaam {self.filenaam!r}.")
            return
        with open(self.filenaam, 'wb') as file:
            pickle.dump(self.model, file)
        print(f"Model opgeslagen als: {self.filenaam}")

    def inladenmodel(self):
        """Load the pickled model stored at ``self.filenaam``.

        Returns:
            The unpickled object when ``is_regressor`` accepts it. ``None``
            when the file name is invalid, the file does not exist, the file
            cannot be read or unpickled, or the object is not a regressor.
        """
        if self.check_input(self.filenaam) and os.path.exists(self.filenaam):
            try:
                with open(self.filenaam, 'rb') as file:
                    # SECURITY: unpickling can execute arbitrary code. Only
                    # load model files that were written by this notebook.
                    geladen_model = pickle.load(file)
            except (pickle.UnpicklingError, EOFError, AttributeError,
                    ImportError, IndexError, OSError) as e:
                # A truncated or incompatible file must not abort the run:
                # report it and fall through to the "no model" result.
                print(f"Opgeslagen model kon niet worden gelezen: {e}")
            else:
                if is_regressor(geladen_model):
                    print("Opgeslagen model geladen!")
                    return geladen_model
                print("Geen regressiemodel gevonden.")
        print("Geen model gevonden. Een nieuw model wordt getraind.")
        return None

In [6]:
# Model management: reuse the saved regressor when one can be loaded,
# otherwise train a new one on the training split and save it.
func = Readwritemodel()
model = func.inladenmodel()

if model is None:
    print("Trainen van een nieuw model...")
    # Fixed seed (same value as the train/test split) so that retraining on a
    # fresh kernel reproduces the same model and the same metrics.
    model = GradientBoostingRegressor(random_state=42)
    model.fit(X_train, y_train)
    func.model = model
    func.opslaanmodel()

Opgeslagen model geladen!


In [7]:
# Loop over the datasets, predict with the model and export the results as CSV
for name, data in datasets.items():
    print(f"Verwerken van dataset: {name}")
    # Encode with the reference column layout, then remove the target column
    # (if present) so only features are passed to the model.
    data_dummies = prepare_data(data, reference_columns)
    X = data_dummies.drop(columns=[target], errors="ignore")

    y_pred = model.predict(X)
    # Attach the predictions to a copy. Writing the column into `data` would
    # mutate the frame stored in `datasets`, so re-running the preprocessing
    # cell afterwards would pick up 'Predictions' as an input feature.
    data_with_predictions = data.assign(Predictions=y_pred)

    output_filename = f"{name}_regression_predictions.csv"
    data_with_predictions.to_csv(output_filename, sep=";", index=False)
    print(f"Voorspellingen opgeslagen in {output_filename}")

Verwerken van dataset: student-mat
Voorspellingen opgeslagen in student-mat_regression_predictions.csv
Verwerken van dataset: student-por
Voorspellingen opgeslagen in student-por_regression_predictions.csv


In [8]:
# Evaluation on the held-out test split.
# Predict once and reuse the result for both metrics instead of scoring
# X_test twice.
y_test_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²): {r2}")

Mean Squared Error (MSE): 4.019045672650393
R-squared (R²): 0.8039972490622082
