In [9]:
import pickle
import pandas as pd
from io import StringIO
from sklearn.base import is_classifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient

In [10]:
# Azure Blob Storage configuration
# STORAGE_ACCOUNT_NAME is interpolated into the account URL in get_blob_service_client;
# CONTAINER_NAME is the container passed to get_blob_client in load_data_from_blob;
# BLOB_FILES lists the blob names that are loaded into the `datasets` dict.
STORAGE_ACCOUNT_NAME = "bdtscraper"
CONTAINER_NAME = "csv-files"
BLOB_FILES = ["student-mat.csv", "student-por.csv"]

# Connect to Azure Blob Storage via DefaultAzureCredential
def get_blob_service_client():
    """Build a BlobServiceClient for the configured storage account.

    The account URL is derived from STORAGE_ACCOUNT_NAME, and a new
    DefaultAzureCredential is created on every call.

    Returns:
        BlobServiceClient: client bound to the account's blob endpoint.
    """
    account_url = f"https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net"
    return BlobServiceClient(account_url=account_url, credential=DefaultAzureCredential())

In [11]:
# Read the datasets from Azure Blob Storage
def load_data_from_blob(blob_name):
    """Download one CSV blob and parse it into a pandas DataFrame.

    Args:
        blob_name: Name of the blob inside the CONTAINER_NAME container.

    Returns:
        pandas.DataFrame: the blob's content, decoded as UTF-8 and parsed
        with ";" as the field separator.
    """
    blob_client = get_blob_service_client().get_blob_client(
        container=CONTAINER_NAME,
        blob=blob_name,
    )

    # The whole blob is read into memory before parsing.
    raw_bytes = blob_client.download_blob().readall()
    csv_buffer = StringIO(raw_bytes.decode("utf-8"))
    return pd.read_csv(csv_buffer, sep=";")

# Key each DataFrame by its blob name without the file extension.
# rsplit on the LAST dot keeps blob names that contain extra dots intact
# (e.g. "a.b.csv" -> "a.b"), so two such blobs cannot collapse onto one key.
datasets = {name.rsplit(".", 1)[0]: load_data_from_blob(name) for name in BLOB_FILES}

# Convert the target into a binary classification label: True when G3 >= 10.
# The column is added in place to each DataFrame held in `datasets`.
for name, data in datasets.items():
    data['G3_class'] = (data['G3'] >= 10).astype(bool)

In [12]:
# Preprocess data: one-hot encode the first loaded dataset, then split it
first_dataset = next(iter(datasets.values()))
first_dummies = pd.get_dummies(first_dataset, drop_first=True)

# Features are every column of the encoded frame except the raw grade (G3)
# and the label derived from it (G3_class); the label is the target.
feature_frame = first_dummies.drop(columns=['G3', 'G3_class'])
target_series = first_dummies['G3_class']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_series,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Model management
class Readwritemodel():
    """Save a model to, and load a classifier from, a pickled ``.sav`` file.

    Instance fields:
        filenaam: Path of the model file; only used when it is a string
            ending in ".sav" (see ``check_input``).
        model: Object that ``opslaanmodel`` writes to disk.
    """

    def __init__(self, model=None, filenaam=None):
        self.filenaam = filenaam
        self.model = model

    def check_input(self, var):
        """Return True only when ``var`` is a string ending in ".sav"."""
        return isinstance(var, str) and var.endswith(".sav")

    def opslaanmodel(self):
        """Pickle ``self.model`` to ``self.filenaam``.

        Does nothing (and prints nothing) when the filename fails ``check_input``.
        """
        if self.check_input(self.filenaam):
            # The context manager closes the handle even if pickling raises,
            # so the file is always flushed and released.
            with open(self.filenaam, 'wb') as model_file:
                pickle.dump(self.model, model_file)
            print(f"Jouw model is opgeslagen onder de naam: {self.filenaam}")

    def inladenmodel(self):
        """Load the pickled object from ``self.filenaam``.

        Returns:
            The unpickled object when ``is_classifier`` accepts it; otherwise
            None (invalid filename, missing file, or not a classifier).
        """
        if self.check_input(self.filenaam):
            try:
                # SECURITY: pickle.load can execute arbitrary code embedded in
                # the file; only load model files from a trusted location.
                with open(self.filenaam, 'rb') as model_file:
                    geladen_model = pickle.load(model_file)
                if is_classifier(geladen_model):
                    print("Jouw model is opgehaald en je kan er nu mee voorspellen!")
                    return geladen_model
            except FileNotFoundError:
                print("Het modelbestand kan niet gevonden worden. Er wordt een nieuw model getraind.")
        return None

In [15]:
# Model management: reuse the saved classifier when one loads, otherwise train one
func = Readwritemodel(filenaam="classificatie_model.sav")
model = func.inladenmodel()

if model is not None:
    print("Je gebruikt nu een opgeslagen model")
else:
    # No usable saved model: fit a new one on the training split and persist it.
    print("Ik train een model...")
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    func.model = model
    func.opslaanmodel()

Jouw model is opgehaald en je kan er nu mee voorspellen!
Je gebruikt nu een opgeslagen model


In [18]:
# Loop over both datasets and generate predictions
for name, data in datasets.items():
    print(f"Verwerken van dataset: {name}")
    data_dummies = pd.get_dummies(data, drop_first=True)
    X = data_dummies.drop(columns=['G3', 'G3_class'])

    # Align the feature columns with the training data in one step: reindex
    # adds columns missing from this dataset filled with 0, drops columns the
    # model was not trained on, and enforces the training column order.
    X = X.reindex(columns=X_train.columns, fill_value=0)

    y_pred = model.predict(X)
    # NOTE(review): this writes into the DataFrames held in `datasets`, so
    # re-running the preprocessing cell afterwards would pick up
    # 'Predictions' as a feature. Kept in place for backward compatibility.
    data['Predictions'] = y_pred

    output_filename = f"{name}_classification_predictions.csv"
    data.to_csv(output_filename, sep=";", index=False)
    print(f"Voorspellingen opgeslagen in {output_filename}")

Verwerken van dataset: student-mat
Voorspellingen opgeslagen in student-mat_classification_predictions.csv
Verwerken van dataset: student-por
Voorspellingen opgeslagen in student-por_classification_predictions.csv


In [17]:
# Evaluation on the test set
# Predict once and reuse the result for every metric instead of calling
# model.predict(X_test) three times.
y_test_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_test_pred)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))
print("Classification Report:")
print(classification_report(y_test, y_test_pred))

Accuracy: 0.9240506329113924
Confusion Matrix:
[[25  2]
 [ 4 48]]
Classification Report:
              precision    recall  f1-score   support

       False       0.86      0.93      0.89        27
        True       0.96      0.92      0.94        52

    accuracy                           0.92        79
   macro avg       0.91      0.92      0.92        79
weighted avg       0.93      0.92      0.92        79

