In [None]:
!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE
import joblib

### The Student Performance Data Set
[The Student Performance](https://archive.ics.uci.edu/dataset/320/student+performance) Data Set is a collection of data gathered from secondary school students in Portugal. It was compiled to analyse the factors that influence academic success, particularly in mathematics and Portuguese language courses. The data was collected from two schools and includes a wide range of attributes related to student demographics, social and economic factors, and academic records.

### Load the data

In [2]:
# Fetch the dataset with id 320 (the UCI "Student Performance" page linked above) via ucimlrepo.
student_performance = fetch_ucirepo(id=320)
# Keep the `original` DataFrame of the fetched data; later cells read it as `student_performance_df`.
student_performance_df = student_performance.data.original
# Disabled debugging aid: print the whole frame.
#print(student_performance_df)
# Disabled export: write a local CSV copy of the frame.
# NOTE(review): reading such a CSV back without `index_col=0` would presumably yield the
# 'Unnamed: 0' column that the training cell excludes — confirm whether that guard is still needed.
#student_performance_df.to_csv('student_performance.csv')

### Explore and define the target data

In [None]:
# Display the variable information attached to the fetched dataset object.
# As the last expression of the cell, the value is rendered as the cell's output.
student_performance.variables

In [3]:
# Train and evaluate a Random Forest regressor that predicts the final grade `G3`.

# Fixed seed so the train/test split and the forest are identical on every
# Restart & Run All; without it the saved model and its RMSE change per run.
RANDOM_STATE = 42

# Drop non-feature columns and ensure target isolation.
# NOTE(review): G1/G2 are presumably earlier-period grades that would leak the target,
# and 'Unnamed: 0' a stray index column from a CSV round trip — confirm against
# `student_performance.variables`. Names absent from the frame are simply ignored.
target = 'G3'
exclude_features = ['G1', 'G2', 'Unnamed: 0', target]
features = [col for col in student_performance_df.columns if col not in exclude_features]

# Handle categorical variables via one-hot encoding
data_prepared = pd.get_dummies(student_performance_df[features])
X = data_prepared
y = student_performance_df[target]

# Split into training and testing sets (30% held out for evaluation)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# Train a Random Forest model
rf_model = RandomForestRegressor(random_state=RANDOM_STATE)
rf_model.fit(X_train, y_train)

# Evaluate the model on the held-out split.
# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4 and
# removed in 1.6, so take the square root explicitly; this works on every version.
# `float(...)` yields a built-in float, which serializes cleanly to JSON later on.
y_pred = rf_model.predict(X_test)
rmse = float(mean_squared_error(y_test, y_pred) ** 0.5)



['random_forest_model.pkl']

## Push to GitHub
Push both:

* This `.ipynb` file (or an exported `.py` script), to track how the model was trained.
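* The model folder containing the model artifacts and metadata (the code below writes them to a per-version folder such as `v1/` inside the repository directory).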

In [25]:
import os
import joblib
import json
import datetime

def _json_default(value):
    """Fallback for ``json.dumps``: unwrap array-library scalars/arrays into built-in values."""
    # NumPy scalars and arrays (and similar objects) expose ``tolist``/``item``;
    # ``tolist`` is tried first because it also handles multi-element arrays.
    for converter_name in ("tolist", "item"):
        converter = getattr(value, converter_name, None)
        if callable(converter):
            return converter()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def save_model_and_metadata(model, metrics, version, repo_name="MLops_demo", base_dir="/content/drive/MyDrive/01.RWML"):
    """
    Save model and metadata into the specified directory structure under the given base directory.

    Files are written to ``<base_dir>/<repo_name>/<version>/`` as
    ``model_<version>.pkl`` (via ``joblib.dump``) and ``metadata_<version>.json``.
    The directory is created if it does not exist; existing files are overwritten.

    Args:
        model: Trained machine learning model (e.g., RandomForestRegressor).
        metrics: Dictionary containing evaluation metrics (e.g., RMSE, accuracy).
            Values may be built-in numbers or objects exposing ``tolist()``/``item()``
            (such as NumPy scalars), which are converted to built-in values.
        version: Model version (e.g., "v1", "v2").
        repo_name: Name of the repository (e.g., "MLops_demo").
        base_dir: Base directory where the repository resides.

    Raises:
        TypeError: If ``metrics`` contains a value that cannot be converted to JSON.
            This is raised before anything is written to disk.
    """
    # Build and serialize the metadata first, so that an unserializable metric
    # fails fast instead of leaving a saved model next to a truncated JSON file.
    metadata = {
        "model_version": version,
        "metrics": metrics,
        "timestamp": str(datetime.datetime.now())
    }
    metadata_json = json.dumps(metadata, default=_json_default)

    # Construct the full repository path
    repo_dir = os.path.join(base_dir, repo_name)

    # Create paths for saving
    model_dir = os.path.join(repo_dir, version)
    os.makedirs(model_dir, exist_ok=True)

    # Save model
    model_path = os.path.join(model_dir, f"model_{version}.pkl")
    joblib.dump(model, model_path)

    # Save metadata (already serialized above, so this write cannot fail half-way on a bad value)
    metadata_path = os.path.join(model_dir, f"metadata_{version}.json")
    with open(metadata_path, "w", encoding="utf-8") as f:
        f.write(metadata_json)

    print(f"Model saved to {model_path}")
    print(f"Metadata saved to {metadata_path}")


In [26]:
# Bundle the test-split RMSE with the fitted forest and persist both as version "v1".
metrics = dict(rmse=rmse)
save_model_and_metadata(model=rf_model, metrics=metrics, version="v1")

Model saved to /content/drive/MyDrive/01.RWML/MLops_demo/v1/model_v1.pkl
Metadata saved to /content/drive/MyDrive/01.RWML/MLops_demo/v1/metadata_v1.json
