In [1]:
# Install Optuna, a library for hyperparameter optimization
!pip install optuna

# Install scikit-learn, a library for machine learning, including models, metrics, and utilities
!pip install scikit-learn

# Install Flask, a lightweight web framework for creating APIs (e.g., serving the ML model)
!pip install flask

# Install MLflow, a tool for tracking machine learning experiments, along with scikit-learn integration
!pip install mlflow scikit-learn

# Install DVC (Data Version Control), a tool for versioning datasets and machine learning pipelines
!pip install dvc

# Install the DVC extension for Google Drive to store and version datasets in Google Drive
!pip install dvc-gdrive

# Install Docker support, allowing the use of Docker CLI for containerization and deployment
!pip install docker




Use DVC (Data Version Control) to version control a dataset used in your project.

In [2]:
# Initialize a new DVC (Data Version Control) repository in the current directory.
# The `-f` flag forces initialization without prompting for confirmation, 
# creating the necessary `.dvc/` directory and configuration files.
!dvc init -f

Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[0m

Save the Iris dataset to a CSV file:

In [4]:
# Import the Iris dataset from scikit-learn
from sklearn.datasets import load_iris
import pandas as pd

# Load the Iris dataset, which contains features and target variables for classification
iris = load_iris()

# Create a DataFrame from the dataset with feature names as column headers
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Add the target variable (class labels) to the DataFrame for classification purposes
df['target'] = iris.target

# Save the DataFrame as a CSV file in the "data" directory
# The 'index=False' argument ensures that the index is not included in the saved CSV
df.to_csv('data/iris_data.csv', index=False)


Add it to DVC:

In [5]:
# Add the CSV file (Iris dataset) to DVC tracking. This creates a `.dvc` file 
# that contains metadata for versioning the dataset.
!dvc add data/iris_data.csv

# Add the generated `.dvc` file and `.gitignore` to Git's staging area.
# The `.gitignore` file ensures that the actual dataset (e.g., `iris_data.csv`) 
# is not stored in the Git repository but managed by DVC.
!git add data/iris_data.csv.dvc .gitignore

# Commit the changes to the Git repository with a message describing 
# the addition of the initial version of the Iris dataset.
!git commit -m "Add initial version of Iris dataset"


[?25l[32m⠋[0m Checking graph                                       core[39m>
Adding...                                                                       
![A
Collecting files and computing hashes in data/iris_data.csv |0.00 [00:00,     ?f[A
                                                                                [A
![A
  0% Checking cache in '/Users/nithindsouza/My Learnings/MTech/Sem 3/Subjects/ML[A
                                                                                [A
![A
  0%|          |Adding data/iris_data.csv to cache    0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /Users/nithindsouza/My Le0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 40.02file/s][A

To track the changes with git, run:

	git add data/iris_data.csv.dvc data/.gitignore

To enable auto staging, run:

	dvc config core.autostage tr

Create a new version of the dataset:

In [6]:
import numpy as np

# Add random noise to the 'sepal width (cm)' column
# The noise is drawn from a normal distribution with mean 0 and standard deviation 0.1
# `size=len(df)` ensures that the noise has the same length as the DataFrame
df['sepal width (cm)'] += np.random.normal(0, 0.1, size=len(df))

# Save the modified DataFrame back to the same CSV file
# The 'index=False' ensures the index is not included in the saved CSV
df.to_csv('data/iris_data.csv', index=False)


Track the updated dataset:

In [7]:
# Add the updated CSV file to DVC tracking. This updates the `.dvc` metadata 
# to reflect the changes made to the dataset.
!dvc add data/iris_data.csv

# Add the updated `.dvc` file to Git's staging area. This ensures that 
# the changes to the dataset version tracked by DVC are recorded in Git.
!git add data/iris_data.csv.dvc

# Commit the changes to the Git repository with a message describing the update.
!git commit -m "Update the dataset"


[?25l[32m⠋[0m Checking graph                                       core[39m>
Adding...                                                                       
![A
Collecting files and computing hashes in data/iris_data.csv |0.00 [00:00,     ?f[A
                                                                                [A
![A
  0% Checking cache in '/Users/nithindsouza/My Learnings/MTech/Sem 3/Subjects/ML[A
                                                                                [A
![A
  0%|          |Adding data/iris_data.csv to cache    0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /Users/nithindsouza/My Le0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 65.78file/s][A

To track the changes with git, run:

	git add data/iris_data.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true
[0m[detached

In [8]:
# Display the commit history of the Git repository.
# This shows a chronological list of commits, including commit hashes, authors, 
# commit messages, and timestamps.
!git log


[33mcommit a53776adbb0d79a484f99b6928c78ecd5b67a77b[m[33m ([m[1;36mHEAD[m[33m)[m
Author: nithindsouza <nithind74@gmail.com>
Date:   Sun Jan 26 19:55:32 2025 +0530

    update the dataset

[33mcommit 3fac56f069f9b9c86e537a1a81f7140004ca937d[m
Author: nithindsouza <nithind74@gmail.com>
Date:   Sun Jan 26 19:54:56 2025 +0530

    Add initial version of Iris dataset

[33mcommit 6fd546e50f6b6bffeac9b1e5bd7b7afa18fb2c35[m[33m ([m[1;31morigin/nithindsouza-mlops-patch-1[m[33m)[m
Author: Nithin Dsouza <49356269+nithindsouza@users.noreply.github.com>
Date:   Sat Jan 25 14:34:33 2025 +0530

    initial version of IRIS data Model packaging , Flask application setup and Docker container setup

[33mcommit d5d5b3925a17267fbf45c99d7f7c3fc9c4711c1d[m
Author: kbhavyas <99944562+kbhavyas@users.noreply.github.com>
Date:   Sat Jan 25 10:25:26 2025 +0530

    README.md


Use Git and DVC to revert:

In [22]:
# Check out a specific commit in the Git repository using its hash.
# This moves the HEAD pointer to the specified commit, allowing you to view or work with the state of the repository at that point.
!git checkout a53776adbb0d79a484f99b6928c78ecd5b67a77b

# Return to the default branch (e.g., `main` or `master`) after viewing the specific commit.
# If no branch or commit is specified, this command will attempt to move back to the branch you were previously working on.
!git checkout



any of your branches:

  2a7304f Added ML FLOW and DVC

If you want to keep it by creating a new branch, this may be a good time
to do so with:

 git branch <new-branch-name> 2a7304f

HEAD is now at a53776a update the dataset


•⁠  ⁠Use MLflow to track experiments for a machine learning project.

•⁠  ⁠Record metrics, parameters, and results of at least three different model training runs.


In [10]:
import mlflow
import mlflow.sklearn
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# Load the Iris dataset from the CSV file
iris = pd.read_csv("data/iris_data.csv")

# Separate features (X) and the target variable (y)
X = iris.drop(columns=["target"])  # Drop the target column to get features
y = iris["target"]  # Target variable

# Split the dataset into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Loop through different values of the 'max_depth' parameter for experimentation
for max_depth in [2, 3, 4]:
    # Start an MLflow experiment run
    with mlflow.start_run():
        # Initialize and train the Decision Tree Classifier with the current 'max_depth'
        clf = DecisionTreeClassifier(max_depth=max_depth)
        clf.fit(X_train, y_train)

        # Predict on the test set and evaluate the model's accuracy
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Log the hyperparameter ('max_depth') and evaluation metric ('accuracy') to MLflow
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_metric("accuracy", accuracy)

        # Log the trained model to MLflow
        mlflow.sklearn.log_model(clf, "decision_tree_model")

        # Print the results for the current experiment run
        print(f"Run completed for max_depth={max_depth}, accuracy={accuracy:.4f}")




Run completed for max_depth=2, accuracy=0.9778




Run completed for max_depth=3, accuracy=1.0000




Run completed for max_depth=4, accuracy=1.0000


In [11]:
# This command starts the MLflow tracking UI, allowing you to visually explore your experiments, metrics, and models.
!mlflow ui



[2025-01-26 19:55:58 +0530] [46714] [INFO] Starting gunicorn 23.0.0
[2025-01-26 19:55:58 +0530] [46714] [INFO] Listening at: http://127.0.0.1:5000 (46714)
[2025-01-26 19:55:58 +0530] [46714] [INFO] Using worker: sync
[2025-01-26 19:55:58 +0530] [46715] [INFO] Booting worker with pid: 46715
[2025-01-26 19:55:58 +0530] [46716] [INFO] Booting worker with pid: 46716
[2025-01-26 19:55:58 +0530] [46717] [INFO] Booting worker with pid: 46717
[2025-01-26 19:55:58 +0530] [46718] [INFO] Booting worker with pid: 46718
^C
[2025-01-26 19:56:01 +0530] [46714] [INFO] Handling signal: int
[2025-01-26 19:56:01 +0530] [46717] [INFO] Worker exiting (pid: 46717)
[2025-01-26 19:56:01 +0530] [46718] [INFO] Worker exiting (pid: 46718)
[2025-01-26 19:56:01 +0530] [46715] [INFO] Worker exiting (pid: 46715)
[2025-01-26 19:56:01 +0530] [46716] [INFO] Worker exiting (pid: 46716)


Hyperparameter Tuning and Model Training

• Use a library like Optuna or Scikit-learn’s GridSearchCV to perform hyperparameter tuning on a chosen model.

• Document the tuning process and the best parameters found.


In [12]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters for RandomForestClassifier
    n_estimators = trial.suggest_int('n_estimators', 11, 200)  # Number of trees in the forest
    max_depth = trial.suggest_int('max_depth', 2, 20)  # Maximum depth of the tree, reduced range for small datasets
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)  # Minimum samples required to split an internal node

    # Train the Random Forest model with the suggested hyperparameters
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42  # Set random seed for reproducibility
    )
    model.fit(X_train, y_train)  # Fit the model on the training data
    y_pred = model.predict(X_test)  # Predict on the test data
    return accuracy_score(y_test, y_pred)  # Return accuracy as the objective metric

# Run Optuna hyperparameter optimization to maximize the accuracy
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # Run optimization for 20 trials

# Save the hyperparameter tuning results as a DataFrame
results_df = study.trials_dataframe()  # Convert the results into a DataFrame for analysis
results_df.to_csv("hyperparameter_tuning_report.csv", index=False)  # Save the results as a CSV file

# Train the best model with the best found hyperparameters
best_params = study.best_params  # Get the best parameters from the optimization process
print("Best parameters:", best_params)

# Train the model with the best hyperparameters
best_model = RandomForestClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# Evaluate the model's accuracy on the test data
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test accuracy:", accuracy)

# Optional: Save the results DataFrame for further analysis
results_df.head()  # Display the top rows of the results DataFrame


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-01-26 19:56:05,539] A new study created in memory with name: no-name-e74d359e-e600-4e6a-be4b-b77559a461cc
[I 2025-01-26 19:56:05,575] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 63, 'max_depth': 6, 'min_samples_split': 3}. Best is trial 0 with value: 1.0.
[I 2025-01-26 19:56:05,631] Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 116, 'max_depth': 15, 'min_samples_split': 8}. Best is trial 0 with value: 1.0.
[I 2025-01-26 19:56:05,646] Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 29, 'max_depth': 4, 'min_samples_split': 7}. Best is trial 0 with value: 1.0.
[I 2025-01-26 19:56:05,673] Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 50, 'max_depth': 17, 'min_samples_split': 10}. Best is trial 0 with value: 1.0.
[I 2025-01-26 19:56:05,689] Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 28, 'max_depth': 6, 'min_samples_split': 10}. Best

Best parameters: {'n_estimators': 63, 'max_depth': 6, 'min_samples_split': 3}
Test accuracy: 1.0


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_min_samples_split,params_n_estimators,state
0,0,1.0,2025-01-26 19:56:05.539459,2025-01-26 19:56:05.575535,0 days 00:00:00.036076,6,3,63,COMPLETE
1,1,1.0,2025-01-26 19:56:05.576048,2025-01-26 19:56:05.631690,0 days 00:00:00.055642,15,8,116,COMPLETE
2,2,1.0,2025-01-26 19:56:05.632305,2025-01-26 19:56:05.646665,0 days 00:00:00.014360,4,7,29,COMPLETE
3,3,1.0,2025-01-26 19:56:05.647265,2025-01-26 19:56:05.673776,0 days 00:00:00.026511,17,10,50,COMPLETE
4,4,1.0,2025-01-26 19:56:05.674508,2025-01-26 19:56:05.689072,0 days 00:00:00.014564,6,10,28,COMPLETE


Step 2: Save the Model

In [13]:
import joblib

# Save the best model using joblib
joblib.dump(best_model, 'iris_model.pkl')

# Explanation:
# The `joblib.dump()` function is used to serialize (save) the trained model to a file.
# In this case, the `best_model` (RandomForestClassifier trained with the best hyperparameters)
# is saved to a file named 'iris_model.pkl'. You can load this file later to make predictions
# or perform further analysis without needing to retrain the model.


['iris_model.pkl']

•⁠  ⁠Package the best-performing model using tools like Docker and Flask.

•⁠  ⁠Create a Dockerfile and a simple Flask application to serve the model.

In [14]:
!python app.py

zsh:1: command not found: python


In [15]:
# Build a Docker image for the Iris model API using the Dockerfile in the current directory.
# The '-t' flag tags the image with the name 'iris-model-api', which allows you to easily refer to the image later.
# The '.' at the end specifies the build context, meaning Docker will look for the Dockerfile and related files in the current directory to build the image.
!docker build -t iris-model-api .


[1A[1B[0G[?25l[+] Building 0.0s (0/0)  docker:desktop-linux
[?25h[1A[0G[?25l[+] Building 0.0s (0/1)                                    docker:desktop-linux
[?25h[1A[0G[?25l[+] Building 0.2s (1/2)                                    docker:desktop-linux
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 355B                                       0.0s
[0m => [internal] load metadata for docker.io/library/python:3.9-slim         0.2s
[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.3s (1/2)                                    docker:desktop-linux
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 355B                                       0.0s
[0m => [internal] load metadata for docker.io/library/python:3.9-slim         0.3s
[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.5s (1/2)                                    docker:desktop-li

In [17]:
# Terminate any running processes related to 'mlflow' by killing the process that matches the string 'mlflow' in its command line.
# The '-f' flag allows 'pkill' to search for the string in the entire command line, not just the process name.
# This is useful for stopping an MLflow server or other related processes if they are running in the background.
!pkill -f mlflow


In [16]:
# List all running Docker containers on the system.
# The 'docker ps' command shows information about active containers, such as their container ID, image name, status, ports, and names.
# It's useful for checking which containers are currently running and managing their lifecycle.
!docker ps


CONTAINER ID   IMAGE          COMMAND           CREATED             STATUS             PORTS                    NAMES
94bdec37e442   159e6db5aef4   "python app.py"   About an hour ago   Up About an hour   0.0.0.0:5000->5000/tcp   trusting_moore


In [20]:
# Stop the running Docker container with the specified container ID '94bdec37e442'.
# The 'docker stop' command gracefully stops a container by sending a SIGTERM signal, followed by a SIGKILL if the container doesn't stop within the default timeout.
# This is useful for shutting down a container that is no longer needed or for freeing up system resources.
!docker stop 94bdec37e442


94bdec37e442


In [21]:
# Run a Docker container from the 'iris-model-api' image in detached mode.
# The '-d' flag runs the container in the background (detached mode), allowing the terminal to be free for other commands.
# The '-p 5000:5000' flag maps port 5000 on the host machine to port 5000 inside the container, which is where the Flask app is running.
# This makes the model API accessible via port 5000 on the host machine, enabling external access to the API.
!docker run -d -p 5000:5000 iris-model-api


42b8b03a7342d43017f95bfb491e926bca06881436a737d8e5188de6f2c91915
