In [1]:
from datasets import load_dataset

spam_detection_dataset = load_dataset("Deysi/spam-detection-dataset")

In [2]:
spam_detection_dataset.set_format(type='pandas')

In [3]:
train_df = spam_detection_dataset['train'][:]
test_df = spam_detection_dataset['test'][:]

In [4]:
train_df.label.value_counts(normalize=True)

label
spam        0.504587
not_spam    0.495413
Name: proportion, dtype: float64

In [5]:
test_df.label.value_counts(normalize=True)

label
spam        0.504587
not_spam    0.495413
Name: proportion, dtype: float64

In [6]:
train_df = train_df.sample(1500, random_state=10)
test_df = test_df.sample(500, random_state=10)

In [7]:
conda install joblib

Retrieving notices: ...working... done
Channels:
 - defaults
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [8]:
from joblib import dump

train_df.to_csv('dataset/train_df.csv', index=False)
test_df.to_csv('dataset/test_df.csv', index=False)

In [9]:
pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [10]:
from sentence_transformers import SentenceTransformer

# Load the model
sentence_model = SentenceTransformer('all-mpnet-base-v2', device='cpu')

In [11]:
train_embeddings = sentence_model.encode(train_df['text'].values, show_progress_bar=True, batch_size=32)

Batches:   0%|          | 0/47 [00:00<?, ?it/s]

In [12]:
test_embeddings = sentence_model.encode(test_df['text'].values, show_progress_bar=True, batch_size=32)

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [13]:
dump(train_embeddings, 'dataset/embeddings/train_embeddings.joblib')
dump(test_embeddings, 'dataset/embeddings/test_embeddings.joblib')

['dataset/embeddings/test_embeddings.joblib']

In [14]:
# import randomforestclassifier from sklearn
from sklearn.ensemble import RandomForestClassifier

In [15]:
clf = RandomForestClassifier()

In [16]:
clf.fit(train_embeddings, train_df['label'])

In [17]:
predict = clf.predict(test_embeddings)

In [18]:
from sklearn.metrics import confusion_matrix

confusion_matrix(test_df['label'], predict)

array([[246,   2],
       [  0, 252]])

In [19]:
from sklearn.metrics import classification_report

print(classification_report(test_df['label'], predict))

              precision    recall  f1-score   support

    not_spam       1.00      0.99      1.00       248
        spam       0.99      1.00      1.00       252

    accuracy                           1.00       500
   macro avg       1.00      1.00      1.00       500
weighted avg       1.00      1.00      1.00       500



In [20]:
pip install mlflow

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [21]:
pip install boto3

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting boto3
  Downloading boto3-1.34.144-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.35.0,>=1.34.144 (from boto3)
  Downloading botocore-1.34.144-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3)
  Downloading s3transfer-0.10.2-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.34.144-py3-none-any.whl (139 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m01[0m
[?25hDownloading botocore-1.34.144-py3-none-any.whl (12.4 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m01[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.10.2-py3-none-any.whl (82 kB)
[2K  

In [None]:
!mlflow ui --backend-store-uri sqlite:///mlflow.db

In [26]:
!export MLFLOW_TRACKING_URI=http://ec2-16-170-226-90.eu-north-1.compute.amazonaws.com:5000/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [27]:
from mlflow.tracking import MlflowClient


MLFLOW_TRACKING_URI = "http://ec2-16-170-226-90.eu-north-1.compute.amazonaws.com:5000/"
MLFLOW_EXPERIMENT_NAME = "email-spam-detection-experiment"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# Create a new experiment
client.create_experiment(name=MLFLOW_EXPERIMENT_NAME)

'1'

In [28]:
pip install optuna_integration

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [29]:
import optuna
import mlflow
from optuna_integration.mlflow import MLflowCallback

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

<Experiment: artifact_location='s3://mlflow-artifacts-email/1', creation_time=1721202691321, experiment_id='1', last_update_time=1721202691321, lifecycle_stage='active', name='email-spam-detection-experiment', tags={}>

In [30]:
from sklearn.metrics import accuracy_score

def objective(trial):
    rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
    rf_n_estimators = trial.suggest_int("rf_n_estimators", 5, 100, log=True)
    clf = RandomForestClassifier(max_depth=rf_max_depth, n_estimators=rf_n_estimators)
    clf.fit(train_embeddings, train_df['label'])

    predictions = clf.predict(test_embeddings)
    accuracy = accuracy_score(test_df['label'], predictions)
    return accuracy

mlflc = MLflowCallback(
    tracking_uri=MLFLOW_TRACKING_URI,
    metric_name='accuracy',
    create_experiment=False,
)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, callbacks=[mlflc])

  mlflc = MLflowCallback(
[I 2024-07-17 07:52:08,305] A new study created in memory with name: no-name-ba3f159b-325f-43d8-9e36-3f9c51ed2689
[I 2024-07-17 07:52:08,423] Trial 0 finished with value: 0.98 and parameters: {'rf_max_depth': 11, 'rf_n_estimators': 5}. Best is trial 0 with value: 0.98.
[I 2024-07-17 07:52:10,445] Trial 1 finished with value: 0.974 and parameters: {'rf_max_depth': 13, 'rf_n_estimators': 10}. Best is trial 0 with value: 0.98.
[I 2024-07-17 07:52:12,655] Trial 2 finished with value: 0.994 and parameters: {'rf_max_depth': 4, 'rf_n_estimators': 33}. Best is trial 2 with value: 0.994.
[I 2024-07-17 07:52:14,427] Trial 3 finished with value: 0.99 and parameters: {'rf_max_depth': 3, 'rf_n_estimators': 10}. Best is trial 2 with value: 0.994.
[I 2024-07-17 07:52:17,353] Trial 4 finished with value: 1.0 and parameters: {'rf_max_depth': 4, 'rf_n_estimators': 96}. Best is trial 4 with value: 1.0.
[I 2024-07-17 07:52:19,668] Trial 5 finished with value: 0.99 and parameters:

In [31]:
study.best_value

1.0

In [32]:
# Search for runs in the experiment
# Get based on the best trial value with the lowest n_estimators
from mlflow.entities import ViewType

spam_detection_experiment=dict(mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME))
experiment_id=spam_detection_experiment['experiment_id']

best_run = client.search_runs( 
    experiment_ids=experiment_id,
    filter_string=f'metrics.accuracy = {study.best_value}',
    run_view_type= ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=['parameters.rf_n_estimators ASC']
)[0]

In [33]:
best_max_depth = int(best_run.data.params['rf_max_depth'])
best_n_estimators = int(best_run.data.params['rf_n_estimators'])

mlflow.log_params({'rf_max_depth': best_max_depth, 'rf_n_estimators': best_n_estimators})

best_clf = RandomForestClassifier(n_estimators=best_n_estimators, max_depth=best_max_depth)
best_clf.fit(train_embeddings, train_df['label'])

best_predictions = best_clf.predict(test_embeddings)
accuracy = accuracy_score(test_df['label'], best_predictions)
mlflow.log_metric("accuracy", accuracy)

dump(best_clf, 'models/best_clf.joblib')
mlflow.sklearn.log_model(best_clf, artifact_path="models", registered_model_name='spam-detector')

S3UploadFailedError: Failed to upload /tmp/tmpgehs967l/model/conda.yaml to mlflow-artifacts-email/1/fed8ba904e2b41e4877a2355afaf931c/artifacts/models/conda.yaml: An error occurred (InvalidAccessKeyId) when calling the PutObject operation: The AWS Access Key Id you provided does not exist in our records.

In [1]:
import mlflow
from mlflow import MlflowClient

In [2]:
mlflow_tracking_uri="http://127.0.0.1:5000"
mlflow_client = MlflowClient(mlflow_tracking_uri)

In [5]:
from sentence_transformers import SentenceTransformer
sentences = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode(sentences, show_progress_bar=False, batch_size=32)
len(embeddings)


768