In [23]:
import os
import re
import string
import warnings
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import dagshub
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import scipy.sparse

In [24]:
df = pd.read_csv('IMDB.csv')
df = df.sample(500)
df.to_csv('data.csv', index=False)
df.head()

Unnamed: 0,review,sentiment
848,i stopped this movie at 48 minutes and change....,negative
446,First and foremost this movie has the stupides...,negative
585,Quentin Tarantino's partner in crime Roger Ava...,negative
527,Here is another great film critics will love. ...,negative
567,"Still haven't read a single Dan Brown book, bu...",positive


In [25]:
CONFIG = {
    "mlflow_tracking_uri": "https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow",
    "dagshub_repo_owner": "i-atul",
    "dagshub_repo_name": "mlops-imdb-sentiment-analysis",
    "experiment_name": "All Experiments",
    "data_path": "data.csv",
    "test_size": 0.2,
}

In [26]:
def init_mlflow():
    mlflow.set_tracking_uri(CONFIG["mlflow_tracking_uri"])
    dagshub.init(repo_owner=CONFIG["dagshub_repo_owner"], repo_name=CONFIG["dagshub_repo_name"], mlflow=True)
    mlflow.set_experiment(CONFIG["experiment_name"])


In [27]:
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", ' ', text)
    text = ''.join([char for char in text if not char.isdigit()])
    text = " ".join([word for word in text.split() if word not in stop_words])
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

In [28]:
def normalize_text(df):
    try:
        df['review'] = df['review'].apply(preprocess_text)
        return df
    except Exception as e:
        print(f"Error during text normalization: {e}")
        raise

In [29]:
def load_data(file_path):
    try:
        df = pd.read_csv(file_path)
        df = normalize_text(df)
        df = df[df['sentiment'].isin(['positive', 'negative'])]
        df['sentiment'] = df['sentiment'].replace({'negative': 0, 'positive': 1}).infer_objects(copy=False)
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        raise

In [30]:
VECTORIZERS = {
    'BoW': CountVectorizer(),
    'TF-IDF': TfidfVectorizer()
}

In [31]:
ALGORITHMS = {
    'LogisticRegression': (LogisticRegression(), {
        "C": [0.1, 1, 10],
        "penalty": ["l1", "l2"],
        "solver": ["liblinear"]
    }),
    'MultinomialNB': (MultinomialNB(), {
        "alpha": [0.5, 1.0, 1.5]
    }),
    'XGBoost': (XGBClassifier(), {
        "n_estimators": [100, 200],
        "learning_rate": [0.01, 0.1],
        "max_depth": [3, 6]
    }),
    'RandomForest': (RandomForestClassifier(), {
        "n_estimators": [100, 200],
        "max_depth": [None, 10, 20]
    }),
    'GradientBoosting': (GradientBoostingClassifier(), {
        "n_estimators": [100, 200],
        "learning_rate": [0.01, 0.1],
        "max_depth": [3, 6]
    })
}

In [32]:
def train_and_evaluate(df):
    with mlflow.start_run(run_name="All Experiments") as parent_run:
        for algo_name, (algorithm, param_grid) in ALGORITHMS.items():
            for vec_name, vectorizer in VECTORIZERS.items():
                with mlflow.start_run(run_name=f"{algo_name} with {vec_name}", nested=True) as child_run:
                    try:
                        X = vectorizer.fit_transform(df['review'])
                        y = df['sentiment']
                        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=CONFIG["test_size"], random_state=42)

                        mlflow.log_params({
                            "vectorizer": vec_name,
                            "algorithm": algo_name,
                            "test_size": CONFIG["test_size"]
                        })

                        grid_search = GridSearchCV(algorithm, param_grid, cv=5, scoring="f1", n_jobs=-1)
                        grid_search.fit(X_train, y_train)

                        for params, mean_score, std_score in zip(grid_search.cv_results_["params"], 
                                                                 grid_search.cv_results_["mean_test_score"], 
                                                                 grid_search.cv_results_["std_test_score"]):
                            with mlflow.start_run(run_name=f"{algo_name} with params: {params}", nested=True):
                                model = algorithm.set_params(**params)
                                model.fit(X_train, y_train)
                                
                                y_pred = model.predict(X_test)
                                
                                metrics = {
                                    "accuracy": accuracy_score(y_test, y_pred),
                                    "precision": precision_score(y_test, y_pred),
                                    "recall": recall_score(y_test, y_pred),
                                    "f1_score": f1_score(y_test, y_pred),
                                    "mean_cv_score": mean_score,
                                    "std_cv_score": std_score
                                }
                                
                                mlflow.log_params(params)
                                mlflow.log_metrics(metrics)
                                
                                print(f"Params: {params} | Accuracy: {metrics['accuracy']:.4f} | F1: {metrics['f1_score']:.4f}")

                        best_params = grid_search.best_params_
                        best_model = grid_search.best_estimator_
                        best_f1 = grid_search.best_score_

                        mlflow.log_params(best_params)
                        mlflow.log_metric("best_f1_score", best_f1)
                        input_example = X_test[:5] if not scipy.sparse.issparse(X_test) else X_test[:5].toarray()
                        mlflow.sklearn.log_model(best_model, "model", input_example=input_example)
                        
                        print(f"\nBest Params: {best_params} | Best F1 Score: {best_f1:.4f}")

                    except Exception as e:
                        print(f"Error in training {algo_name} with {vec_name}: {e}")
                        mlflow.log_param("error", str(e))


In [33]:
if __name__ == "__main__":
    warnings.simplefilter("ignore", UserWarning)
    warnings.filterwarnings("ignore")
    pd.set_option('future.no_silent_downcasting', True)
    
    init_mlflow()
    df = load_data(CONFIG["data_path"])
    train_and_evaluate(df)

Params: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'} | Accuracy: 0.6300 | F1: 0.6186
🏃 View run LogisticRegression with params: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/5075a561e36142318979244c8bd36101
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'} | Accuracy: 0.7900 | F1: 0.7640
🏃 View run LogisticRegression with params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/65703c1ac04244ca9acd9b4c633200ca
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} | Accuracy: 0.7200 | F1: 0.7021
🏃 View run LogisticRegression with params: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} a

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]


Best Params: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'} | Best F1 Score: 0.7337
🏃 View run LogisticRegression with BoW at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/e35e20922af44596b03886a60418b798
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'} | Accuracy: 0.5400 | F1: 0.0000
🏃 View run LogisticRegression with params: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/86e9ab8d83c34c12bbaec4782a5132a9
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'} | Accuracy: 0.5800 | F1: 0.1600
🏃 View run LogisticRegression with params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-ana

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]


Best Params: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'} | Best F1 Score: 0.7782
🏃 View run LogisticRegression with TF-IDF at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/5909466e33364f93bfdf9c08f17bb872
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'alpha': 0.5} | Accuracy: 0.7500 | F1: 0.7059
🏃 View run MultinomialNB with params: {'alpha': 0.5} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/7d7889b0bee34df18f1b1e2f56e3fb8a
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'alpha': 1.0} | Accuracy: 0.7500 | F1: 0.6988
🏃 View run MultinomialNB with params: {'alpha': 1.0} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/0cb24c2c758442209f6f205b21dfe904
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflo

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]


Best Params: {'alpha': 1.5} | Best F1 Score: 0.7552
🏃 View run MultinomialNB with BoW at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/641ba67817a84e53afd36da04ae91acc
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'alpha': 0.5} | Accuracy: 0.7400 | F1: 0.6579
🏃 View run MultinomialNB with params: {'alpha': 0.5} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/a72df44b61374adc9a3de549f5daa201
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'alpha': 1.0} | Accuracy: 0.7400 | F1: 0.6286
🏃 View run MultinomialNB with params: {'alpha': 1.0} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/812829d0f03643e7befc10ffcd66ebd0
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'alpha': 1.5} | 

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]


Best Params: {'alpha': 0.5} | Best F1 Score: 0.7202
🏃 View run MultinomialNB with TF-IDF at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/a1379ad1e71a468bb885917d69095743
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100} | Accuracy: 0.6800 | F1: 0.6981
🏃 View run XGBoost with params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/06b9b886ffe242ffa2d5b0f0df695d81
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200} | Accuracy: 0.6500 | F1: 0.6392
🏃 View run XGBoost with params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/expe

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]


Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100} | Best F1 Score: 0.7046
🏃 View run XGBoost with BoW at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/f15a6df5d41740788327fe2faae8493b
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100} | Accuracy: 0.6400 | F1: 0.6327
🏃 View run XGBoost with params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/925abd8a90564bf1af65de9a61d1cde3
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200} | Accuracy: 0.6600 | F1: 0.6531
🏃 View run XGBoost with params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200} at: https://dagshub.com/i-atul/mlops-i

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]


Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100} | Best F1 Score: 0.7080
🏃 View run XGBoost with TF-IDF at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/767ff8e25245447a9e9d28eea4a3b6ce
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'max_depth': None, 'n_estimators': 100} | Accuracy: 0.7600 | F1: 0.7143
🏃 View run RandomForest with params: {'max_depth': None, 'n_estimators': 100} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/93af3578be584112923de6e5d7a7bde1
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'max_depth': None, 'n_estimators': 200} | Accuracy: 0.8000 | F1: 0.7619
🏃 View run RandomForest with params: {'max_depth': None, 'n_estimators': 200} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/75a89541b3ca472e

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]


Best Params: {'max_depth': 10, 'n_estimators': 100} | Best F1 Score: 0.7242
🏃 View run RandomForest with BoW at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/7fea819dea3a4f8c9fb4e1289777fe2c
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'max_depth': None, 'n_estimators': 100} | Accuracy: 0.7700 | F1: 0.7473
🏃 View run RandomForest with params: {'max_depth': None, 'n_estimators': 100} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/1ca754680907417fb2cb47b34ccd8073
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'max_depth': None, 'n_estimators': 200} | Accuracy: 0.8100 | F1: 0.8000
🏃 View run RandomForest with params: {'max_depth': None, 'n_estimators': 200} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/3752b607656a40a1b34f38493465f051
🧪 

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]


Best Params: {'max_depth': None, 'n_estimators': 200} | Best F1 Score: 0.7426
🏃 View run RandomForest with TF-IDF at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/d3a4c911f7e04c08a46ce2eb47130bd2
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100} | Accuracy: 0.6700 | F1: 0.6857
🏃 View run GradientBoosting with params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/bd1ceb1f336e4465a1ac25e076b35672
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200} | Accuracy: 0.6600 | F1: 0.6531
🏃 View run GradientBoosting with params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200} at: https://dagshub.com/i-atul/

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]


Best Params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100} | Best F1 Score: 0.6800
🏃 View run GradientBoosting with BoW at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/3313cdde20c347b7a523bfbb6dbf8573
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100} | Accuracy: 0.6500 | F1: 0.6154
🏃 View run GradientBoosting with params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100} at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/7d2bea1495444056b294589d0efe089e
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200} | Accuracy: 0.6500 | F1: 0.6316
🏃 View run GradientBoosting with params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200} at: https:/

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]


Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200} | Best F1 Score: 0.7010
🏃 View run GradientBoosting with TF-IDF at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/4f1a3abdd7a844bb9774349f191d1f98
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
🏃 View run All Experiments at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1/runs/f02334ffc9fe4f7992d4aa58d2374e8a
🧪 View experiment at: https://dagshub.com/i-atul/mlops-imdb-sentiment-analysis.mlflow/#/experiments/1
