In [1]:
import mlflow
import nltk
import pandas as pd
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

In [2]:
# Load the review dataset from 'IMDB.csv' (relative path) and display it.
df=pd.read_csv('IMDB.csv')
df

Unnamed: 0,review,sentiment
0,Film version of Sandra Bernhard's one-woman of...,negative
1,I switched this on (from cable) on a whim and ...,positive
2,The `plot' of this film contains a few holes y...,negative
3,"Some amusing humor, some that falls flat, some...",negative
4,What can you say about this movie? It was not ...,negative
...,...,...
995,"Not exactly a new story line, but this romanti...",negative
996,I first saw this movie as a younger child. My ...,positive
997,Some people have stated that as of the 11th se...,positive
998,Nothing but the director's juvenile fantasy co...,negative


In [3]:
def lemmatization(text):
    """Lemmatize every whitespace-separated token of ``text``.

    A fresh ``WordNetLemmatizer`` is created on each call; every token is
    passed through its ``lemmatize`` method and the results are re-joined
    with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    return " ".join(map(lemmatizer.lemmatize, text.split()))

def remove_sw(text):
    """Drop English stop words from ``text``.

    The input is coerced with ``str()`` and split on whitespace; tokens
    found in NLTK's English stop-word list are discarded (the comparison
    is case-sensitive) and the remainder is re-joined with single spaces.
    """
    stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in str(text).split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

def remove_num(text):
    """Return ``text`` with every digit character removed.

    All other characters, including whitespace, are kept unchanged.
    """
    return ''.join(filter(lambda ch: not ch.isdigit(), text))

def lower_case(text):
    """Lowercase ``text`` token by token.

    The text is split on whitespace, so runs of whitespace collapse to a
    single space in the result.
    """
    return " ".join(token.lower() for token in text.split())

def removing_punc(text):
    """Replace punctuation with spaces and normalise whitespace.

    Every character in ``string.punctuation`` becomes a space, the Arabic
    semicolon is dropped, and each run of whitespace is collapsed to a
    single space before the result is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, with no leading/trailing whitespace and no
        consecutive spaces.
    """
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = text.replace('؛', "")
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    # '\s+' collapses the runs of spaces left behind by removed punctuation.
    return re.sub(r'\s+', ' ', text).strip()

def removing_urls(text):
    """Strip ``http://``, ``https://`` and ``www.`` links from ``text``.

    A link runs from its prefix up to the next whitespace character; the
    surrounding whitespace is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def normalise_text(df):
    """Clean the ``review`` column of ``df`` and return the frame.

    The steps run in this order: lowercase, strip URLs, drop stop words,
    drop digits, replace punctuation, lemmatize. URL removal must happen
    before digits and punctuation are stripped, because the URL pattern
    relies on ``://`` and ``.`` still being present in the text.

    Args:
        df: DataFrame with a ``review`` column of strings. The column is
            overwritten on the frame that is passed in.

    Returns:
        The same DataFrame object with the cleaned ``review`` column.

    Raises:
        Exception: Any error raised by a step is printed and re-raised.
    """
    pipeline = (
        lower_case,
        removing_urls,
        remove_sw,
        remove_num,
        removing_punc,
        lemmatization,
    )
    try:
        for step in pipeline:
            df['review']=df['review'].apply(step)
        return df
    except Exception as e:
        print(f'Error during text normalization: {e}')
        raise

In [6]:
# Run the cleaning pipeline on the 'review' column and preview the first rows.
df=normalise_text(df)
df.head()

Unnamed: 0,review,sentiment
0,film version sandra bernhard one woman broadwa...,negative
1,switched cable whim treated quite surprise alt...,positive
2,plot film contains hole could drive massive tr...,negative
3,amusing humor fall flat decent acting quite at...,negative
4,say movie terrible good two day earlier watche...,negative


In [7]:
# Count reviews per sentiment label to check the class balance.
df['sentiment'].value_counts()

sentiment
negative    517
positive    483
Name: count, dtype: int64

In [8]:
# Encode the labels as integers: positive -> 1, negative -> 0.
# The guard makes the cell safe to re-run: mapping a column that is already
# 0/1 a second time would turn every label into NaN.
label_map={'positive':1,'negative':0}
if not df['sentiment'].isin(list(label_map.values())).all():
    df['sentiment']=df['sentiment'].map(label_map)

In [None]:
# df.to_csv('data.csv',index=False)

In [7]:
# Connect the notebook to the DagsHub repository used for experiment tracking.
# NOTE(review): mlflow=True presumably points MLflow tracking at this repo —
# confirm against the dagshub client documentation.
import dagshub
dagshub.init(repo_owner='itsalok2', repo_name='nlp_end_to_end', mlflow=True)

In [8]:
# Select the MLflow experiment that the runs started below are recorded under.
mlflow.set_experiment("Logistic Regression Baseline")

<Experiment: artifact_location='mlflow-artifacts:/4790d107af0a4222930a42f52a66eed7', creation_time=1759923292487, experiment_id='0', last_update_time=1759923292487, lifecycle_stage='active', name='Logistic Regression Baseline', tags={}>

In [13]:
import logging
import time
import os

# Settings for the baseline run: vocabulary size, hold-out fraction and the
# solver iteration cap for logistic regression.
config={
    'max_fea':400,
    'test_size':0.2,
    'max_iter':1000
}

# Bag-of-words counts over the cleaned reviews.
# NOTE(review): the vocabulary is fitted on the full corpus before the split,
# so held-out reviews influence which features are kept. `x` is reused by a
# later cell, so the feature matrix is intentionally left as it was.
vec=CountVectorizer(max_features=config['max_fea'])
x=vec.fit_transform(df['review'])
y=df['sentiment']

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=config['test_size'],random_state=42)

logging.basicConfig(level=logging.INFO,format="%(asctime)s - %(levelname)s - %(message)s")
logging.info('starting MLFlow runs....')

with mlflow.start_run():
    st=time.time()

    try:
        logging.info('logging preprocessing parameters')
        # Log the class name of the vectorizer actually used, so the recorded
        # parameter cannot drift from the code (it used to say "tfidf").
        mlflow.log_param('vectorizer',type(vec).__name__)
        mlflow.log_param('num_features',config['max_fea'])
        mlflow.log_param('test_size',config['test_size'])
        mlflow.log_param('max_iter',config['max_iter'])
        
        logging.info('initialising loginstic regression model')
        model=LogisticRegression(max_iter=config['max_iter'])

        logging.info('fitting the model')
        model.fit(x_train,y_train)
        logging.info('model training complete')

        logging.info('making predection')
        y_pred=model.predict(x_test)

        logging.info('calculating evaluation metrics')
        accuracy=accuracy_score(y_test,y_pred)
        precision=precision_score(y_test,y_pred)
        recall=recall_score(y_test,y_pred)
        f1=f1_score(y_test,y_pred)

        logging.info("Logging evaluation metrics...")
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        logging.info('saving and logging the model')
        mlflow.sklearn.log_model(model,'model')

        et=time.time()
        logging.info(f"Model training and logging completed in {et - st:.2f} seconds.")

        logging.info(f"Accuracy: {accuracy}")
        logging.info(f"Precision: {precision}")
        logging.info(f"Recall: {recall}")
        logging.info(f"F1 Score: {f1}")

    except Exception as e:
        logging.error(f"An error occurred: {e}", exc_info=True)
        # Re-raise so the failure propagates out of the `with` block and the
        # cell visibly fails instead of completing as if nothing went wrong.
        raise

2025-10-08 19:28:48,608 - INFO - starting MLFlow runs....
2025-10-08 19:28:49,609 - INFO - logging preprocessing parameters
2025-10-08 19:28:50,895 - INFO - initialising loginstic regression model
2025-10-08 19:28:50,897 - INFO - fitting the model
2025-10-08 19:28:51,035 - INFO - model training complete
2025-10-08 19:28:51,036 - INFO - making predection
2025-10-08 19:28:51,038 - INFO - calculating evaluation metrics
2025-10-08 19:28:51,054 - INFO - Logging evaluation metrics...
2025-10-08 19:28:52,364 - INFO - saving and logging the model
2025-10-08 19:29:00,871 - INFO - Model training and logging completed in 11.26 seconds.
2025-10-08 19:29:00,872 - INFO - Accuracy: 0.765
2025-10-08 19:29:00,873 - INFO - Precision: 0.7692307692307693
2025-10-08 19:29:00,874 - INFO - Recall: 0.7291666666666666
2025-10-08 19:29:00,875 - INFO - F1 Score: 0.7486631016042781


🏃 View run crawling-grub-185 at: https://dagshub.com/itsalok2/nlp_end_to_end.mlflow/#/experiments/0/runs/19aec8b6e17241c5a920a31b5b867865
🧪 View experiment at: https://dagshub.com/itsalok2/nlp_end_to_end.mlflow/#/experiments/0


In [None]:
import optuna
import mlflow
import mlflow.xgboost
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import logging

# Split dataset
X_train, X_valid, y_train, y_valid = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Build the DMatrix objects once: they do not depend on the trial, so there is
# no reason to reconstruct them inside every objective call.
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

# Enable MLflow autolog for XGBoost
mlflow.xgboost.autolog()

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def objective(trial):
    """Train one XGBoost model for an Optuna trial and return its accuracy.

    Each trial runs inside its own nested MLflow run. Hyperparameters are
    sampled from ``trial``, the booster is trained with early stopping on the
    validation set, and accuracy/precision/recall/F1 on that set are logged.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Validation accuracy of the trained booster (the value Optuna maximizes).
    """
    # Child run for each trial
    with mlflow.start_run(run_name=f"Trial_{trial.number}", nested=True):
        logging.info(f"Starting trial {trial.number}...")

        # Suggest hyperparameters including num_boost_round
        param = {
            "verbosity": 0,
            "objective": "binary:logistic",
            "eval_metric": "error",  # 1 - accuracy
            "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
            "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "eta": trial.suggest_float("eta", 0.01, 0.3, log=True),
            "gamma": trial.suggest_float("gamma", 1e-8, 10.0, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0)
        }
        num_boost_round = trial.suggest_int("num_boost_round", 50, 300)

        # Train with early stopping on the validation error. The metric is
        # configured through param["eval_metric"]; xgb.train() has no
        # `eval_metric` keyword, so passing one raises TypeError. Early
        # stopping watches the last entry of `evals`, i.e. the validation set.
        bst = xgb.train(
            param,
            dtrain,
            num_boost_round=num_boost_round,
            evals=[(dtrain,'training'),(dvalid, "validation")],
            early_stopping_rounds=10,
            verbose_eval=True
        )

        # Predict probabilities and threshold them at 0.5
        preds = bst.predict(dvalid)
        pred_labels = (preds > 0.5).astype(int)

        # Compute metrics
        accuracy = accuracy_score(y_valid, pred_labels)
        precision = precision_score(y_valid, pred_labels)
        recall = recall_score(y_valid, pred_labels)
        f1 = f1_score(y_valid, pred_labels)

        # Log metrics explicitly
        logging.info(f"Accuracy: {accuracy}")
        logging.info(f"Precision: {precision}")
        logging.info(f"Recall: {recall}")
        logging.info(f"F1 Score: {f1}")

        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

    return accuracy  # Optuna maximizes accuracy

# Parent run
with mlflow.start_run(run_name="XGBoost_Hyperparameter_Tuning_Parent"):
    logging.info("Starting Optuna hyperparameter tuning...")
    # Seed the sampler so the sequence of sampled hyperparameters is the same
    # on every Restart & Run All.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=20)

    logging.info("Hyperparameter tuning complete.")
    logging.info(f"Best trial parameters: {study.best_trial.params}")
    logging.info(f"Best accuracy: {study.best_value}")


2025-10-08 19:30:23,517 - INFO - Starting MLflow auto-logged run...
2025-10-08 19:30:23,983 - INFO - Initializing Logistic Regression model...
2025-10-08 19:30:23,984 - INFO - Fitting the model...
2025-10-08 19:30:38,739 - INFO - Model training complete.
2025-10-08 19:30:38,740 - INFO - Making predictions...
2025-10-08 19:30:39,104 - INFO - Calculating evaluation metrics...
2025-10-08 19:30:43,280 - INFO - Accuracy: 0.745, Precision: 0.7368421052631579, Recall: 0.7291666666666666, F1 Score: 0.7329842931937173
2025-10-08 19:30:43,281 - INFO - Training completed in 19.30 seconds.


🏃 View run flawless-fowl-187 at: https://dagshub.com/itsalok2/nlp_end_to_end.mlflow/#/experiments/0/runs/27c88be752cd4c8e9d95cc41ebde84f4
🧪 View experiment at: https://dagshub.com/itsalok2/nlp_end_to_end.mlflow/#/experiments/0
