In [None]:
import mlflow
import pandas as pd
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import re
import string
import numpy as np
from dotenv import load_dotenv
import os
import logging
import time
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load environment variables and set up logging
# load_dotenv() is called with no arguments, so python-dotenv's default .env lookup applies.
load_dotenv()
# Root logger: INFO and above, each record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

  import pkg_resources  # noqa: TID251


In [6]:
# Download required NLTK data
import nltk
# 'wordnet' backs WordNetLemmatizer and 'stopwords' backs stopwords.words('english');
# both are used by the preprocessing helpers defined later in this notebook.
nltk.download('wordnet')
nltk.download('stopwords')
# NOTE(review): no POS tagger is called anywhere in the visible notebook — confirm
# whether this download is still needed.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lakshya\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lakshya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Lakshya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [7]:
# Load the full IMDB review dataset.
df = pd.read_csv('IMDB.csv')
# Draw a 500-row working sample. The sampler is seeded so that Restart & Run All
# always selects the same rows; an unseeded sample makes every downstream output
# (saved CSV, vocabulary, metrics) change from run to run.
df = df.sample(500, random_state=42)
# Persist the sample so the exact rows used in this experiment can be inspected later.
df.to_csv('sample_data.csv', index=False)
df.head()

Unnamed: 0,review,sentiment
751,This is one of the best and moodiest Vampire T...,positive
369,Guys and Dolls is a unique play based on the c...,positive
267,An excellent documentry. I personally remember...,positive
672,USA The Movie is like this: You take a nap on ...,positive
57,This film is worthwhile despite what you may h...,positive


In [8]:
# Data Preprocessing

# Define text preprocessing functions
def lemmatization(text):
    '''Lemmatize each whitespace-separated token and rejoin the tokens with single spaces.'''
    wnl = WordNetLemmatizer()
    # lemmatize is called with its default part-of-speech for every token.
    lemmas = map(wnl.lemmatize, text.split())
    return " ".join(lemmas)

def remove_stopwords(text):
    '''Drop every token that appears in NLTK's English stop-word list.

    The input is coerced with str() first; matching is exact and case-sensitive.
    '''
    english_stops = frozenset(stopwords.words('english'))
    kept = []
    for token in str(text).split():
        if token in english_stops:
            continue
        kept.append(token)
    return " ".join(kept)

def removing_numbers(text):
    '''Return the text with every digit character (per str.isdigit) stripped out.'''
    return ''.join(filter(lambda ch: not ch.isdigit(), text))

def lower_case(text):
    '''Return a lower-cased copy of the text.'''
    lowered = text.lower()
    return lowered

def remove_punctuation(text):
    '''Delete every character listed in string.punctuation from the text.'''
    # Mapping each punctuation character to None yields a pure deletion table.
    drop_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(drop_table)

def removing_url(text):
    '''Remove every run of non-whitespace characters that starts with "http".'''
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

def normalize_text(text):
    '''Normalize the 'review' column of a DataFrame.

    Despite its name, ``text`` is the DataFrame to clean (the parameter name is
    kept for backward compatibility). The previous implementation ignored this
    argument and always operated on the notebook-global ``df``; the frame that
    is passed in is now the one that is processed.

    The steps run in this order: lower-casing, stop-word removal, digit removal,
    punctuation removal, URL removal, lemmatization.

    Parameters
    ----------
    text : pandas.DataFrame
        Frame with a 'review' column of strings. The column is overwritten in
        place.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with 'review' normalized.

    Raises
    ------
    Exception
        Any error raised by a step is printed and re-raised unchanged.
    '''
    # NOTE(review): stop words are removed before punctuation is stripped, so a
    # token such as "this:" is not recognised as a stop word and survives as
    # "this". The order is preserved here to keep outputs unchanged.
    steps = (
        lower_case,
        remove_stopwords,
        removing_numbers,
        remove_punctuation,
        removing_url,
        lemmatization,
    )
    try:
        for step in steps:
            text['review'] = text['review'].apply(step)
        return text
    except Exception as e:
        print(f"Error in normalizing text: {e}")
        raise

In [9]:
# Run the full cleaning pipeline on the sampled reviews and preview the result.
df = normalize_text(df)
df.head()

Unnamed: 0,review,sentiment
751,one best moodiest vampire tale ever love movie...,positive
369,guy doll unique play based character sky maste...,positive
267,excellent documentry personally remember growi...,positive
672,usa movie like this take nap long hot sunday a...,positive
57,film worthwhile despite may hear performance m...,positive


In [10]:
# Class balance check: number of rows per sentiment label.
df['sentiment'].value_counts()

sentiment
positive    250
negative    250
Name: count, dtype: int64

In [11]:
# Keep only rows whose label is one of the two classes the model is trained on.
x = df['sentiment'].isin(['positive', 'negative'])
# .copy() makes the filtered frame independent of the frame it was selected from,
# so the column assignment in the following cell cannot hit pandas'
# chained-assignment path (SettingWithCopyWarning).
df = df[x].copy()

In [12]:
# Encode the string labels as integers: positive -> 1, negative -> 0.
# NOTE(review): Series.map turns any value missing from the dict into NaN, so
# running this cell a second time (labels already 0/1) would blank the column —
# re-run from the data-loading cell instead.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df.head()

Unnamed: 0,review,sentiment
751,one best moodiest vampire tale ever love movie...,1
369,guy doll unique play based character sky maste...,1
267,excellent documentry personally remember growi...,1
672,usa movie like this take nap long hot sunday a...,1
57,film worthwhile despite may hear performance m...,1


In [13]:
# Count missing values per column.
df.isna().sum()

review       0
sentiment    0
dtype: int64

In [14]:
# Bag-of-words features: term counts over a vocabulary capped at 100 terms.
# NOTE(review): the vocabulary is fitted on every row before the train/test split
# in the next cell, so held-out reviews influence which terms are kept — consider
# fitting on the training rows only.
vectorizer = CountVectorizer(max_features=100)
X = vectorizer.fit_transform(df['review'])
# Target vector: the sentiment column.
y = df['sentiment']

In [15]:
# Hold out a quarter of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [23]:
import dagshub

# Set up MLflow tracking
# Use the correct DagsHub tracking URI directly
tracking_uri = "https://dagshub.com/lakshya-hidau/MLOps-Learning.mlflow"

# Each of the three steps below is best-effort: a failure is logged as a warning
# and the notebook continues instead of stopping.

# Step 1: point the MLflow client at the remote tracking server.
try:
    mlflow.set_tracking_uri(tracking_uri)
    logging.info(f"MLflow tracking URI set to: {tracking_uri}")
except Exception as e:
    logging.warning(f"Failed to set MLflow tracking URI: {e}")
    logging.info("Using local MLflow tracking instead")

# Step 2: initialise the DagsHub integration for the same repository.
# NOTE(review): presumably this also configures authentication for the tracking
# server — confirm against the dagshub client documentation.
try:
    dagshub.init(repo_owner='lakshya-hidau', repo_name='MLOps-Learning', mlflow=True)
except Exception as e:
    logging.warning(f"DagsHub initialization failed: {e}")
    logging.info("Continuing without DagsHub integration")

# Step 3: select the experiment that subsequent runs are recorded under.
try:
    mlflow.set_experiment("Logistic Regression Baseline Model")
except Exception as e:
    logging.warning(f"Failed to set MLflow experiment: {e}")
    logging.info("Will use default experiment")

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Configure logging
# (basicConfig does nothing if the root logger already has handlers, e.g. from the first cell.)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.info("Starting MLflow run for Logistic Regression Baseline Model")

# Train the baseline classifier inside a single MLflow run and record its
# parameters, evaluation metrics, fitted model and (when available) this notebook.
with mlflow.start_run():
    start_time = time.time()

    try:
        logging.info("Logging preprocessing parameters")
        mlflow.log_param('vectorizer', 'Bag of words')
        # Read the value from the vectorizer itself so the logged parameter cannot
        # drift from what was actually used to build the feature matrix.
        mlflow.log_param('max_features', vectorizer.max_features)
        mlflow.log_param('test_size', 0.25)

        logging.info("Initializing Logistic Regression model")
        model = LogisticRegression(max_iter=1000)

        logging.info("Fitting the model")
        model.fit(X_train, y_train)
        logging.info("Training completed")

        logging.info("Logging model parameters")
        mlflow.log_param("model", "Logistic Regression")

        logging.info("Making predictions")
        y_pred = model.predict(X_test)

        logging.info("Calculating evaluation metrics")
        accuracy = accuracy_score(y_test, y_pred)
        # Precision, recall and F1 are support-weighted averages over both classes.
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        logging.info("Logging evaluation metrics")
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        logging.info("Saving and logging the model")
        mlflow.sklearn.log_model(model, "logistic_regression_model")

        # Log execution time (covers fitting, evaluation and the model upload above)
        end_time = time.time()
        mlflow.log_metric("execution_time_seconds", end_time - start_time)

        # Save and log the notebook
        # The path is relative to the kernel's working directory. A missing copy of
        # the notebook should not mark an otherwise complete run as failed, so it
        # is skipped with a warning instead of raising.
        notebook_path = "exp1_baseline_model.ipynb"
        if os.path.exists(notebook_path):
            mlflow.log_artifact(notebook_path)
        else:
            logging.warning(
                f"Notebook '{notebook_path}' not found in {os.getcwd()}; skipping artifact upload"
            )
        logging.info("MLflow run completed successfully")

        # Print the evaluation metrics
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 Score: {f1}")

    except Exception as e:
        # Record the failure, then re-raise so the run is not reported as successful.
        logging.error(f"An error occurred: {e}")
        raise



Accuracy: 0.68
Precision: 0.6833246753246753
Recall: 0.68
F1 Score: 0.6788495120698511


2026/02/18 00:10:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run blushing-horse-342 at: https://dagshub.com/lakshya-hidau/MLOps-Learning.mlflow/#/experiments/0/runs/7e97d78099484cfab8720159820420ad.
2026/02/18 00:10:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/lakshya-hidau/MLOps-Learning.mlflow/#/experiments/0.
