In [1]:
# bow vs tfidf

# Import necessary libraries
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import os
import dagshub

In [2]:
reviews = pd.read_csv(r"C:\Users\work\Documents\Kaggle\train\amazon_reviews_train_clean.csv")  # read kaggle cleaned data
reviews.isna().sum()
reviews['text'].fillna("", inplace=True)
reviews.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reviews['text'].fillna("", inplace=True)


label    0
text     0
dtype: int64

In [3]:
import dagshub
mlflow.set_tracking_uri('https://dagshub.com/kunalkaushik1982/amazon-product-reviews.mlflow')
dagshub.init(repo_owner='kunalkaushik1982', repo_name='amazon-product-reviews', mlflow=True)

mlflow.set_experiment("Bow vs TfIdf")

2024/12/29 20:46:00 INFO mlflow.tracking.fluent: Experiment with name 'Bow vs TfIdf' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/69dd8acad52f432d870a1f457a075854', creation_time=1735485361023, experiment_id='2', last_update_time=1735485361023, lifecycle_stage='active', name='Bow vs TfIdf', tags={}>

In [4]:
# Define feature extraction methods
vectorizers = {
    'BoW': CountVectorizer(),
    'TF-IDF': TfidfVectorizer()
}

# Define algorithms
algorithms = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'MultinomialNB': MultinomialNB(),
    'XGBoost': XGBClassifier(),
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier()
}

In [None]:
# Start the parent run
with mlflow.start_run(run_name="All Experiments") as parent_run:
    # Loop through algorithms and feature extraction methods (Child Runs)
    for algo_name, algorithm in algorithms.items():
        for vec_name, vectorizer in vectorizers.items():
            with mlflow.start_run(run_name=f"{algo_name} with {vec_name}", nested=True) as child_run:
                X = vectorizer.fit_transform(reviews['text'])
                y = reviews['label']
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
                

                # Log preprocessing parameters
                mlflow.log_param("vectorizer", vec_name)
                mlflow.log_param("algorithm", algo_name)
                mlflow.log_param("test_size", 0.2)
                
                # Model training
                model = algorithm
                if algo_name == 'XGBoost':
                    y_train=y_train-1
                    y_test = y_test - 1
                model.fit(X_train, y_train)
                
                # Log model parameters
                if algo_name == 'LogisticRegression':
                    mlflow.log_param("C", model.C)
                elif algo_name == 'MultinomialNB':
                    mlflow.log_param("alpha", model.alpha)
                elif algo_name == 'XGBoost':
                    mlflow.log_param("n_estimators", model.n_estimators)
                    mlflow.log_param("learning_rate", model.learning_rate)
                elif algo_name == 'RandomForest':
                    mlflow.log_param("n_estimators", model.n_estimators)
                    mlflow.log_param("max_depth", model.max_depth)
                elif algo_name == 'GradientBoosting':
                    mlflow.log_param("n_estimators", model.n_estimators)
                    mlflow.log_param("learning_rate", model.learning_rate)
                    mlflow.log_param("max_depth", model.max_depth)
                
                # Model evaluation
                y_pred = model.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred)
                recall = recall_score(y_test,y_pred)
                f1 = f1_score(y_test, y_pred)

                if algo_name == 'XGBoost':
                    y_pred=y_pred+1
                    
                
                # Log evaluation metrics
                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_metric("precision", precision)
                mlflow.log_metric("recall", recall)
                mlflow.log_metric("f1_score", f1)
                
                # Log model
                mlflow.sklearn.log_model(model, "model")

                 # Save and log the notebook
                import os
                notebook_path = "exp2_bow_vs_tfidf.ipynb"
                os.system(f"jupyter nbconvert --to notebook --execute --inplace {notebook_path}")
                mlflow.log_artifact(notebook_path)
                                
                # Print the results for verification
                print(f"Algorithm: {algo_name}, Feature Engineering: {vec_name}")
                print(f"Accuracy: {accuracy}")
                print(f"Precision: {precision}")
                print(f"Recall: {recall}")
                print(f"F1 Score: {f1}")



Algorithm: LogisticRegression, Feature Engineering: BoW
Accuracy: 0.9042875
Precision: 0.9069187634312321
Recall: 0.9009114434941169
F1 Score: 0.9039051224664812
🏃 View run LogisticRegression with BoW at: https://dagshub.com/kunalkaushik1982/amazon-product-reviews.mlflow/#/experiments/2/runs/595fdaff16cb40c69867217d1d1e31d5
🧪 View experiment at: https://dagshub.com/kunalkaushik1982/amazon-product-reviews.mlflow/#/experiments/2




Algorithm: LogisticRegression, Feature Engineering: TF-IDF
Accuracy: 0.9050041666666667
Precision: 0.9065644866907419
Recall: 0.9029433593044233
F1 Score: 0.9047502997572697
🏃 View run LogisticRegression with TF-IDF at: https://dagshub.com/kunalkaushik1982/amazon-product-reviews.mlflow/#/experiments/2/runs/20ef588c758d4d4ab62b0fd13658982a
🧪 View experiment at: https://dagshub.com/kunalkaushik1982/amazon-product-reviews.mlflow/#/experiments/2




Algorithm: MultinomialNB, Feature Engineering: BoW
Accuracy: 0.8536361111111112
Precision: 0.8434714274526537
Recall: 0.8681923176348612
F1 Score: 0.8556533554684397
🏃 View run MultinomialNB with BoW at: https://dagshub.com/kunalkaushik1982/amazon-product-reviews.mlflow/#/experiments/2/runs/40af7ae2250d40759d09e7f9f3fea043
🧪 View experiment at: https://dagshub.com/kunalkaushik1982/amazon-product-reviews.mlflow/#/experiments/2




Algorithm: MultinomialNB, Feature Engineering: TF-IDF
Accuracy: 0.8448541666666667
Precision: 0.8340236782005429
Recall: 0.860806817897537
F1 Score: 0.8472036231775026
🏃 View run MultinomialNB with TF-IDF at: https://dagshub.com/kunalkaushik1982/amazon-product-reviews.mlflow/#/experiments/2/runs/72b859b0b7bb4299843252e00ca8e813
🧪 View experiment at: https://dagshub.com/kunalkaushik1982/amazon-product-reviews.mlflow/#/experiments/2




Algorithm: XGBoost, Feature Engineering: BoW
Accuracy: 0.8674208333333333
Precision: 0.8619918410202005
Recall: 0.875130815204266
F1 Score: 0.8685116388626023
🏃 View run XGBoost with BoW at: https://dagshub.com/kunalkaushik1982/amazon-product-reviews.mlflow/#/experiments/2/runs/53a11c34d71d4bacbdceeec584f94c33
🧪 View experiment at: https://dagshub.com/kunalkaushik1982/amazon-product-reviews.mlflow/#/experiments/2




Algorithm: XGBoost, Feature Engineering: TF-IDF
Accuracy: 0.8676916666666666
Precision: 0.8606854383151796
Recall: 0.8776152631155254
F1 Score: 0.8690679084291778
🏃 View run XGBoost with TF-IDF at: https://dagshub.com/kunalkaushik1982/amazon-product-reviews.mlflow/#/experiments/2/runs/15c12f966d8e440aa5c5cbf0094a04cb
🧪 View experiment at: https://dagshub.com/kunalkaushik1982/amazon-product-reviews.mlflow/#/experiments/2
