In [1]:
import mlflow
import pandas as pd
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

In [2]:
reviews = pd.read_csv(r"C:\Users\work\Documents\Kaggle\train\amazon_reviews_train_clean.csv")  # read kaggle cleaned data
reviews.isna().sum()
reviews['text'].fillna("", inplace=True)
reviews.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reviews['text'].fillna("", inplace=True)


label    0
text     0
dtype: int64

In [3]:
reviews['label'].value_counts()

label
2    1800000
1    1800000
Name: count, dtype: int64

In [5]:
vectorizer = CountVectorizer(max_features=1000)
X = vectorizer.fit_transform(reviews['text'])
y = reviews['label']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
import dagshub
mlflow.set_tracking_uri('https://dagshub.com/kunalkaushik1982/amazon-product-reviews.mlflow')
dagshub.init(repo_owner='kunalkaushik1982', repo_name='amazon-product-reviews', mlflow=True)

mlflow.set_experiment("Logistic Regression Baseline")

2024/12/29 20:38:44 INFO mlflow.tracking.fluent: Experiment with name 'Logistic Regression Baseline' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/81969726bab446de91a85204c7a8bb75', creation_time=1735484924728, experiment_id='1', last_update_time=1735484924728, lifecycle_stage='active', name='Logistic Regression Baseline', tags={}>

In [8]:
with mlflow.start_run():
    
    # Log preprocessing parameters
    mlflow.log_param("vectorizer", "Bag of Words")
    mlflow.log_param("num_features", 2000)
    mlflow.log_param("test_size", 0.2)
    
    # Model building and training
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    
    # Log model parameters
    mlflow.log_param("model", "Logistic Regression")
    
    # Model evaluation
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Log evaluation metrics
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)
    
    # Log model
    mlflow.sklearn.log_model(model, "model")

    # Save and log the notebook
    import os
    notebook_path = "exp1_baseline_model.ipynb"
    os.system(f"jupyter nbconvert --to notebook --execute --inplace {notebook_path}")
    mlflow.log_artifact(notebook_path)
    
    # Print the results for verification
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")



Accuracy: 0.8718527777777778
Precision: 0.876896154290529
Recall: 0.8649623775916655
F1 Score: 0.8708883857479899
üèÉ View run polite-newt-702 at: https://dagshub.com/kunalkaushik1982/amazon-product-reviews.mlflow/#/experiments/1/runs/5eec78fbb53a431bbc2500ad5f33bc25
üß™ View experiment at: https://dagshub.com/kunalkaushik1982/amazon-product-reviews.mlflow/#/experiments/1
