In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import classification_report
import mlflow
import mlflow.sklearn
from onnxmltools import convert_sklearn
from onnxmltools.utils import save_model
import pickle
import re



* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
df = pd.read_csv("IMDB Dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
def preprocessing(review):
    review = re.sub(r'http\S+|www\S+|https\S+', '', review, flags=re.MULTILINE)
    # Remove HTML tags
    review = re.sub(r'<.*?>', '', review)
    
    # Remove special characters and punctuation
    review = re.sub(r'[^\w\s]', '', review)
    
    # Remove extra whitespaces
    review = ' '.join(review.split())
    
    # Remove mentions and hashtags
    review = re.sub(r'@\w+|#\w+', '', review)
    return review


In [5]:
# Apply preprocessing
df['processed_reviews'] = df['review'].apply(preprocessing)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(df['processed_reviews'], df['sentiment'], test_size=0.2)

# Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [6]:
X_train.shape

(40000,)

In [7]:
import joblib
joblib.dump(vectorizer, 'models/fitted_vectorizer.pkl')

['models/fitted_vectorizer.pkl']

# Mlflow

In [8]:
# Define a list of models
models = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(),
    #"SVM": svm.SVC(),
}

# Function to train and log model
def train_and_log_model(name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        report = classification_report(y_test, predictions, output_dict=True)

        # Log model and params
        mlflow.log_param("model_name", name)
        mlflow.log_metrics({"accuracy": report['accuracy']})
        mlflow.sklearn.log_model(model, "model")

        return report['accuracy']

# Train and log each model
for name, model in models.items():
    accuracy = train_and_log_model(name, model, X_train_vec, X_test_vec, y_train, y_test)
    print(f"Model: {name}, Accuracy: {accuracy}")


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model: Logistic Regression, Accuracy: 0.8894


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model: Naive Bayes, Accuracy: 0.8505


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model: Random Forest, Accuracy: 0.8424




In [9]:
with open('vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)
