# Which vectorization?

In [1]:
!pip install mlflow boto3 awscli

Collecting mlflow
  Downloading mlflow-3.1.0-py3-none-any.whl.metadata (29 kB)
Collecting boto3
  Downloading boto3-1.38.41-py3-none-any.whl.metadata (6.6 kB)
Collecting awscli
  Downloading awscli-1.40.40-py3-none-any.whl.metadata (11 kB)
Collecting mlflow-skinny==3.1.0 (from mlflow)
  Downloading mlflow_skinny-3.1.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading databricks_sdk-0.57.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.0->mlflow)
  Download

In [2]:
!aws configure

AWS Access Key ID [None]: AKIAWTYSLIKW7C7BFZEB
AWS Secret Access Key [None]: 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
Default region name [None]: us-east-1
Default output format [None]: 


In [5]:
import mlflow
mlflow.set_tracking_uri("http://ec2-3-82-171-148.compute-1.amazonaws.com:5000/")

In [6]:
mlflow.set_experiment('Exp 2 - BoW vs TfIdf')

<Experiment: artifact_location='s3://mlflow-bucket-27/372520452379824492', creation_time=1750361911879, experiment_id='372520452379824492', last_update_time=1750361911879, lifecycle_stage='active', name='Exp 2 - BoW vs TfIdf', tags={}>

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import mlflow.sklearn
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [9]:
df = pd.read_csv('/content/reddit_preprocessing.csv').dropna(subset=['clean_comment'])
df.shape

(36664, 2)

In [10]:
# Step 1: function to run the experiment
def run_experiment(vectorizer_type, ngram_range, vectorizer_max_features, vectorizer_name):
  # Step 2: vectorization
  if vectorizer_type == 'BoW':
    vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)
  elif vectorizer_type == 'TfIdf':
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)
  else:
    print('Vectorizer type not supported')
    return

  X_train, X_test, y_train, y_test = train_test_split(df['clean_comment'], df['category'], test_size=0.2, random_state=42)

  X_train = vectorizer.fit_transform(X_train)
  X_test = vectorizer.transform(X_test)

  # Step 3: Define and train a RF model
  with mlflow.start_run() as run:
    # Set tags for the experiment and run
    mlflow.set_tag('mlflow.runName', f'{vectorizer_name}_{ngram_range}_RandomForest')
    mlflow.set_tag('experiment_type', 'feature_engineering')
    mlflow.set_tag('model_type', 'RandomForestClassifier')

    # Add a description
    mlflow.set_tag('description', f'RandomForest with {vectorizer_name}, ngram_range={ngram_range}, max_features={vectorizer_max_features}')

    # Log vectorizer parameters
    mlflow.log_param('vectorizer_type', vectorizer_type)
    mlflow.log_param('ngram_range', ngram_range)
    mlflow.log_param('vectorizer_max_features', vectorizer_max_features)

    # Log Random Forest parameters
    n_estimators = 200
    max_depth = 15

    mlflow.log_param('n_estimators', n_estimators)
    mlflow.log_param('max_depth', max_depth)

    # Initialize and train the model
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)

    # Step 4: Make predictions and log metrics
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric('accuracy', accuracy)

    # Log classification report
    classification_rep = classification_report(y_test, y_pred, output_dict=True)
    for label, metrics in classification_rep.items():
      if isinstance(metrics, dict):
        for metric_name, metric_value in metrics.items():
          mlflow.log_metric(f'{label}_{metric_name}', metric_value)

    # Log confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix: {vectorizer_name}, {ngram_range}')
    plt.savefig('confusion_matrix.png')
    mlflow.log_artifact('confusion_matrix.png')
    plt.close()

    # Log the model
    mlflow.sklearn.log_model(model, f'random_forest_model_{vectorizer_name}_{ngram_range}')

# Step 6: Run experiments for BoW and TF-IDF with different n-grams
ngram_ranges = [(1, 1), (1, 2), (1, 3)]
max_features = 5000

for ngram_range in ngram_ranges:
  run_experiment('BoW', ngram_range, max_features, vectorizer_name='BoW')
  run_experiment('TfIdf', ngram_range, max_features, vectorizer_name='TfIdf')



🏃 View run BoW_(1, 1)_RandomForest at: http://ec2-3-82-171-148.compute-1.amazonaws.com:5000/#/experiments/372520452379824492/runs/226a11a790364865a0eede90c98545cc
🧪 View experiment at: http://ec2-3-82-171-148.compute-1.amazonaws.com:5000/#/experiments/372520452379824492




🏃 View run TfIdf_(1, 1)_RandomForest at: http://ec2-3-82-171-148.compute-1.amazonaws.com:5000/#/experiments/372520452379824492/runs/5d8d09bbf5f542238735a21219f9690a
🧪 View experiment at: http://ec2-3-82-171-148.compute-1.amazonaws.com:5000/#/experiments/372520452379824492




🏃 View run BoW_(1, 2)_RandomForest at: http://ec2-3-82-171-148.compute-1.amazonaws.com:5000/#/experiments/372520452379824492/runs/90ce839ff8e7455688ac70d86193b0b4
🧪 View experiment at: http://ec2-3-82-171-148.compute-1.amazonaws.com:5000/#/experiments/372520452379824492




🏃 View run TfIdf_(1, 2)_RandomForest at: http://ec2-3-82-171-148.compute-1.amazonaws.com:5000/#/experiments/372520452379824492/runs/3bfe5fa49ba7406997b4b2585c1cabb6
🧪 View experiment at: http://ec2-3-82-171-148.compute-1.amazonaws.com:5000/#/experiments/372520452379824492




🏃 View run BoW_(1, 3)_RandomForest at: http://ec2-3-82-171-148.compute-1.amazonaws.com:5000/#/experiments/372520452379824492/runs/690ddd59dfe742acb283ba1b174db03a
🧪 View experiment at: http://ec2-3-82-171-148.compute-1.amazonaws.com:5000/#/experiments/372520452379824492




🏃 View run TfIdf_(1, 3)_RandomForest at: http://ec2-3-82-171-148.compute-1.amazonaws.com:5000/#/experiments/372520452379824492/runs/e2a7b78cba524b70ad43c0f4087c1351
🧪 View experiment at: http://ec2-3-82-171-148.compute-1.amazonaws.com:5000/#/experiments/372520452379824492
