In [1]:
!pip install mlflow boto3 awscli optuna imbalanced-learn lightgbm

Collecting mlflow
  Downloading mlflow-3.1.1-py3-none-any.whl.metadata (29 kB)
Collecting boto3
  Downloading boto3-1.39.7-py3-none-any.whl.metadata (6.6 kB)
Collecting awscli
  Downloading awscli-1.41.7-py3-none-any.whl.metadata (11 kB)
Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting mlflow-skinny==3.1.1 (from mlflow)
  Downloading mlflow_skinny-3.1.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading databricks_sdk-0.58.0-py3-none-any.whl.metadata (39 kB)
Collecti

In [2]:
!aws configure

AWS Access Key ID [None]: AKIAWTYSLIKW7C7BFZEB 
AWS Secret Access Key [None]: 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
Default region name [None]: us-east-1
Default output format [None]: 


In [3]:
import mlflow
mlflow.set_tracking_uri('http://44.203.66.130:5000')

In [4]:
mlflow.set_experiment('LightGBM HP Tuning')

2025/07/16 23:10:34 INFO mlflow.tracking.fluent: Experiment with name 'LightGBM HP Tuning' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlflow-bucket-27/336200325892556928', creation_time=1752707434480, experiment_id='336200325892556928', last_update_time=1752707434480, lifecycle_stage='active', name='LightGBM HP Tuning', tags={}>

In [6]:
import pandas as pd

df = pd.read_csv('/content/reddit_preprocessing.csv')
df.shape

(36793, 2)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import mlflow
import mlflow.sklearn
import optuna
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt

In [8]:
df['category'] = df['category'].map({-1: 2, 0: 0, 1: 1})

df = df.dropna(subset=['category'])

In [10]:
# Drop rows with missing values in 'clean_comment' column
df.dropna(subset=['clean_comment'], inplace=True)

# TF-IDF Vectorizer setup
ngram_range = (1,3)
max_features = 1000
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X = vectorizer.fit_transform(df['clean_comment'])
y = df['category']

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [11]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [13]:
# Function to log results in MLflow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test, params, trial_number):
    with mlflow.start_run():
        # Log model type and trial number
        mlflow.set_tag("mlflow.runName", f"Trial_{trial_number}_{model_name}_SMOTE_TFIDF_Trigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)

        # Log hyperparameters
        for key, value in params.items():
            mlflow.log_param(key, value)

        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")

        return accuracy

In [14]:
# Optuna objective function fgor LightGBM
def objective_lightgbm(trial):
  # Hyperparameter space to explore
  n_estimators = trial.suggest_int('n_estimators', 100, 1000)
  learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
  max_depth = trial.suggest_int('max_depth', 3, 15)
  num_leaves = trial.suggest_int('num_leaves', 20, 150)
  min_child_samples = trial.suggest_int('min_child_samples', 10, 100)
  colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
  subsample = trial.suggest_float('subsample', 0.5, 1.0)
  reg_alpha = trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True) # L1 regularization
  reg_lambda = trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True) # L2 regularization

  # Log trial parameters
  params = {
      'n_estimators': n_estimators,
      'learning_rate': learning_rate,
      'max_depth': max_depth,
      'num_leaves': num_leaves,
      'min_child_samples': min_child_samples,
      'colsample_bytree': colsample_bytree,
      'subsample': subsample,
      'reg_alpha': reg_alpha,
      'reg_lambda': reg_lambda
  }

  # Create LightGBM model
  model = LGBMClassifier(n_estimators=n_estimators,
                         learning_rate=learning_rate,
                         max_depth=max_depth,
                         num_leaves=num_leaves,
                         min_child_samples=min_child_samples,
                         colsample_bytree=colsample_bytree,
                         subsample=subsample,
                         reg_alpha=reg_alpha,
                         reg_lambda=reg_lambda,
                         random_state=42)
  # Log each trial as a separate run in MLflow
  accuracy = log_mlflow("LightGBM", model, X_train, X_test, y_train, y_test, params, trial.number)

  return accuracy

In [15]:
# Run Optuna for LightGBM, log the best model, and plot the importance of each parameter
def run_optuna_experiment():
  study = optuna.create_study(direction='maximize')
  study.optimize(objective_lightgbm, n_trials=100)

  # Get the best parameters
  best_params = study.best_params
  best_model = LGBMClassifier(n_estimators=best_params['n_estimators'],
                              learning_rate=best_params['learning_rate'],
                              max_depth=best_params['max_depth'],
                              num_leaves=best_params['num_leaves'],
                              min_child_samples=best_params['min_child_samples'],
                              colsample_bytree=best_params['colsample_bytree'],
                              subsample=best_params['subsample'],
                              reg_alpha=best_params['reg_alpha'],
                              reg_lambda=best_params['reg_lambda'],
                              random_state=42
                              )

  # Log the best model with MLflow and print the classification report
  log_mlflow("LightGBM", best_model, X_train, X_test, y_train, y_test, best_params, "Best")

  # Plot parameter importance
  optuna.visualization.plot_param_importances(study).show()

  # Plot optimization history
  optuna.visualization.plot_optimization_history(study).show()

In [16]:
# Run the experiment for LightGBM
run_optuna_experiment()

Output hidden; open in https://colab.research.google.com to view.