In [20]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
df = pd.read_csv('/Users/harishsundaralingam/myworkspace/sentiment_analysis/model_notebooks/dataset.csv').dropna(subset=['clean_comment'])
df.shape

(36662, 2)

In [22]:
import mlflow
# Step 2: Set up the MLflow tracking server
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [23]:
# Set or create an experiment
mlflow.set_experiment("XGBoost")

<Experiment: artifact_location='mlflow-artifacts:/309646884127018700', creation_time=1755036041652, experiment_id='309646884127018700', last_update_time=1755036041652, lifecycle_stage='active', name='XGBoost', tags={}>

In [24]:
df['category'] = df['category'].map({-1: 2, 0: 0, 1: 1})

In [25]:
df = df.dropna(subset=['category'])

In [26]:
ngram_range = (1, 3)  # Trigram setting
max_features = 10000  # Set max_features to 1000 for TF-IDF

In [27]:
# Step 4: Train-test split before vectorization and resampling
X_train, X_test, y_train, y_test = train_test_split(df['clean_comment'], df['category'], test_size=0.2, random_state=42, stratify=df['category'])

In [28]:
# Step 2: Vectorization using TF-IDF, fit on training data only
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X_train_vec = vectorizer.fit_transform(X_train)  # Fit on training data
X_test_vec = vectorizer.transform(X_test)  # Transform test data

In [29]:
smote = SMOTE(random_state=42)
X_train_vec, y_train = smote.fit_resample(X_train_vec, y_train)

In [40]:
X_train_vec

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 612381 stored elements and shape (37848, 10000)>

In [30]:
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run():
        # Log model type
        mlflow.set_tag("mlflow.runName", f"{model_name}_SMOTE_TFIDF_Trigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)

        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")

In [41]:
def objective(trial):
    params = {
        'objective': 'multi:softprob',
        'num_class': 3,  # Adjust based on your number of classes
        'eval_metric': 'mlogloss',
        'booster': 'gbtree',
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': 42
    }
    
    model = xgb.XGBClassifier(**params)
    model.fit(
        X_train_vec, y_train,
        verbose=False
    )
    preds = model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, preds)
    
    # Return negative accuracy since Optuna minimizes
    return -accuracy

In [42]:
study = optuna.create_study(direction='minimize')  # or 'maximize' for accuracy
study.optimize(objective, n_trials=20)  # Adjust n_trials as needed

# Print results
print("Best trial:")
print(f"Value: {study.best_trial.value}")
print("Params:")
for key, value in study.best_trial.params.items():
    print(f"  {key}: {value}")

[I 2025-08-12 18:27:48,621] A new study created in memory with name: no-name-2d45db36-fbbf-4062-9dbe-5cb562b8059a
[I 2025-08-12 18:28:21,841] Trial 0 finished with value: -0.8300831855993455 and parameters: {'max_depth': 11, 'learning_rate': 0.23163850944424702, 'n_estimators': 929, 'subsample': 0.6935714077952102, 'colsample_bytree': 0.6163141784293505, 'gamma': 2.4793588852520787, 'reg_alpha': 3.402883649301037, 'reg_lambda': 0.5452285824696057}. Best is trial 0 with value: -0.8300831855993455.
[I 2025-08-12 18:28:53,772] Trial 1 finished with value: -0.8558570844129278 and parameters: {'max_depth': 11, 'learning_rate': 0.24568736141295838, 'n_estimators': 608, 'subsample': 0.84118327232876, 'colsample_bytree': 0.7498628507938736, 'gamma': 4.217792442920924, 'reg_alpha': 0.43287235907399635, 'reg_lambda': 0.02812355205783157}. Best is trial 1 with value: -0.8558570844129278.
[I 2025-08-12 18:29:19,664] Trial 2 finished with value: -0.8074457929905905 and parameters: {'max_depth': 9, 

Best trial:
Value: -0.8611755079776353
Params:
  max_depth: 11
  learning_rate: 0.09177185181094655
  n_estimators: 847
  subsample: 0.837623138448082
  colsample_bytree: 0.93132206077289
  gamma: 1.3333881800717446
  reg_alpha: 0.07188612985453702
  reg_lambda: 4.353761449390493


In [43]:
best_params = study.best_params
best_model = xgb.XGBClassifier(n_estimators=best_params['n_estimators'], learning_rate=best_params['learning_rate'], max_depth=best_params['max_depth'], random_state=42)

# Log the best model with MLflow, passing the algo_name as "xgboost"
log_mlflow("XGBoost", best_model, X_train_vec, X_test_vec, y_train, y_test)



🏃 View run XGBoost_SMOTE_TFIDF_Trigrams at: http://127.0.0.1:5000/#/experiments/309646884127018700/runs/b1332395a24541158d0a4aad64a2af26
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/309646884127018700
