In [1]:
import dagshub
import logging
import os
import optuna
import mlflow.sklearn
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score

In [2]:
# Read the review dataset from the working directory and preview it; the
# columns used later are the text column `review` and the label `sentiment`.
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,review,sentiment
0,film version sandra bernhard one woman broadwa...,0
1,switched cable whim treated quite surprise alt...,1
2,plot film contains hole could drive massive tr...,0
3,amusing humor fall flat decent acting quite at...,0
4,say movie terrible good two day earlier watche...,0


In [3]:
# Initialise the DagsHub integration for this repository with MLflow enabled,
# then select the experiment that the TF-IDF + XGBoost runs are logged under.
dagshub.init(
    repo_owner='itsalok2',
    repo_name='nlp_end_to_end',
    mlflow=True,
)
mlflow.set_experiment('xgboost with optuna')

<Experiment: artifact_location='mlflow-artifacts:/6941837213f54b508e4a8521acf208ac', creation_time=1759994312127, experiment_id='1', last_update_time=1759994312127, lifecycle_stage='active', name='xgboost with optuna', tags={}>

In [None]:
# 1. Features are the review text; the target is the sentiment label.
x, y = df['review'], df['sentiment']

# 2. Hold out 20% of the rows for evaluation (fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# 3. TF-IDF vectorizer.
#    max_features=500 caps the vocabulary at the 500 most frequent terms (it
#    does NOT remove stopwords); ngram_range=(1, 2) extracts unigrams and
#    bigrams.
tfidf = TfidfVectorizer(max_features=500, ngram_range=(1, 2))

# 4. Fit the vectorizer on the training split only, then apply the fitted
#    vectorizer to both splits.
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)

# Turn on MLflow autologging for XGBoost training calls.
mlflow.xgboost.autolog()

# Emit log records at INFO and above with a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

def objective(trial):
    """Train one XGBoost model for an Optuna trial and return its accuracy.

    The trial executes inside a nested MLflow run named ``Trial_<number>``.
    Hyperparameters and the boosting-round budget are sampled from ``trial``;
    the model is trained on the module-level ``x_train_tfidf`` / ``y_train``
    and scored on ``x_test_tfidf`` / ``y_test``.

    NOTE(review): the same held-out split is used for early stopping and for
    the reported metrics, so the scores are likely optimistic — consider a
    separate validation split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the 0.5-thresholded predictions on the held-out split.
    """
    with mlflow.start_run(run_name=f"Trial_{trial.number}", nested=True):
        logging.info(f"Starting trial {trial.number}...")

        # Settings that stay the same for every trial.
        fixed_settings = {
            "verbosity": 0,
            "objective": "binary:logistic",
            "eval_metric": "error",  # classification error, i.e. 1 - accuracy
        }
        # Settings sampled by Optuna (sampling order kept as-is).
        sampled_settings = {
            "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
            "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
            "max_depth": trial.suggest_int("max_depth", 5, 15),
            "eta": trial.suggest_float("eta", 0.01, 0.3, log=True),
            "gamma": trial.suggest_float("gamma", 1e-8, 10.0, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        }
        booster_params = {**fixed_settings, **sampled_settings}
        max_rounds = trial.suggest_int("num_boost_round", 300, 1500)

        # Wrap both splits in XGBoost's native data container.
        train_matrix = xgb.DMatrix(x_train_tfidf, label=y_train)
        holdout_matrix = xgb.DMatrix(x_test_tfidf, label=y_test)

        # Training stops early once the monitored metric has not improved
        # for 10 consecutive rounds.
        watchlist = [(train_matrix, 'training'), (holdout_matrix, "validation")]
        booster = xgb.train(
            booster_params,
            train_matrix,
            num_boost_round=max_rounds,
            evals=watchlist,
            early_stopping_rounds=10,
            verbose_eval=True,
        )

        # Threshold the predicted probabilities at 0.5 to get class labels.
        probabilities = booster.predict(holdout_matrix)
        predicted = (probabilities > 0.5).astype(int)

        scores = {
            "accuracy": accuracy_score(y_test, predicted),
            "precision": precision_score(y_test, predicted),
            "recall": recall_score(y_test, predicted),
            "f1_score": f1_score(y_test, predicted),
        }

        # Report each metric in the notebook log ...
        display_names = {
            "accuracy": "Accuracy",
            "precision": "Precision",
            "recall": "Recall",
            "f1_score": "F1 Score",
        }
        for key, label in display_names.items():
            logging.info(f"{label}: {scores[key]}")

        # ... and record it on the nested MLflow run.
        for key, value in scores.items():
            mlflow.log_metric(key, value)

    return scores["accuracy"]  # the study maximises this value

# Parent MLflow run: each Optuna trial opens its own nested run beneath it.
with mlflow.start_run(
    run_name="XGBoost_Hyperparameter_Tuning_Parent_500fea",
):
    logging.info("Starting Optuna hyperparameter tuning...")

    # objective() returns accuracy, so the study searches for the maximum.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=20)

    logging.info("Hyperparameter tuning complete.")
    logging.info(f"Best trial parameters: {study.best_trial.params}")
    logging.info(f"Best accuracy: {study.best_value}")


In [None]:
# Review text is the input, the sentiment label is the target.
x, y = df['review'], df['sentiment']

# 80/20 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# TF-IDF over unigrams and bigrams, vocabulary capped at 500 terms; fitted on
# the training split and then applied to both splits.
tfidf = TfidfVectorizer(
    max_features=500,
    ngram_range=(1, 2),
)
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)

# Autolog XGBoost training to MLflow.
mlflow.xgboost.autolog()

# INFO-level logging with timestamps.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

def objective(trial):
    """Optuna objective: fit a DART booster on the TF-IDF features.

    Runs inside a nested MLflow run named ``Trial_<number>``. The booster is
    configured with ``device='cuda'`` and ``tree_method='hist'``; the
    regularisation, depth, learning-rate and sampling parameters plus the
    round budget are drawn from ``trial``. Training uses the module-level
    ``x_train_tfidf`` / ``y_train``; evaluation uses ``x_test_tfidf`` /
    ``y_test``.

    Args:
        trial: Optuna trial supplying the sampled hyperparameters.

    Returns:
        Accuracy of the 0.5-thresholded predictions on the test split.
    """
    run_name = f"Trial_{trial.number}"
    with mlflow.start_run(run_name=run_name, nested=True):
        logging.info(f"starting trial {trial.number}...")

        # Settings shared by every trial.
        params = dict(
            verbosity=1,
            objective='binary:logistic',
            eval_metric='error',
            device='cuda',
            tree_method='hist',
            booster='dart',
        )
        # Sampled settings are added by key ('lambda' is a Python keyword and
        # cannot be passed as a keyword argument). Sampling order is kept.
        params['lambda'] = trial.suggest_float('lambda', 1e-3, 1.0, log=True)
        params['alpha'] = trial.suggest_float('alpha', 1e-3, 2.0, log=True)
        params['max_depth'] = trial.suggest_int('max_depth', 5, 15)
        params['eta'] = trial.suggest_float('eta', 0.01, 0.3, log=True)
        params['gamma'] = trial.suggest_float('gamma', 1e-3, 1, log=True)
        params['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)
        params['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.5, 0.8)

        num_boost_round = trial.suggest_int('num_boost_round', 300, 1400)

        dtrain = xgb.DMatrix(x_train_tfidf, label=y_train)
        dtest = xgb.DMatrix(x_test_tfidf, label=y_test)

        # Stop after 10 rounds without improvement; print progress every
        # 50 rounds.
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=[(dtrain, 'training'), (dtest, 'testing')],
            early_stopping_rounds=10,
            verbose_eval=50,
        )

        # Probabilities above 0.5 are labelled as class 1.
        pred_labels = np.where(bst.predict(dtest) > 0.5, 1, 0)

        accuracy = accuracy_score(y_test, pred_labels)
        precision = precision_score(y_test, pred_labels)
        recall = recall_score(y_test, pred_labels)
        f1 = f1_score(y_test, pred_labels)

        # Echo the metrics to the notebook log.
        for label, value in (
            ("Accuracy", accuracy),
            ("Precision", precision),
            ("Recall", recall),
            ("F1 Score", f1),
        ):
            logging.info(f"{label}: {value}")

        # Record all four metrics on the nested run in a single call.
        trial_metrics = {
            'accuracy': accuracy,
            'precision': precision,
            'f1_score': f1,
            'recall': recall,
        }
        mlflow.log_metrics(trial_metrics)

    return accuracy

# Parent MLflow run for the 30-trial study; trials log as nested runs.
with mlflow.start_run(
    run_name="XGBoost_Hyperparameter_Tuning_Parent_500fea",
):
    logging.info('starting optuna hyperparameter tuning...')

    # The objective returns accuracy, hence direction='maximize'.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=30)

    logging.info("Hyperparameter tuning complete.")
    logging.info(f"Best trial parameters: {study.best_trial.params}")
    logging.info(f"Best accuracy: {study.best_value}")

# using the word2vec embeddings

In [None]:
# gensim's simple_preprocess(text):
#   - lowercases the text;
#   - tokenizes it into a list of words, keeping only alphabetic tokens
#     (punctuation, digits and special characters are dropped);
#   - discards tokens shorter than min_len or longer than max_len
#     (this filter is always applied; defaults are min_len=2, max_len=15).
import gensim
from gensim.models import Word2Vec,KeyedVectors
from gensim.utils import simple_preprocess

# Load the GoogleNews vectors stored in binary word2vec format.
# NOTE(review): absolute, machine-specific path — will not resolve elsewhere.
model = KeyedVectors.load_word2vec_format(
    '/home/alok_kumar/kubernetes/nlp_end_to_end/notebooks/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin',
    binary=True,
)


In [None]:
def get_emb(text,model):
    """Return the mean word vector of ``text``.

    The text is tokenized with ``simple_preprocess``; tokens missing from
    ``model.key_to_index`` are ignored.

    Args:
        text: Document to embed.
        model: Word-vector lookup exposing ``key_to_index``, ``vector_size``
            and ``model[token]``.

    Returns:
        The element-wise mean of the vectors of the known tokens, or a zero
        vector of length ``model.vector_size`` when no token is known.
    """
    known_vectors = [
        model[word]
        for word in simple_preprocess(text)
        if word in model.key_to_index
    ]

    # Nothing recognisable in the text: fall back to an all-zero embedding.
    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)

# Embed every review (one row per review, in DataFrame order) and pull the
# labels out as an array.
x_embeddings = np.array(
    [get_emb(text=review, model=model) for review in df['review']]
)
y = df['sentiment'].values

In [None]:
# One named column per embedding dimension. The width is read from the array
# rather than hard-coded to 300, so vectors of another size also work.
emd_df = pd.DataFrame(
    x_embeddings,
    columns=[f"fea_{i}" for i in range(x_embeddings.shape[1])],
)
emd_df['sentiment'] = y

# Features are every column except the last; the target is the trailing
# 'sentiment' column.
x = emd_df.iloc[:, :-1]
y = emd_df.iloc[:, -1]

# 80/20 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Log the embedding-based runs under their own experiment, with XGBoost
# autologging switched on.
mlflow.set_experiment('xgboost with word2vec(not with error)')
mlflow.xgboost.autolog()

# INFO-level logging with timestamps.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def objective(trial):
    """Optuna objective: fit a DART booster on the embedding features.

    Runs inside a nested MLflow run named ``trial_<number>``. The booster is
    configured with ``device='cuda'`` and ``tree_method='hist'``; the
    remaining hyperparameters and the round budget are sampled from
    ``trial``. Training uses the module-level ``x_train`` / ``y_train``;
    evaluation uses ``x_test`` / ``y_test``.

    Args:
        trial: Optuna trial supplying the sampled hyperparameters.

    Returns:
        Accuracy of the 0.5-thresholded predictions on the test split.
    """
    with mlflow.start_run(run_name=f"trial_{trial.number}",nested=True):
        logging.info(f"starting trial {trial.number}...")

        params={
            'verbosity':1,
            'objective':'binary:logistic',
            'eval_metric':'error',
            'device':'cuda',
            'tree_method':'hist',
            'booster':'dart',
            'lambda':trial.suggest_float('lambda',1e-3,1.0,log=True),
            'alpha':trial.suggest_float('alpha',1e-3,2.0,log=True),
            'max_depth':trial.suggest_int('max_depth',5,15),
            'eta':trial.suggest_float('eta',0.01,0.3,log=True),
            'gamma':trial.suggest_float('gamma',1e-3, 1, log=True),
            'subsample':trial.suggest_float('subsample',0.5,1.0),
            'colsample_bytree':trial.suggest_float('colsample_bytree',0.5,0.8)
        }

        # The Optuna parameter name was misspelled 'num_boos_round'; it is
        # now 'num_boost_round', the key the TF-IDF studies in this notebook
        # use, so best_trial.params is comparable across studies.
        num_boost_round=trial.suggest_int('num_boost_round',300,1400)

        dtrain=xgb.DMatrix(x_train,label=y_train)
        dtest=xgb.DMatrix(x_test,label=y_test)

        # Stop after 10 rounds without improvement; print progress every
        # 25 rounds.
        bst=xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=[(dtrain,'training'),(dtest,'testing')],
            early_stopping_rounds=10,
            verbose_eval=25
        )

        # Probabilities above 0.5 are labelled as class 1.
        preds=bst.predict(dtest)
        pred_labels=[1 if p>0.5 else 0 for p in preds]

        accuracy=accuracy_score(y_test,pred_labels)
        precision = precision_score(y_test, pred_labels)
        recall = recall_score(y_test, pred_labels)
        f1 = f1_score(y_test, pred_labels)

        # Echo the metrics to the notebook log.
        logging.info(f"Accuracy: {accuracy}")
        logging.info(f"Precision: {precision}")
        logging.info(f"Recall: {recall}")
        logging.info(f"F1 Score: {f1}")

        # Record all four metrics on the nested run in one call.
        mlflow.log_metrics({
            'accuracy':accuracy,
            'precision':precision,
            'recall':recall,
            'f1_score':f1
        })

    return accuracy

# Parent MLflow run for the 25-trial embedding study; trials log as nested runs.
with mlflow.start_run(
    run_name="XGBoost_Hyperparameter_Tuning_Parent_word2vec",
):
    logging.info('starting optuna hyperparameter tuning...')

    # The objective returns accuracy, hence direction='maximize'.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=25)

    logging.info("Hyperparameter tuning complete.")
    logging.info(f"Best trial parameters: {study.best_trial.params}")
    logging.info(f"Best accuracy: {study.best_value}")


### Using the Keras tuner with the above word2vec embedding to test out the neural network

In [50]:
import gensim
from gensim.models import Word2Vec,KeyedVectors
from gensim.utils import simple_preprocess

# Load word vectors from 'word2vec.vec' in binary word2vec format.
# NOTE(review): a `.vec` extension usually denotes the text format, yet this
# loads with binary=True — confirm the file really is binary.
model = KeyedVectors.load_word2vec_format(
    'word2vec.vec',
    binary=True,
)

def get_emb(text,model):
    """Embed ``text`` as the average of its known word vectors.

    Args:
        text: Document to embed; it is tokenized with ``simple_preprocess``.
        model: Word-vector lookup exposing ``key_to_index``, ``vector_size``
            and ``model[token]``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.key_to_index``; a zero vector of length ``model.vector_size``
        when none of the tokens is found.
    """
    vocabulary = model.key_to_index

    # Collect the vector of every in-vocabulary token, in order.
    vectors = []
    for token in simple_preprocess(text):
        if token in vocabulary:
            vectors.append(model[token])

    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(model.vector_size)

# Embed every review (one row per review, in DataFrame order).
x_embeddings=np.array([get_emb(text=text,model=model) for text in df['review']])
y=df['sentiment'].values

# One named column per embedding dimension. The width is read from the array
# rather than hard-coded to 300, so vectors of another size also work.
emd_df=pd.DataFrame(x_embeddings,columns=[f"fea_{i}" for i in range(x_embeddings.shape[1])])
emd_df['sentiment']=y

# Features are every column except the last; the target is the trailing
# 'sentiment' column.
x=emd_df.iloc[:,:-1]
y=emd_df.iloc[:,-1]

In [53]:
# Persist the embedding table (features + 'sentiment') without the index so
# it can be reloaded later instead of being recomputed.
# NOTE(review): absolute, machine-specific path.
emd_df.to_csv(
    '/home/alok_kumar/kubernetes/nlp_end_to_end/data/processed/embedded_data/embedded_dataframe.csv',
    index=False,
)

In [4]:
# Shortcut for later sessions: reload the saved embedding table from disk
# instead of recomputing the embeddings.
emd_df = pd.read_csv(
    '/home/alok_kumar/kubernetes/nlp_end_to_end/data/processed/embedded_data/embedded_dataframe.csv'
)
# Features are every column except the last; the target is the last column.
x = emd_df.iloc[:, :-1]
y = emd_df.iloc[:, -1]