In [None]:
import pandas as pd
import mlflow
import optuna
import dagshub
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [None]:
# Connect this notebook to the DagsHub repository; mlflow=True asks
# dagshub to set up MLflow tracking for that repository.
dagshub.init(
    repo_owner='itsalok2',
    repo_name='nlp_end_to_end',
    mlflow=True,
)

In [None]:
# All runs started below are grouped under this MLflow experiment.
mlflow.set_experiment("Neural_Network_with_optuna")

# Read the embedded dataset; every column except the last is used as a
# feature and the last column is used as the label.
emd_df = pd.read_csv(
    '/home/alok_kumar/kubernetes/nlp_end_to_end/data/processed/embedded_data/embedded_dataframe.csv'
)
x, y = emd_df.iloc[:, :-1].values, emd_df.iloc[:, -1].values

# Hold out 20% of the rows for validation; the seed keeps the split fixed.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

def objective(trial):
    """Optuna objective: build, train and score one dense binary classifier.

    Samples the number of hidden layers, the width of each layer, the
    activation and the Adam learning rate from ``trial``. The model is
    trained on the notebook-level ``x_train``/``y_train`` with early stopping
    on ``val_accuracy`` and evaluated on ``x_val``/``y_val``. The sampled
    hyperparameters and the validation metrics are logged to a nested MLflow
    run.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The highest per-epoch ``val_accuracy`` Keras recorded during
        training. Note that this may differ from the logged ``val_accuracy``
        metric, which is computed from the predictions of the model as it
        stands after ``fit`` returns.
    """
    # Release the Keras global state left behind by previous trials; without
    # this every trial's model stays registered and memory grows over the study.
    keras.backend.clear_session()

    num_layers = trial.suggest_int('num_layers', 1, 3)
    activ = trial.suggest_categorical('activation', ['relu', 'tanh', 'selu'])
    lr = trial.suggest_float('lr', 1e-4, 1e-1, log=True)

    model = Sequential()
    model.add(layers.Input(shape=(x_train.shape[1],)))

    # Keep the sampled widths so they can be logged without reading them
    # back out of trial.params.
    units_per_layer = []
    for i in range(num_layers):
        units = trial.suggest_int(f"unit_{i}", 8, 32, step=4)
        units_per_layer.append(units)
        model.add(layers.Dense(units=units, activation=activ))

    # Single sigmoid output for binary classification.
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    es = EarlyStopping(monitor='val_accuracy', patience=6, restore_best_weights=True)

    # Each trial is a nested run under the parent
    with mlflow.start_run(nested=True):
        mlflow.log_params({
            "num_layers": num_layers,
            "activation": activ,
            "lr": lr,
            **{f"units_{i}": units for i, units in enumerate(units_per_layer)}
        })

        history = model.fit(x_train, y_train,
                            epochs=25,
                            validation_data=(x_val, y_val),
                            callbacks=[es],
                            verbose=0)

        # predict() returns an (n, 1) column of probabilities: threshold at
        # 0.5 and flatten to 1-D so the labels match the shape of y_val.
        # verbose=0 keeps the trial as quiet as the fit() call above.
        y_pred = (model.predict(x_val, verbose=0) > 0.5).astype('int32').ravel()

        # zero_division=0 yields the same 0.0 score as the default when a
        # trial predicts a single class, but without the warning.
        precision = precision_score(y_val, y_pred, zero_division=0)
        accuracy = accuracy_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred, zero_division=0)
        recall = recall_score(y_val, y_pred, zero_division=0)

        mlflow.log_metrics({
            "val_precision": precision,
            "val_recall": recall,
            "val_f1": f1,
            "val_accuracy": accuracy
        })

        return max(history.history['val_accuracy'])

# Parent MLflow run: the per-trial runs that `objective` opens with
# nested=True are attached underneath it.
with mlflow.start_run(run_name="optuna_study_parent"):
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=25)

    # Record the winning trial's hyperparameters and objective value on the
    # parent run itself.
    best_trial = study.best_trial
    mlflow.log_params(best_trial.params)
    mlflow.log_metric("best_val_accuracy", best_trial.value)

In [None]:
# Hyperparameters fixed by hand for the final fit.
params = {
    'num_layers': 3,
    'activation': 'relu',
    'lr': 0.044980501630228376,
    'unit_0': 8,
    'unit_1': 8,
    'unit_2': 8,
}

# Reload the embedded dataset: last column is the label, the rest are features.
emd_df = pd.read_csv(
    '/home/alok_kumar/kubernetes/nlp_end_to_end/data/processed/embedded_data/embedded_dataframe.csv'
)
x, y = emd_df.iloc[:, :-1].values, emd_df.iloc[:, -1].values

# NOTE(review): same file, test_size and random_state as the tuning cell, so
# x_test holds the rows that served as x_val during the Optuna search.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Dense stack: input -> num_layers hidden layers -> single sigmoid output.
model = Sequential()
model.add(layers.Input(shape=(x_train.shape[1],)))
for i in range(params['num_layers']):
    model.add(
        layers.Dense(
            units=params[f'unit_{i}'],
            activation=params['activation'],
        )
    )
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=params['lr']),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Stop once val_accuracy has not improved for 8 epochs.
es = EarlyStopping(
    monitor='val_accuracy',
    patience=8,
    restore_best_weights=True,
)

# Keras holds out the last 20% of the training rows for validation.
history = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    epochs=25,
    batch_size=32,
    verbose=1,
    callbacks=[es],
)

# Threshold the sigmoid outputs at 0.5 and flatten to a 1-D label vector.
y_pred = (model.predict(x_test) > 0.5).astype('int32').flatten()

print('accuracy', accuracy_score(y_test, y_pred))

### Working with a stacking classifier

In [None]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Import major classifiers
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, RidgeClassifier, PassiveAggressiveClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier

# -----------------------
# Load the embedded dataset and split it
# -----------------------
# Last column is used as the label, every other column as a feature.
emd_df = pd.read_csv(
    '/home/alok_kumar/kubernetes/nlp_end_to_end/data/processed/embedded_data/embedded_dataframe.csv'
)
x, y = emd_df.iloc[:, :-1].values, emd_df.iloc[:, -1].values

# 80/20 train/validation split with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# -----------------------
# Base learners for the stack, as (name, estimator) pairs
# -----------------------
base_learners = [
    # Linear models
    ('logreg', LogisticRegression(max_iter=500)),
    ('sgd', SGDClassifier(max_iter=1000, tol=1e-3)),
    ('perceptron', Perceptron(max_iter=500)),
    ('ridge', RidgeClassifier()),
    ('passive', PassiveAggressiveClassifier(max_iter=1000)),

    # Naive Bayes
    ('gnb', GaussianNB()),
    ('bnb', BernoulliNB()),

    # Neighbours and support-vector machines
    ('knn', KNeighborsClassifier()),
    ('svc', SVC(probability=True, kernel='rbf')),
    ('linsvc', LinearSVC(max_iter=2000)),

    # Tree ensembles
    ('rf', RandomForestClassifier()),
    ('gb', GradientBoostingClassifier()),
    ('ada', AdaBoostClassifier()),
    ('et', ExtraTreesClassifier()),
    ('bag', BaggingClassifier()),

    # Neural network
    ('mlp', MLPClassifier(max_iter=1000)),
]

# -----------------------
# Meta-model (final estimator) trained on the base learners' outputs
# -----------------------
final_estimator = XGBClassifier(
    n_estimators=1000,
    max_depth=3,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)

# -----------------------
# Assemble the stacking classifier
# -----------------------
stack_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=final_estimator,
    cv=5,               # 5-fold cross-validation used to fit the meta-model
    n_jobs=-1,          # run on all available cores
    passthrough=False,  # meta-model does not receive the original features
)

# -----------------------
# Fit the stack and score it on the validation split
# -----------------------
stack_clf.fit(x_train, y_train)
y_pred = stack_clf.predict(x_val)

# Accuracy plus support-weighted precision / recall / F1.
accuracy = accuracy_score(y_val, y_pred)
precision, recall, f1 = (
    scorer(y_val, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# One metric per line, four decimal places.
print(
    f"Accuracy:  {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall:    {recall:.4f}",
    f"F1 Score:  {f1:.4f}",
    sep="\n",
)
