In [1]:
import numpy as np
import pandas as pd
#import scikit-learn
import os
import torch
from transformers import BertForSequenceClassification
import torch.nn as nn
import torch.optim as optim
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import mlflow
from mlflow import log_metric, log_param, log_params, log_artifacts
#import tensorflow
#from tensorflow.summary import create_file_writer

In [2]:
from transformers import BertForSequenceClassification

In [3]:
DATA_PATH='/home/vboxuser/mlprojects/data/made_with_ml'
MODEL_NAME = 'bert-base-nli-mean-tokens'

In [4]:
def read_and_process_data(parent_dir,filename):
    '''
    This function will read and prepare text for training
    '''
    df=pd.read_csv(os.path.join(parent_dir,filename),dtype={'title':str,'description':str,'tag':str},index_col=None)
    df['combined']=df['title'] + ' ' + df['description']
    df['combined']=df['combined'].apply(lambda text:text.lower())
    df['combined_with_SEP']=df['title'] + ' [SEP] ' + df['description']
    df['combined_with_SEP']=df['combined_with_SEP'].apply(lambda text:text.lower())
    return df

In [5]:
df_train=read_and_process_data(DATA_PATH,'dataset.csv')
df_valid=read_and_process_data(DATA_PATH,'holdout.csv')

In [6]:
df_train['tag'].unique()

array(['computer-vision', 'graph-learning', 'reinforcement-learning',
       'natural-language-processing', 'mlops', 'time-series'],
      dtype=object)

In [7]:
df_train['tag'].unique()

array(['computer-vision', 'graph-learning', 'reinforcement-learning',
       'natural-language-processing', 'mlops', 'time-series'],
      dtype=object)

In [8]:
def model_init(model_name='sentence-transformers/all-mpnet-base-v2'):
    model = SentenceTransformer(model_name)
    return model

In [9]:
def featurization(text,model):
    embeddings = model.encode(text)
    return embeddings

In [10]:
def label_encoding(self):
    le = preprocessing.LabelEncoder()
    le.fit(self.df['tag'])
    return le

def label_transform(df,le):
    return le.transform(df['tag'])

In [11]:
model=model_init(model_name='sentence-transformers/all-mpnet-base-v2')
df_train['bert_features_combined']=df_train['combined'].apply(lambda x: featurization(x,model))
df_train['bert_features_combined_SEP']=df_train['combined_with_SEP'].apply(lambda x: featurization(x,model))

In [12]:
df_valid['bert_features_combined']=df_valid['combined'].apply(lambda x: featurization(x,model))
df_valid['bert_features_combined_SEP']=df_valid['combined_with_SEP'].apply(lambda x: featurization(x,model))

In [13]:
label_encoder = LabelEncoder()
label_encoder.fit(df_train['tag'])
df_train['label_int'] = label_encoder.transform(df_train['tag'])
df_valid['label_int'] = label_encoder.transform(df_valid['tag'])

In [19]:
le = OneHotEncoder(handle_unknown='ignore')
#le.fit(df_train['label_int'])
le.fit(df_train['label_int'].values.reshape(-1, 1))

In [20]:
label_int_train = df_train['label_int'].values.reshape(-1, 1)
label_int_valid = df_valid['label_int'].values.reshape(-1, 1)

# Create and fit OneHotEncoder
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
onehot_encoder.fit(label_int_train)

# Transform and convert to DataFrame
y_train = torch.tensor(onehot_encoder.transform(label_int_train).toarray())
y_valid = torch.tensor(onehot_encoder.transform(label_int_valid).toarray())

X_train=torch.tensor(df_train['bert_features_combined'])
X_valid=torch.tensor(df_valid['bert_features_combined'])


In [21]:
train_dataset = TensorDataset(X_train, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

valid_dataset = TensorDataset(X_valid, y_valid)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

In [22]:
#log_dir = "logs"  # Specify the log directory path
#writer = create_file_writer(log_dir)
writer = SummaryWriter()

In [33]:
#
mlflow.set_tracking_uri('file:/home/vboxuser/mlprojects/sample/notebooks/mlflow')
mlflow.set_experiment("ml_topic_classification_exp")

INFO: 'ml_topic_classification_exp' does not exist. Creating a new experiment


In [19]:
with mlflow.start_run() as run:
    run_id = run.info.run_id

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [34]:
from sklearn.metrics import roc_auc_score
def multiclass_auc(data_loader,model):
    true_labels = []
    predicted_probs = []

    # Set the model to evaluation mode
    model.eval()

    # Iterate through the DataLoader
    for inputs, labels in data_loader:
        # Forward pass to get predicted probabilities
        outputs = model(inputs)
        #print('outputs ',outputs.detach().numpy())
        #print('labels',labels.numpy())
        # Convert tensor to NumPy array and append to the list
        true_labels.extend(labels.numpy())
        predicted_probs.extend(outputs.detach().numpy())
    # Calculate AUC for each class
    auc_scores = list()
    for class_index in range(num_classes):  # Replace 'num_classes' with the actual number of classes
        true_class_labels = [1 if label[class_index] == 1 else 0 for label in true_labels]
        #print(true_class_labels)
        #true_class_labels = true_class_labels.tolist()
        class_probs = [prob[class_index] for prob in predicted_probs]
        #print(class_probs)
        auc = roc_auc_score(true_class_labels, class_probs)
        print(auc)
        auc_scores.append(auc)
    return auc_scores

# # Print or use the AUC scores for each class
# for class_index, auc in enumerate(auc_scores):
#     print(f"AUC for class {class_index}: {auc}")

In [38]:
# Define the neural network model
class TwoLayerNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_prob):
        super(TwoLayerNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

class ThreeLayerNN(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, dropout_prob):
        super(ThreeLayerNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_prob)
        
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_prob)
        
        self.fc3 = nn.Linear(hidden_size2, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        
        x = self.fc3(x)
        x = self.softmax(x)
        return x


# Initialize the model
hidden_size2 = 512
hidden_size1 = 128
dropout_prob = 0.3
input_size = 768
num_classes = 6
model_2l = TwoLayerNN(input_size, hidden_size1, num_classes, dropout_prob)
model_3l = ThreeLayerNN(input_size, hidden_size1, hidden_size2, num_classes, dropout_prob)

model = model_3l

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()


# Train the model
num_epochs = 20

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    train_loss = 0.0
    for batch_X, batch_y in train_dataloader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)  # Use class labels, not one-hot encoded targets
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    model.eval()  # Set the model to evaluation mode
    valid_loss = 0.0
    with torch.no_grad():
        for batch_X, batch_y in valid_dataloader:
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            valid_loss += loss.item()

    avg_valid_loss = valid_loss / len(valid_dataloader)
    avg_train_loss = train_loss / len(train_dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}] - Training Loss: {avg_train_loss:.4f}")
    print(f"Epoch [{epoch+1}/{num_epochs}] - Validation Loss: {avg_valid_loss:.4f}")

    # Log loss values for this epoch
    writer.add_scalar("Loss/train", avg_train_loss, epoch)
    writer.add_scalar("Loss/valid", avg_valid_loss, epoch)
    
with mlflow.start_run():  
    mlflow.log_param("hidden_size1", hidden_size1)
    mlflow.log_param("learning_rate", 0.001)
    mlflow.log_param('no of layers',2)
    mlflow.log_param('dropout',0.3)
    mlflow.log_param("hidden_size2", hidden_size2)
    mlflow.log_metric("train_loss", avg_train_loss)
    mlflow.log_metric("valid_loss", avg_valid_loss)
    class_names=['computer-vision', 'graph-learning', 'reinforcement-learning','natural-language-processing', 'mlops', 'time-series']
    for i,auc in enumerate(multiclass_auc(valid_dataloader,model)): 
        mlflow.log_metric(class_names[i]+"auc", auc)
        
writer.flush()
    

# Make predictions
with torch.no_grad():
    predictions = model(X_valid)
    predicted_classes = torch.argmax(predictions, dim=1)

# You can use the predicted_classes for further analysis or evaluation


Epoch [1/20] - Training Loss: 1.6906
Epoch [1/20] - Validation Loss: 1.5225
Epoch [2/20] - Training Loss: 1.3978
Epoch [2/20] - Validation Loss: 1.2891
Epoch [3/20] - Training Loss: 1.2998
Epoch [3/20] - Validation Loss: 1.2812
Epoch [4/20] - Training Loss: 1.2929
Epoch [4/20] - Validation Loss: 1.2791
Epoch [5/20] - Training Loss: 1.2803
Epoch [5/20] - Validation Loss: 1.2639
Epoch [6/20] - Training Loss: 1.2487
Epoch [6/20] - Validation Loss: 1.2260
Epoch [7/20] - Training Loss: 1.2167
Epoch [7/20] - Validation Loss: 1.2074
Epoch [8/20] - Training Loss: 1.2008
Epoch [8/20] - Validation Loss: 1.2058
Epoch [9/20] - Training Loss: 1.1919
Epoch [9/20] - Validation Loss: 1.1926
Epoch [10/20] - Training Loss: 1.1702
Epoch [10/20] - Validation Loss: 1.1729
Epoch [11/20] - Training Loss: 1.1604
Epoch [11/20] - Validation Loss: 1.1620
Epoch [12/20] - Training Loss: 1.1429
Epoch [12/20] - Validation Loss: 1.1636
Epoch [13/20] - Training Loss: 1.1409
Epoch [13/20] - Validation Loss: 1.1631
Epoc

In [39]:
writer.close()

In [40]:
model_path = "mode_with_3layer"
torch.save(model, model_path)

# Log the model as an artifact
#mlflow.pytorch.log_model(model, artifact_path="model")

In [28]:
model=torch.load('/home/vboxuser/mlprojects/sample/notebooks/model.pth')

AttributeError: Can't get attribute 'TwoLayerNN' on <module '__main__'>

In [None]:
mlflow.log_artifact("model.json", artifact_path="model_architecture")
mlflow.end_run()

In [36]:
!tensorboard --logdir='./runs' --port=6006

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.12.1 at http://localhost:6006/ (Press CTRL+C to quit)
^C


In [None]:
# Inside the training loop
for epoch in range(num_epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        inputs = tokenizer(batch['text'], padding=True, truncation=True, return_tensors="pt", max_length=max_length)
        labels = batch['label']
        
        with torch.no_grad():
            bert_outputs = pretrained_model(**inputs).last_hidden_state
        
        cls_token_representation = bert_outputs[:, 0, :]
        logits = custom_classifier(cls_token_representation)  # Pass [CLS] token's representation
        
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
