In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import BertTokenizer, BertForMaskedLM, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load the Dataset
data = pd.read_csv('ai_prompts.csv')

data

Unnamed: 0,Prompts,Path_Item,Call
0,Gets a indexed search result of Manufacturing ...,/resources/v1/modeler/dsmfg/dsmfg:MfgItem/search,get
1,Gets a Manufacturing Item,/resources/v1/modeler/dsmfg/dsmfg:MfgItem/{ID},get
2,Modifies the Manufacturing Item attributes,/resources/v1/modeler/dsmfg/dsmfg:MfgItem/{ID},patch
3,Delete the Manufacturing Item,/resources/v1/modeler/dsmfg/dsmfg:MfgItem/{ID},delete
4,"Get scoped, partial scoped and resulting manuf...",/resources/v1/modeler/dsmfg/invoke/dsmfg:getMf...,post
5,Creates Manufacturing Items,/resources/v1/modeler/dsmfg/dsmfg:MfgItem,post
6,Locate or find set of Manufacturing Items dsmf...,/resources/v1/modeler/dsmfg/dsmfg:MfgItem/locate,post
7,Bulk reconnect of scope and resulting products...,/resources/v1/modeler/dsmfg/invoke/dsmfg:recon...,post
8,Expand specified Manufacturing Item dsmfg:MfgI...,/resources/v1/modeler/dsmfg/dsmfg:MfgItem/{ID}...,post
9,Retrieve Manufacturing Item realized changes w...,/resources/v1/modeler/dsmfg/invoke/dsmfg:getRe...,post


In [2]:
# Step 2: Preprocess the Data
lemmatizer = WordNetLemmatizer()


def preprocess(prompt):
    tokens = word_tokenize(prompt.lower())
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmatized_tokens)  # Join tokens into a string
data['Tokens'] = data['Prompts'].apply(preprocess)

data

Unnamed: 0,Prompts,Path_Item,Call,Tokens
0,Gets a indexed search result of Manufacturing ...,/resources/v1/modeler/dsmfg/dsmfg:MfgItem/search,get,get a indexed search result of manufacturing item
1,Gets a Manufacturing Item,/resources/v1/modeler/dsmfg/dsmfg:MfgItem/{ID},get,get a manufacturing item
2,Modifies the Manufacturing Item attributes,/resources/v1/modeler/dsmfg/dsmfg:MfgItem/{ID},patch,modifies the manufacturing item attribute
3,Delete the Manufacturing Item,/resources/v1/modeler/dsmfg/dsmfg:MfgItem/{ID},delete,delete the manufacturing item
4,"Get scoped, partial scoped and resulting manuf...",/resources/v1/modeler/dsmfg/invoke/dsmfg:getMf...,post,"get scoped , partial scoped and resulting manu..."
5,Creates Manufacturing Items,/resources/v1/modeler/dsmfg/dsmfg:MfgItem,post,creates manufacturing item
6,Locate or find set of Manufacturing Items dsmf...,/resources/v1/modeler/dsmfg/dsmfg:MfgItem/locate,post,locate or find set of manufacturing item dsmfg...
7,Bulk reconnect of scope and resulting products...,/resources/v1/modeler/dsmfg/invoke/dsmfg:recon...,post,bulk reconnect of scope and resulting product ...
8,Expand specified Manufacturing Item dsmfg:MfgI...,/resources/v1/modeler/dsmfg/dsmfg:MfgItem/{ID}...,post,expand specified manufacturing item dsmfg : mf...
9,Retrieve Manufacturing Item realized changes w...,/resources/v1/modeler/dsmfg/invoke/dsmfg:getRe...,post,retrieve manufacturing item realized change wi...


In [3]:


# Prepare Data for Training
train_data = [(prompt, path) for prompt, path in zip(data['Tokens'], data['Path_Item'])]

# Initialize and Train the Language Model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Initialize sentence transformer model
sentence_transformer_model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Transform prompts into embeddings
prompt_embeddings = sentence_transformer_model.encode(data['Tokens'])

In [5]:
# Tokenize the data
tokenized_data = tokenizer([prompt for prompt, _ in train_data], return_tensors='pt', padding="max_length", truncation=True)
tokenized_data


{'input_ids': tensor([[  101,  2131,  1037,  ...,     0,     0,     0],
        [  101,  2131,  1037,  ...,     0,     0,     0],
        [  101, 16913, 14144,  ...,     0,     0,     0],
        ...,
        [  101,  2326,  2000,  ...,     0,     0,     0],
        [  101,  2326,  2000,  ...,     0,     0,     0],
        [  101,  2326,  2000,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [6]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, tokenized_data):
        self.tokenized_data = tokenized_data

    def __len__(self):
        return len(self.tokenized_data.input_ids)

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.tokenized_data.items()}

# Create custom dataset
custom_dataset = CustomDataset(tokenized_data)
custom_dataset

<__main__.CustomDataset at 0x21da2124a40>

In [7]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
data_collator

DataCollatorForLanguageModeling(tokenizer=BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, mlm=True, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False,

In [8]:
# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    num_train_epochs=5,
    logging_dir='./logs',
    logging_steps=500,
    save_steps=1000,
    save_total_limit=2,
    output_dir='./output'
)

In [9]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Initialize sentence transformer model
sentence_transformer_model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Transform prompts into embeddings
prompt_embeddings = sentence_transformer_model.encode(data['Tokens'])

In [12]:
# SentenceTransformer is a class from the Sentence Transformers library used for generating sentence embeddings.
# The model is initialized with the 'paraphrase-distilroberta-base-v1' pre-trained model. 
# This model is trained on a large dataset to generate embeddings that capture semantic similarities between sentences.
# encode method of the sentence_transformer_model is used to encode the preprocessed prompts into embeddings.
# The encode method takes a list of sentences as input and returns their embeddings.

In [11]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=custom_dataset
)

# Train the model
trainer.train()

# Test the Trained Model
def generate_path(prompt):
    # Encode prompt into embedding
    prompt_embedding = sentence_transformer_model.encode([prompt])
    
    # Compute similarity scores between prompt and all prompts in dataset
    similarities = cosine_similarity(prompt_embedding, prompt_embeddings)[0]
    
    # Find index of most similar prompt
    most_similar_index = similarities.argmax()
    
    # Get corresponding Path_Item
    predicted_path_item = data.loc[most_similar_index, 'Path_Item']
    
    return predicted_path_item

# Test the model
prompt = "Modifies the Manufacturing Item attributes	"
generated_path = generate_path(prompt)
print("Generated Path:", generated_path)

  return {key: torch.tensor(val[idx]) for key, val in self.tokenized_data.items()}


Step,Training Loss


Generated Path: /resources/v1/modeler/dsmfg/dsmfg:MfgItem/{ID}


In [14]:
# Test the model
prompt = "Service to detach a Scope Engineering Item from a Manufacturing Item reference"
generated_path = generate_path(prompt)
print("Generated Path:", generated_path)

Generated Path: /resources/v1/modeler/dsmfg/dsmfg:MfgItem/{ID}/dsmfg:ScopeEngItem/detach


In [21]:
# Test the model
prompt = "Service to join assignment filters (list of engineering occurrences) to a Manufacturing Item reference"
generated_path = generate_path(prompt)
print("Generated Path:", generated_path)

Generated Path: /resources/v1/modeler/dsmfg/dsmfg:MfgItem/{ID}/dsmfg:AssignmentFilter/attach


In [27]:
# Test the model
prompt = "Service to merge list of engineering occurrences to a Manufacturing thing reference"
generated_path = generate_path(prompt)
print("Generated Path:", generated_path)

Generated Path: /resources/v1/modeler/dsmfg/dsmfg:MfgItem/{ID}/dsmfg:AssignmentFilter/attach
