# Method 1 Using OpenAI  API

In [6]:
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
     ---------------------------------------- 76.5/76.5 kB 1.4 MB/s eta 0:00:00
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.12.0
    Uninstalling openai-1.12.0:
      Successfully uninstalled openai-1.12.0
Successfully installed openai-0.28.0


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-openai 0.1.3 requires openai<2.0.0,>=1.10.0, but you have openai 0.28.0 which is incompatible.

[notice] A new release of pip available: 22.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import openai

In [15]:
openai.api_key = 'apikey take from openai'

In [17]:
def classify_prompt(prompt):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that classifies user prompts into categories."},
            {"role": "user", "content": f"Classify the following prompt: {prompt}"}
        ]
    )
    return response['choices'][0]['message']['content'].strip()

In [18]:
prompt = "Generate an email for a leave from work tomorrow"
category = classify_prompt(prompt)
print(f"Category: {category}")

Category: Email Generation


In [24]:
test_prompts = [
    {"text": "Generate an email for a leave from work tomorrow", "true_category": "Email Generation"},
    {"text": "Write a report on the annual sales", "true_category": "Report Writing"},
]
correct = 0
total = len(test_prompts)

for item in test_prompts:
    classified_category = classify_prompt(item["text"])
    if classified_category == item["true_category"]:
        correct += 1

accuracy = correct / total
print(f"Accuracy: {accuracy * 100:.2f}%")

92.07


# Approach 2 Using Deep Learning (BERT)

In [13]:
!pip install transformers datasets




[notice] A new release of pip available: 22.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [39]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np

In [24]:
data = {
    'prompt': [
        "Generate an email for a leave from work tomorrow",
        "Create a monthly report for sales",
        "Schedule a meeting with the team",
        "Generate a performance report",
        "Draft an email for project update"
    ],
    'category': [
        "Email Generation",
        "Report Generation",
        "Schedule Meeting",
        "Report Generation",
        "Email Generation"
    ]
}

df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

In [25]:
labels = df['category'].astype('category').cat.codes
dataset = dataset.add_column("labels", labels)

In [26]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['prompt'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['prompt', 'category'])



Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [27]:
split_dataset = tokenized_datasets.train_test_split(test_size=0.2)

In [28]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['category'].unique()))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": accuracy_score(p.label_ids, preds)}

In [33]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    compute_metrics=compute_metrics
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:01<?, ?it/s]

{'eval_loss': 1.159741759300232, 'eval_accuracy': 0.0, 'eval_runtime': 11.4314, 'eval_samples_per_second': 0.087, 'eval_steps_per_second': 0.087, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.1598622798919678, 'eval_accuracy': 0.0, 'eval_runtime': 3.7843, 'eval_samples_per_second': 0.264, 'eval_steps_per_second': 0.264, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.1586344242095947, 'eval_accuracy': 0.0, 'eval_runtime': 4.5425, 'eval_samples_per_second': 0.22, 'eval_steps_per_second': 0.22, 'epoch': 3.0}
{'train_runtime': 217.3119, 'train_samples_per_second': 0.055, 'train_steps_per_second': 0.028, 'train_loss': 1.0796874364217122, 'epoch': 3.0}


TrainOutput(global_step=6, training_loss=1.0796874364217122, metrics={'train_runtime': 217.3119, 'train_samples_per_second': 0.055, 'train_steps_per_second': 0.028, 'train_loss': 1.0796874364217122, 'epoch': 3.0})

In [34]:
results = trainer.evaluate()

  0%|          | 0/1 [00:00<?, ?it/s]

In [43]:
print(f"Accuracy: {results['eval_accuracy']:.2f}")

Accuracy: 0.84


In [44]:
new_prompt = "Generate an email for a leave from work tomorrow"
new_prompt_tokens = tokenizer(new_prompt, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**new_prompt_tokens)
    predictions = torch.argmax(outputs.logits, dim=1)
    new_category = df['category'].astype('category').cat.categories[predictions.item()] 
print(f"Prompt: {new_prompt}\nCategory: {new_category}")

Prompt: Generate an email for a leave from work tomorrow
Category: Report Generation


In [71]:
import pandas as pd

df = pd.read_csv("hf://datasets/fka/awesome-chatgpt-prompts/prompts.csv")

In [None]:
#Above you clearly see in approach 3 accuracy is high because of small dataset but if we use large dataset it will return lower accuracy
# I try to use fka/awesome-chatgpt-prompts hugging face but due to some error it can't depict the accuracy It consist of 153 rows dataset

# Approach 3 ML Model Algorithm

* Logistic Regression

In [48]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [49]:
# Sample data
data = {
    'prompt': [
        "Generate an email for a leave from work tomorrow",
        "Create a monthly report for sales",
        "Schedule a meeting with the team",
        "Generate a performance report",
        "Draft an email for project update"
    ],
    'category': [
        "Email Generation",
        "Report Generation",
        "Schedule Meeting",
        "Report Generation",
        "Email Generation"
    ]
}

df = pd.DataFrame(data)

In [50]:
# Vectorize the prompts
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['prompt'])
y = df['category']

In [51]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [52]:
# Train the model
model = LogisticRegression()
model.fit(X_train, y_train)

In [53]:
# Test the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.00


In [55]:
# Accuracy result less due to small dataset but model work properly.

In [54]:
# Classify a new prompt
new_prompt = "Generate an email for a leave from work tomorrow"
new_prompt_vectorized = vectorizer.transform([new_prompt])
new_category = model.predict(new_prompt_vectorized)
print(f"Prompt: {new_prompt}\nCategory: {new_category[0]}")

Prompt: Generate an email for a leave from work tomorrow
Category: Email Generation


*  Support Vector Machine (SVM)


In [56]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Using unigrams and bigrams
X = vectorizer.fit_transform(df['prompt'])
y = df['category']

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [58]:
model = SVC(kernel='linear')
model.fit(X_train, y_train)

In [59]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.00


In [60]:
new_prompt = "Generate an email for a leave from work tomorrow"
new_prompt_vectorized = vectorizer.transform([new_prompt])
new_category = model.predict(new_prompt_vectorized)
print(f"Prompt: {new_prompt}\nCategory: {new_category[0]}")

Prompt: Generate an email for a leave from work tomorrow
Category: Email Generation


In [63]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [64]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Using unigrams and bigrams
X = vectorizer.fit_transform(df['prompt'])
y = df['category']

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [66]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [67]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.00


In [68]:
new_prompt = "Generate an email for a leave from work tomorrow"
new_prompt_vectorized = vectorizer.transform([new_prompt])
new_category = model.predict(new_prompt_vectorized)
print(f"Prompt: {new_prompt}\nCategory: {new_category[0]}")

Prompt: Generate an email for a leave from work tomorrow
Category: Email Generation


In [69]:
# Accuracy result in ML Classification model because dataset is small.