In [1]:
# Enable IPython's autoreload extension for this kernel.
%load_ext autoreload
from dotenv import load_dotenv

# Read environment variables from the .env file at the relative path ../.env.
load_dotenv('../.env')
# Autoreload mode 2, so edits to imported project modules are picked up
# without restarting the kernel.
%autoreload 2

In [1]:
import os
from dotenv import load_dotenv
# Load environment variables (no explicit path here) before the keys are read below.
load_dotenv()
from skllm.config import SKLLMConfig
# Hand the OpenAI credentials from the environment to skllm's global config.
SKLLMConfig.set_openai_key(os.getenv('OPENAI_API_KEY'))
# NOTE(review): the variable name is spelled OPENAPI_ORGANIZATION_ID (not OPENAI_...).
# Confirm it matches the .env entry; os.getenv returns None when the name is missing.
SKLLMConfig.set_openai_org(os.getenv('OPENAPI_ORGANIZATION_ID'))
from tqdm import tqdm
# Enable tqdm's pandas integration.
tqdm.pandas()
# NOTE(review): duplicate of the tqdm import two lines above.
from tqdm import tqdm
from src.utils import readFile
# BERTopic's OpenAI representation model is aliased so it does not clash with
# the `OpenAI` client class imported from the openai package below.
from bertopic.representation import OpenAI as BertOpenAI
from bertopic import BERTopic
from bertopic.backend import BaseEmbedder
from pydantic import BaseModel
from openai import OpenAI
import instructor

# Pydantic

In [10]:
inputData = readFile('chatGPT input.json')
client = instructor.patch(OpenAI())
# Dedicated reference to the instructor-patched client. Later cells rebind the
# module-level name `client` to plain OpenAI clients, and send_request relies
# on the patched client's `response_model` / `max_retries` arguments, so it
# must not look the client up through that shared name.
_structured_client = client


# Structured answer requested from the chat model for one device.
# (Kept as a comment rather than a class docstring: pydantic folds docstrings
# into the model's schema, which would alter what is sent to the model.)
class Measurement(BaseModel):
    description: str  # concise summary of what kind of data is being sent
    domain: str       # example domain the device belongs to
    subdomain: str


def _as_json_text(message_data):
    """Render the measurement payload as JSON text for the prompt.

    The system prompt tells the model that it will receive JSON, so the payload
    is serialised with ``json.dumps`` instead of being interpolated as a Python
    ``repr`` (single quotes, ``None``/``True``). Strings are passed through
    unchanged; values JSON cannot encode fall back to ``str``.
    """
    import json

    if isinstance(message_data, str):
        return message_data
    return json.dumps(message_data, ensure_ascii=False, default=str)


def send_request(message_data, model = "gpt-4-turbo"):
    """Ask the chat model to describe and classify one device's measurement data.

    Args:
        message_data: The measurement payload for a single device (a str is
            sent as-is; anything else is serialised to JSON).
        model: Name of the chat model to query.

    Returns:
        The result of the patched ``chat.completions.create`` call made with
        ``response_model=Measurement``; callers read its ``description``,
        ``domain`` and ``subdomain`` attributes.

    Raises:
        Whatever the client raises once its ``max_retries=3`` attempts are
        exhausted; nothing is caught here.
    """
    system_prompt = (
        "Take on the persona of a data analyst who is proficient in interpreting JSON objects and extracting meaningful insights from them. "
        "The user will provide JSON objects representing measurement data from a smart city IoT device."
    )
    user_prompt = (
        ""
        'Ignore device specific information and concisely summarise what kind of data is being sent. '
        'Don\'t use references to time intervals such as "hourly", "daily" and "monthly". '
        'Avoid generic terms like "IoT" and "smart city". '
        'Also provide example of a smart city domain this device belongs to. '
        f"{_as_json_text(message_data)} "
    )
    return _structured_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_retries=3,
        response_model=Measurement,
    )

# GPT-4 Turbo

In [13]:
from src.utils import tqdmFormat

# Per-device results keyed by device id, plus the ids whose request failed.
result_v3 = {}
failedRequests = []

# One chat request per device. The loop is best-effort: a failure is logged
# and recorded in failedRequests, and the run continues with the next device.
for deviceId, data in tqdm(inputData.items(), desc="requesting data for gpt-4-turbo", bar_format=tqdmFormat):
    try:
        response = send_request(data, model = "gpt-4-turbo")
        result_v3[deviceId] = {
            'name': data['device'],
            'domain': response.domain,
            'subdomain': response.subdomain,
            'description': response.description,
            'input': data,
            'id': deviceId
        }
    except Exception as e:
        # Include the device id and exception type so each logged failure can
        # be traced back to its input (the bare exception text alone cannot).
        print(f"{deviceId}: {type(e).__name__}: {e}")
        failedRequests.append(deviceId)

requesting data for gpt-4-turbo: 100%|██████████| 14307/14307 [time elapsed: 35:50:24]


# Generate embeddings

In [71]:
# Note: this rebinds the module-level name `client` to a fresh OpenAI() instance.
client = OpenAI()


def get_embedding(text, model="text-embedding-3-large"):
    """Return the embedding vector for ``text``.

    Newlines in ``text`` are replaced by spaces, the result is sent as a single
    input to the embeddings endpoint of the module-level ``client``, and the
    ``embedding`` of the first returned item is handed back.
    """
    single_line = " ".join(text.split("\n"))
    reply = client.embeddings.create(input=single_line, model=model)
    first_item = reply.data[0]
    return first_item.embedding

In [72]:
# Attach an embedding to every result record, built from its domain,
# subdomain and description joined by single spaces.
for device in tqdm(result_v3.values(), desc="getting embeddings", bar_format=tqdmFormat):
    embedding_text = " ".join(
        f"{device[field]}" for field in ('domain', 'subdomain', 'description')
    )
    device['embeddings'] = get_embedding(embedding_text)

getting embeddings: 100%|██████████| 14307/14307 [time elapsed: 4:55:28]


In [12]:
import openai
from bertopic.backend import OpenAIBackend

# Rebinds the module-level name `client` to a new OpenAI client whose API key
# is read from the OPENAI_API_KEY environment variable.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# BERTopic embedding backend built on that client with "text-embedding-3-small".
# NOTE(review): get_embedding defaults to "text-embedding-3-large", and the
# visible BERTopic(...) call does not receive this backend — confirm whether
# it is still needed.
embedding_model = OpenAIBackend(client, "text-embedding-3-small")

In [18]:
# Load the measurement totals file (2024-03-01 - 2024-04-01 according to its
# name). Entries are read later via device['deviceId'] and device['total']['count'].
lastMonth = readFile("telia/measurements/total/c8y_measurements 2024-03-01 - 2024-04-01.json")

In [19]:
# Ids of devices whose total measurement count in the loaded file is above zero.
lastMonthMapping = {
    device['deviceId']
    for device in lastMonth
    if device['total']['count'] > 0
}

# Result records of those devices, in result_v3 order, so that docs and
# customEmbeddings stay aligned index by index.
_active_items = [item for key, item in result_v3.items() if key in lastMonthMapping]

# Lower-cased "domain subdomain description" text per record.
docs = [
    f"{item['domain']} {item['subdomain']} {item['description']}".lower()
    for item in _active_items
]
# Embedding previously stored on each of the same records.
customEmbeddings = [item['embeddings'] for item in _active_items]

In [55]:
import numpy as np

# Prompt for the label-generating chat model; [DOCUMENTS] and [KEYWORDS] are
# placeholders in the text handed to BERTopic's OpenAI representation model.
BertTopicPrompt = """
I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
# Uses whichever OpenAI client the module-level name `client` is bound to when
# this cell runs.
openai_model = BertOpenAI(client, model="gpt-4-turbo", exponential_backoff=True, chat=True, prompt=BertTopicPrompt)

# Topic representations are stored under the aspect name "OpenAI".
representation_model = {
    "OpenAI": openai_model,
}

class CustomEmbedder(BaseEmbedder):
    """Embedder that serves the precomputed ``customEmbeddings``.

    The vectors are positional (one per entry of ``docs``), so they are only
    valid for a document list of the same length.
    """

    def __init__(self, embedding_model):
        super().__init__()
        self.embedding_model = embedding_model

    def embed(self, documents, verbose=False):
        """Return the precomputed vectors as an array.

        Raises:
            ValueError: if ``documents`` does not have one precomputed vector
                per entry; returning the vectors anyway would misalign them.
        """
        vectors = np.asarray(customEmbeddings)
        if len(documents) != len(vectors):
            raise ValueError(
                f"expected {len(vectors)} documents to match the precomputed embeddings, got {len(documents)}"
            )
        return vectors

# Cluster on the precomputed vectors. Without the `embeddings` argument
# BERTopic embeds `docs` itself with its default model and customEmbeddings
# would never be used.
document_embeddings = np.asarray(customEmbeddings)
if len(document_embeddings) != len(docs):
    raise ValueError(
        f"docs ({len(docs)}) and customEmbeddings ({len(document_embeddings)}) are misaligned"
    )

topic_model = BERTopic(representation_model=representation_model, nr_topics="auto")
topics, probs = topic_model.fit_transform(docs, embeddings=document_embeddings)
hierarchical_topics = topic_model.hierarchical_topics(docs)

100%|██████████| 20/20 [00:00<00:00, 465.32it/s]


In [88]:
# Show the fitted model's topic hierarchy plot with the figure title blanked out.
topic_model.visualize_hierarchy(title='')

In [75]:
# One label per topic: the first element of every (value, score) pair stored
# under the "OpenAI" aspect, joined with " | ".
chatgpt_topic_labels = {
    topic: " | ".join(list(zip(*values))[0])
    for topic, values in topic_model.topic_aspects_["OpenAI"].items()
}
# Topic -1 gets a fixed label instead of a generated one.
chatgpt_topic_labels[-1] = "Outlier Topic"
topic_model.set_topic_labels(chatgpt_topic_labels)
# `custom_labels` is a bool/str switch, not the label mapping itself: True
# tells the plot to use the labels registered through set_topic_labels above.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, custom_labels=True, title='', width=600)