### From a File


In [None]:
from main import create_taxonomy, add_categories

# Input file and the brand name handed to add_categories below.
filename = "cais_data.csv"
brand = "CAIS"

# Build the taxonomy from the file's "keyword" / "search_volume" columns.
# NOTE(review): brand=None is passed here even though `brand` is set above;
# only add_categories receives the brand name -- confirm this is intended.
taxonomy, df, samples = create_taxonomy(
    filename,
    text_column="keyword",
    search_volume_column="search_volume",
    platform="openai",  # "palm" or "openai"
    days=30,
    S=500,
    ngram_range=(1, 6),
    min_df=2,
    brand=None,
)

df = add_categories(taxonomy, df, brand)

# Persist the result, then show the first rows as the cell output.
df.to_csv("cais_taxonomy.csv", index=False)

df.head()

### From a Google Search Console (GSC) Account

In [None]:
# add_categories is imported here as well so this cell runs on its own,
# without first executing the "From a File" cell.
from main import add_categories, create_taxonomy

# Brand name handed to add_categories below.
brand = "Green Group"

# Build the taxonomy from the "sc-domain:" property; no column names are
# supplied for this source (both are None).
# NOTE(review): brand=None is passed here even though `brand` is set above;
# only add_categories receives the brand name -- confirm this is intended.
taxonomy, df, samples = create_taxonomy(
    "sc-domain:greengroupcompanies.com",
    text_column=None,
    search_volume_column=None,
    platform="openai",  # "palm" or "openai"
    days=30,
    S=100,
    ngram_range=(1, 6),
    min_df=2,
    brand=None,
    limit_queries=5,
)

df = add_categories(taxonomy, df, brand)

# Persist the result, then show the first rows as the cell output.
df.to_csv("greengroupcompanies_taxonomy.csv", index=False)

df.head()

In [9]:
from lib.api import openai
import settings

from tenacity import retry, wait_random_exponential, stop_after_attempt


@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text: str, model="text-embedding-ada-002") -> list[float]:
    """Return the embedding vector of ``text`` produced by ``model``.

    The text is sent as a single-element batch and the embedding of that
    one item is returned. The ``retry`` decorator re-invokes the call on
    failure with a randomized exponential wait (min=1, max=20) and stops
    after 6 attempts.
    """
    response = openai.Embedding.create(input=[text], model=model)
    # Only one input was sent, so the first data item is the one we want.
    first_item = response["data"][0]
    return first_item["embedding"]


# Quick check: embed a sample string and print the vector's length.
embedding = get_embedding(
    "Your text goes here",
    model="text-embedding-ada-002",
)
print(len(embedding))

1536


In [24]:
# Print the id of every listed model whose id contains 'embed'.
# A plain loop is used because the work is a side effect (printing); no
# list needs to be built.
for model_info in openai.Model.list().data:
    if 'embed' in model_info.id:
        print(model_info.id)

text-embedding-ada-002


In [13]:
# Show the attribute names available on openai.Model (exploratory cell).
dir(openai.Model)

['OBJECT_NAME',
 '_DeletableAPIResource__prepare_delete',
 '_ListableAPIResource__prepare_list_requestor',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__ior__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__or__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__ror__',
 '__setattr__',
 '__setitem__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_astatic_request',
 '_get_api_type_and_version',
 '_static_request',
 'adelete',
 'alist',
 'api_base',
 'api_base_override',
 'api_prefix',
 'arefresh',
 'arequest',
 'aretrieve',
 'auto_paging_iter',
 'azure_api_prefix',
 'azure_deployments_prefix',
 'class_url',
 'clear',
 'construct_from',

In [5]:
# Materialize and display everything palm.list_models() yields.
# NOTE(review): `palm` is not imported in any visible cell -- presumably it
# was imported in a cell that is not shown; confirm before re-running.
list(palm.list_models())

[Model(name='models/chat-bison-001', base_model_id='', version='001', display_name='Chat Bison', description='Chat-optimized generative language model.', input_token_limit=4096, output_token_limit=1024, supported_generation_methods=['generateMessage'], temperature=0.25, top_p=0.95, top_k=40),
 Model(name='models/text-bison-001', base_model_id='', version='001', display_name='Text Bison', description='Model targeted for text generation.', input_token_limit=8196, output_token_limit=1024, supported_generation_methods=['generateText'], temperature=0.7, top_p=0.95, top_k=40),
 Model(name='models/embedding-gecko-001', base_model_id='', version='001', display_name='Embedding Gecko', description='Obtain a distributed representation of a text.', input_token_limit=1024, output_token_limit=1, supported_generation_methods=['embedText'], temperature=None, top_p=None, top_k=None)]

### Add clustered taxonomy

In [None]:
from lib.clustering import ClusterTopics

# Queries to cluster, taken from the DataFrame produced by an earlier cell.
queries = df['original_query'].tolist()

# The existing taxonomy is handed to the model as its cluster categories.
model = ClusterTopics(
    min_cluster_size=3,
    min_samples=None,
    reduction_dims=5,
    cluster_model="agglomerative",
    cluster_categories=taxonomy,
    use_elbow=True,
    keep_outliers=False,
    n_jobs=3,
)

labels, text_labels = model.fit(queries)

# Pair each query with its text label; for a repeated query the label seen
# last wins. The lookup is then mapped back onto the DataFrame column.
label_lookup = dict(zip(queries, text_labels))
df['clustered_taxonomy'] = df['original_query'].map(label_lookup)

# Persist the result, then show the first rows as the cell output.
df.to_csv("cais_taxonomy_clustered.csv", index=False)

df.head()