## Load API keys for OpenAI embeddings and GitHub
- The OpenAI API key is required for the demo.
- The GitHub API key is optional, since PyPI does not require an API key for metadata.

In [1]:

import os
from dotenv import load_dotenv
load_dotenv()

# Read the API keys from the process environment; a key that is not set
# resolves to None.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
GITHUB_API_KEY = os.environ.get("GITHUB_API_KEY")

## Create embeddings for category definitions and semantic context
- Both can be modified in `settings/category_keywords.toml`.

In [2]:
from dynaconf import Dynaconf

# Load both settings files into a single Dynaconf object.
settings = Dynaconf(
    settings_files=["settings/category_keywords.toml", "settings/component_testset.toml"],
)

# Plain-dict copy of the `category_keywords` section, used to build the keyword embeddings.
component_keywords = settings.category_keywords.to_dict()

In [3]:
import pandas as pd
from src.utils import get_kw_embedding_df
import os

# Reuse the cached keyword embeddings when the cache file exists; otherwise
# compute them from `component_keywords` and write the cache for the next run.
# NOTE: the cache is keyed only on the file's existence, so it is NOT refreshed
# when the keyword settings change -- delete data/kw_emb_df to force a rebuild.
if os.path.exists('data/kw_emb_df'):
    kw_emb_df = pd.read_feather("data/kw_emb_df")
else:
    kw_emb_df = get_kw_embedding_df(component_keywords)
    # Create the cache directory first so a fresh checkout without `data/`
    # does not fail when the feather file is written.
    os.makedirs("data", exist_ok=True)
    kw_emb_df.to_feather("data/kw_emb_df")



## Core function for semantic ranking and clustering
1. Calculate the rank based on the distance between the query and the keywords of each category.
2. Apply hybrid clustering using OPTICS and standard deviation, based on the score distances.

In [4]:
import numpy as np
import json
from src.model import Model
from src.utils import calc_kw_distance, output_json, get_emb_batch
from typing import List


def run(query: List[str]):
    """Rank the keyword categories against ``query`` and print the result as JSON.

    The query is embedded with ``get_emb_batch`` and scored against the
    module-level ``kw_emb_df`` with ``calc_kw_distance``. The ``distance``
    column is averaged per ``category_name`` and sorted in descending order,
    then three grouping columns (``algo_grouping``, ``std_grouping`` and the
    consolidated ``grouping``) are derived with ``Model``.

    Args:
        query: The texts to classify. A bare string is also accepted and is
            treated as a one-element list.

    Returns:
        None. The output of ``output_json`` is printed to stdout.
    """
    # A single string is handled as a batch of one.
    queries = [query] if isinstance(query, str) else query

    query_emb = get_emb_batch(queries)
    distances = calc_kw_distance(query_emb, kw_emb_df)

    # One row per category: mean distance, highest value first.
    ranked = (
        distances
        .groupby(['category_name'])['distance']
        .mean()
        .reset_index()
        .sort_values('distance', ascending=False)
    )

    model = Model()
    # Each grouping method receives its own freshly built list of distances.
    ranked['algo_grouping'] = model.algo_grouping(ranked['distance'].tolist())
    ranked['std_grouping'] = model.std_grouping(ranked['distance'].tolist())
    # Row-wise combination of the grouping columns computed above.
    ranked['grouping'] = ranked.apply(model.consolidate_grouping, axis=1)

    print(output_json(ranked))


## GitHub examples

In [5]:
# Repository used for the GitHub examples below.
repo_url = "https://github.com/scikit-learn/scikit-learn"

In [6]:
from src.github_helper import GhMetadata

# Build the GitHub metadata helper from the repository URL.
# NOTE(review): the original cell had `owner=` / `repo=` arguments commented
# out -- presumably an alternative to `repo_url`; confirm against GhMetadata.
gh = GhMetadata(repo_url=repo_url, verbose=True)

# Texts used as classification queries in the following cells.
readme = gh.get_readme()
topic = gh.get_github_topic()

In [7]:
# Classify the repository using its README text as the query.
run(readme)

{
  "winner": [
    "Machine Learning & AI Frameworks"
  ],
  "prediction_cluster_raw": {
    "1": [
      "Machine Learning & AI Frameworks"
    ],
    "2": [
      "Testing & Quality Validation"
    ],
    "3": [
      "Logging & Monitoring"
    ],
    "4": [
      "Configuration Management",
      "Database Interaction",
      "Operating System Interaction",
      "Authentication & Authorization"
    ],
    "5": [
      "File & Data Handling",
      "Web Framework Components"
    ],
    "6": [
      "Network Communication",
      "Cryptography",
      "Memory Management"
    ]
  },
  "prediction_distance_raw": {
    "Machine Learning & AI Frameworks": 0.2428405722771753,
    "Testing & Quality Validation": 0.16932297212452993,
    "Logging & Monitoring": 0.12828668472159763,
    "Configuration Management": 0.10654633356608996,
    "Database Interaction": 0.09074698306559636,
    "Operating System Interaction": 0.08958592405704817,
    "Authentication & Authorization": 0.083741490751

In [None]:
# Classify the repository using its GitHub topics as the query.
run(topic)

{
  "winner": [
    "Machine Learning & AI Frameworks",
    "Database Interaction",
    "Testing & Quality Validation",
    "Logging & Monitoring",
    "File & Data Handling"
  ],
  "prediction_cluster_raw": {
    "0": [
      "Machine Learning & AI Frameworks",
      "Database Interaction",
      "Testing & Quality Validation",
      "Logging & Monitoring",
      "File & Data Handling"
    ],
    "1": [
      "Operating System Interaction"
    ],
    "2": [
      "Web Framework Components",
      "Cryptography"
    ],
    "3": [
      "Memory Management",
      "Configuration Management",
      "Authentication & Authorization",
      "Network Communication"
    ]
  },
  "prediction_distance_raw": {
    "Machine Learning & AI Frameworks": 0.345456016605427,
    "Database Interaction": 0.30818221621039643,
    "Testing & Quality Validation": 0.2943427296141345,
    "Logging & Monitoring": 0.28661319949573233,
    "File & Data Handling": 0.28106328920898627,
    "Operating System Interac

## PyPI examples

In [1]:
from src.pypi_helper import PypiMetadata

In [2]:
# Build the PyPI metadata helper for the 'scikit-learn' package and collect its
# output; the cells below index the result by 'summary' and 'topic'.
pp = PypiMetadata('scikit-learn')
pp_meta = pp.get_output()

In [14]:
# Classify the package using the 'summary' field of its PyPI metadata.
run(pp_meta['summary'])


{
  "winner": [
    "Machine Learning & AI Frameworks"
  ],
  "prediction_cluster_raw": {
    "1": [
      "Machine Learning & AI Frameworks"
    ],
    "2": [
      "Database Interaction",
      "Operating System Interaction",
      "Logging & Monitoring",
      "File & Data Handling"
    ],
    "3": [
      "Testing & Quality Validation",
      "Network Communication",
      "Web Framework Components",
      "Configuration Management"
    ],
    "4": [
      "Cryptography",
      "Authentication & Authorization",
      "Memory Management"
    ]
  },
  "prediction_distance_raw": {
    "Machine Learning & AI Frameworks": 0.2881788745667235,
    "Database Interaction": 0.164441953259365,
    "Operating System Interaction": 0.14782518568979633,
    "Logging & Monitoring": 0.14189976873211133,
    "File & Data Handling": 0.13805172347690686,
    "Testing & Quality Validation": 0.1251943102803027,
    "Network Communication": 0.12399576405428482,
    "Web Framework Components": 0.117711490

In [13]:
# Classify the package using the 'topic' field of its PyPI metadata.
# NOTE(review): the recorded output for this cell does not list
# "Machine Learning & AI Frameworks" among the winners for scikit-learn --
# confirm what pp_meta['topic'] contains before relying on this query.
run(pp_meta['topic'])


{
  "winner": [
    "Testing & Quality Validation",
    "Operating System Interaction"
  ],
  "prediction_cluster_raw": {
    "0": [
      "Testing & Quality Validation",
      "Operating System Interaction"
    ],
    "1": [
      "Database Interaction",
      "Configuration Management"
    ],
    "2": [
      "File & Data Handling",
      "Web Framework Components"
    ],
    "3": [
      "Machine Learning & AI Frameworks",
      "Network Communication",
      "Logging & Monitoring",
      "Authentication & Authorization"
    ],
    "4": [
      "Cryptography",
      "Memory Management"
    ]
  },
  "prediction_distance_raw": {
    "Testing & Quality Validation": 0.2795465695157602,
    "Operating System Interaction": 0.2782623050734585,
    "Database Interaction": 0.25290058306942753,
    "Configuration Management": 0.25231756531915567,
    "File & Data Handling": 0.2449421698503955,
    "Web Framework Components": 0.24349921890645487,
    "Machine Learning & AI Frameworks": 0.23359