In [8]:

import os
from dotenv import load_dotenv
# Load variables from a local .env file (if any) before reading the environment.
load_dotenv()

# Each value is None when the corresponding variable is not set.
OPENAI_API_KEY, GITHUB_API_KEY = (
    os.environ.get(var_name) for var_name in ("OPENAI_API_KEY", "GITHUB_API_KEY")
)

In [9]:
from dynaconf import Dynaconf

# Load both TOML settings files into a single Dynaconf settings object.
settings = Dynaconf(
    settings_files=[
        "settings/category_keywords.toml",
        "settings/component_testset.toml",
    ],
)

# Plain-dict snapshot of the `category_keywords` section; used below as the
# category -> keywords mapping (presumably each value is a list of keyword
# strings -- confirm against the TOML file).
component_keywords = settings.category_keywords.to_dict()

In [10]:
import pandas as pd
from openai import OpenAI
from src.utils import TokenUtils
from typing import List

def get_emb_batch(batch_text: List[str], emb_size: int=512) -> List[List[float]]:
    """Generate embeddings for a batch of text strings.

    Args:
        batch_text: Texts to embed. Each one is passed through
            ``TokenUtils(text).trim()`` before being sent (presumably to keep
            it within the model's token limit -- confirm in ``src.utils``).
        emb_size: Number of leading dimensions kept from every returned
            vector; the rest of the vector is discarded.

    Returns:
        One list of floats (at most ``emb_size`` long) per entry of the API
        response. An empty batch returns an empty list without calling the API.

    Raises:
        RuntimeError: If trimming or the embeddings request fails. The
            original exception is attached as ``__cause__``.
    """
    # Nothing to embed: skip the request instead of sending an empty input.
    if len(batch_text) == 0:
        return []

    try:
        response = OpenAI().embeddings.create(
            input=[TokenUtils(input_text).trim() for input_text in batch_text],
            model='text-embedding-3-small'
        )
        return [data.embedding[:emb_size] for data in response.data]

    except Exception as e:
        # Chain explicitly so the underlying API/trim error stays in the traceback.
        raise RuntimeError(f"Failed to get embeddings: {e}") from e


def get_kw_embedding_df(kw: dict) -> pd.DataFrame:
    """Convert categories and their keywords to a DataFrame with embeddings.

    Args:
        kw: Mapping of category name -> iterable of keywords for that category.

    Returns:
        DataFrame with one row per (category, keyword) pair and the columns
        ``category_name``, ``keywords`` and ``embeddings`` (the value returned
        by ``get_emb_batch`` for that keyword).
    """
    # Build the rows from the `kw` argument rather than a notebook-level global,
    # so the function works for whatever mapping the caller passes in.
    kw_df = pd.DataFrame(
        [(category, keyword) for category, keywords in kw.items() for keyword in keywords],
        columns=["category_name", "keywords"]
    )

    # One embedding request for all keywords; results are assigned positionally.
    kw_df['embeddings'] = get_emb_batch(kw_df["keywords"].tolist())

    return kw_df

In [11]:
# Load the cached keyword embeddings. When the feather file is missing, rebuild
# them (one embeddings API request) and cache the result, so the notebook also
# runs from a clean checkout.
KW_EMB_PATH = "data/kw_emb_df"
if os.path.exists(KW_EMB_PATH):
    kw_emb_df = pd.read_feather(KW_EMB_PATH)
else:
    kw_emb_df = get_kw_embedding_df(component_keywords)
    kw_emb_df.to_feather(KW_EMB_PATH)

# GitHub

In [12]:
# GitHub repository under analysis; owner and repository share the same name here.
owner = repo = 'scikit-learn'

In [13]:
from src.github_helper import GhMetadata

# Metadata client for the repository selected above, with verbose=True.
gh = GhMetadata(owner=owner, repo=repo, verbose=True)

# The two text sources that are embedded further down: README and repository topics.
readme = gh.get_readme()
topic = gh.get_github_topic()

In [18]:
# Embed the README as a one-item batch, and the topics as a batch of their own.
readme_emb = get_emb_batch(batch_text=[readme])
topic_emb = get_emb_batch(batch_text=topic)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def calc_kw_distance(query_array: List[List[float]], kw_emb_df: pd.DataFrame):
    """Score every keyword row against one or more query embeddings.

    Despite the column name, the ``distance`` value is a cosine *similarity*:
    a larger value means the keyword is closer to the query.

    Args:
        query_array: One embedding per query, as a list of vectors or a 2-D
            array. All vectors must have the same length as the keyword
            embeddings.
        kw_emb_df: Frame with an ``embeddings`` column holding one vector per
            row. It is copied, not modified.

    Returns:
        A copy of ``kw_emb_df`` without the ``embeddings`` column and with a
        1-D ``distance`` column: the similarity to the query, averaged over
        the queries when more than one is given.

    Raises:
        ValueError: If ``kw_emb_df`` has no ``embeddings`` column or
            ``query_array`` is empty.
    """
    if 'embeddings' not in kw_emb_df.columns:
        raise ValueError("kw_emb_df must have an 'embeddings' column.")
    # Use len() instead of truthiness: `not array` is ambiguous for NumPy input.
    if query_array is None or len(query_array) == 0:
        raise ValueError("query_array must not be empty.")

    df = kw_emb_df.copy()
    # Result shape is (n_keywords, n_queries).
    distances = cosine_similarity(
        np.vstack(df['embeddings'].tolist()),
        np.vstack(query_array)
    )
    if distances.shape[1] > 1:
        print('Multiple queries found, averaging similartiy...')

    # Collapse the query axis in every case (the mean over a single column is
    # that column), so 'distance' is always a flat vector of scalars.
    df['distance'] = distances.mean(axis=1)
    return df.drop(['embeddings'], axis=1)
    

In [21]:
# Similarity of every keyword to the README embedding (a single query).
score_df = calc_kw_distance(query_array=readme_emb, kw_emb_df=kw_emb_df)

In [23]:
# Mean README similarity of the keywords in each category.
score_df.groupby('category_name')['distance'].mean()

category_name
Authentication & Authorization      0.083741
Configuration Management            0.106546
Cryptography                        0.068960
Database Interaction                0.090747
File & Data Handling                0.074846
Logging & Monitoring                0.128287
Machine Learning & AI Frameworks    0.242841
Memory Management                   0.056111
Network Communication               0.069412
Operating System Interaction        0.089586
Testing & Quality Validation        0.169323
Web Framework Components            0.072806
Name: distance, dtype: float64

In [None]:
from sklearn.cluster import OPTICS

class Model:
    """Split a list of category scores into ranked groups (0 = top group).

    Two strategies are offered: clustering with OPTICS (``algo_grouping``) and
    splitting at large gaps between neighbouring scores (``std_grouping``).
    ``consolidate_grouping`` chooses between the two results.
    """

    def _curate_group_id_w_outlier(self, a):
        """Re-base cluster labels when the first (best) score is an outlier.

        The labels are assumed to come from clustering scores sorted in
        descending order. If the first label is -1, it becomes group 0 and
        every non-negative label is shifted up by one. A -1 at any other
        position is kept as -1.

        Args:
            a: Sequence of integer cluster labels.

        Returns:
            List of curated labels, same length as ``a``.
        """
        labels = list(a)
        if not labels:
            return []

        # A leading outlier is the top score standing alone: promote it to group 0.
        shift = 1 if labels[0] == -1 else 0
        curated = [labels[0] + shift]
        for label in labels[1:]:
            curated.append(label if label == -1 else label + shift)
        return curated

    def algo_grouping(self, scores: List[float]):
        """Group scores with OPTICS clustering.

        Args:
            scores: Scores in descending order.

        Returns:
            List of group ids, one per score. Fewer than two scores cannot be
            clustered (``min_samples=2``), so they all get group 0.

        Raises:
            AssertionError: If ``scores`` is not in descending order.
        """
        assert all(
            prev >= curr for prev, curr in zip(scores, scores[1:])
        ), "scores input to be descending order"

        # OPTICS with min_samples=2 needs at least two samples.
        if len(scores) < 2:
            return [0] * len(scores)

        # model = DBSCAN(eps=0.01, min_samples=2)
        model = OPTICS(min_samples=2, xi=0.05)
        grouping = model.fit_predict(np.array(scores).reshape(-1, 1))
        return self._curate_group_id_w_outlier(grouping)

    def std_grouping(self, scores: List[float]):
        """Group scores by starting a new group at every unusually large gap.

        The scores are sorted in descending order here, so the returned ids
        follow that sorted order. A gap between neighbouring scores opens a new
        group when it exceeds the standard deviation of all gaps.

        Args:
            scores: Scores in any order.

        Returns:
            List of group ids (starting at 0, non-decreasing), one per score
            in descending score order. Empty input returns an empty list.
        """
        sorted_scores = sorted(scores)[::-1]
        if not sorted_scores:
            return []
        if len(sorted_scores) == 1:
            # No gaps to measure; the single score is its own top group.
            return [0]

        score_gaps = [
            higher - lower
            for higher, lower in zip(sorted_scores, sorted_scores[1:])
        ]

        # Standard deviation of the gaps is the split threshold.
        threshold = np.std(score_gaps)

        grouped_categories = [0]
        counter = 0
        for gap in score_gaps:
            # Gaps are non-negative because the scores are sorted descending.
            if gap > threshold:
                counter += 1
            grouped_categories.append(counter)
        return grouped_categories

    def consolidate_grouping(self, row):
        """Choose between the std and algo groupings.

        The decision compares how many entries each method assigns to group 0,
        so it is only meaningful when ``row`` holds the full vector of group
        ids for each method. With scalar entries, as produced by a per-row
        ``DataFrame.apply``, the group-0 count is at most 1 and the algo value
        is always returned.

        Args:
            row: Mapping or frame with ``'std_grouping'`` and
                ``'algo_grouping'`` entries.

        Returns:
            NumPy array: the algo grouping when its top group has at most 3
            members or is no larger than the std top group, otherwise the std
            grouping.
        """
        std_grouping = np.array(row['std_grouping'])
        algo_grouping = np.array(row['algo_grouping'])

        algo_top_size = np.sum(algo_grouping == 0)
        std_top_size = np.sum(std_grouping == 0)

        if algo_top_size <= 3 or algo_top_size <= std_top_size:
            return algo_grouping

        return std_grouping


In [None]:
# Rank the categories by mean keyword similarity, best match first.
df = (
    score_df.groupby(['category_name'])['distance']
    .mean()
    .reset_index()
    .sort_values('distance', ascending=False)
)
M = Model()
df['algo_grouping'] = M.algo_grouping(df['distance'].tolist())
df['std_grouping'] = M.std_grouping(df['distance'].tolist())
# consolidate_grouping compares how many categories each method puts in group 0,
# so it must see the whole columns. Applied row by row it only sees scalars and
# always returns the algo label. The returned array is assigned positionally.
df['grouping'] = M.consolidate_grouping(df)

# PyPI

In [11]:
from src.pypi_helper import PypiMetadata

In [12]:
# Fetch the PyPI metadata for the `torch` package and show it as the cell output.
pp = PypiMetadata("torch")
pp_meta = pp.get_output()
pp_meta

{'status': 'success',
 'description': '\n--------------------------------------------------------------------------------\n\nPyTorch is a Python package that provides two high-level features:\n- Tensor computation (like NumPy) with strong GPU acceleration\n- Deep neural networks built on a tape-based autograd system\n\nYou can reuse your favorite Python packages such as NumPy, SciPy, and Cython to extend PyTorch when needed.\n\nOur trunk health (Continuous Integration signals) can be found at [hud.pytorch.org](https://hud.pytorch.org/ci/pytorch/pytorch/main).\n\n\n\n- [More About PyTorch](#more-about-pytorch)\n  - [A GPU-Ready Tensor Library](#a-gpu-ready-tensor-library)\n  - [Dynamic Neural Networks: Tape-Based Autograd](#dynamic-neural-networks-tape-based-autograd)\n  - [Python First](#python-first)\n  - [Imperative Experiences](#imperative-experiences)\n  - [Fast and Lean](#fast-and-lean)\n  - [Extensions Without Pain](#extensions-without-pain)\n- [Installation](#installation)\n  - 