In [1]:
#export typical_file_parts

In [2]:
#export
import pandas as pd
import numpy as np
from github_search import python_tokens
from mlutil.feature_extraction import embeddings

In [3]:
%cd ..

/home/kuba/Projects/github_search


In [4]:
# Load only the first 20000 rows of the Python-files table. The path is relative,
# so it resolves against the directory selected by the `%cd ..` cell above.
python_files_df = pd.read_csv('data/python_files.csv', nrows=20000)

In [5]:
# Source text of the first file in the frame.
# NOTE(review): this name is not referenced again in the visible cells.
example_file = python_files_df['content'].iloc[0]

In [6]:
def select_class_names(lines):
    """Return the stripped text of every line that looks like a class header.

    A line qualifies when, ignoring leading whitespace, it begins with
    ``class `` and, ignoring trailing whitespace, it ends with ``:``.
    One-line class bodies (``class A: pass``) therefore do not qualify.
    """
    class_headers = []
    for raw_line in lines:
        opens_class = raw_line.lstrip().startswith('class ')
        closes_with_colon = raw_line.rstrip().endswith(':')
        if opens_class and closes_with_colon:
            class_headers.append(raw_line.strip())
    return class_headers


def select_function_names(lines):
    """Return the stripped text of every line that starts a function definition.

    Both ``def`` and ``async def`` headers are recognised (coroutines were
    previously skipped because their line does not begin with ``def ``).

    Parameters
    ----------
    lines : iterable of str
        Individual source lines, with or without indentation.

    Returns
    -------
    list of str
        Matching lines with surrounding whitespace removed, in input order.
    """
    # str.startswith accepts a tuple of alternative prefixes.
    header_prefixes = ('def ', 'async def ')
    return [line.strip() for line in lines if line.lstrip().startswith(header_prefixes)]


def select_lines(text):
    """Extract function and class header lines from a block of source text.

    The text is split on ``'\\n'``; function headers come first in the
    returned list, followed by class headers.
    """
    source_lines = text.split('\n')
    function_headers = select_function_names(source_lines)
    class_headers = select_class_names(source_lines)
    return function_headers + class_headers

In [7]:
#export
import io
import tokenize
import keyword
import re


# Reserved words of the running interpreter (from `keyword.kwlist`); used below
# to drop keyword tokens from the extracted identifier pieces.
PYTHON_KEYWORDS = set(keyword.kwlist)


def tokenize_snakecase(identifier):
    """Split an identifier on underscores.

    Leading, trailing or doubled underscores yield empty strings in the
    result (``'__init__'`` -> ``['', '', 'init', '', '']``).
    """
    pieces = identifier.split('_')
    return pieces


def tokenize_camelcase(identifier):
    """Split a camelCase / PascalCase identifier into its word pieces.

    A piece ends where a lowercase letter is followed by an uppercase one,
    where an uppercase letter is followed by a capitalised word (so acronyms
    stay whole: ``HTTPResponse`` -> ``HTTP``, ``Response``), or at the end
    of the string. An empty identifier gives an empty list.
    """
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    pieces = []
    for match in re.finditer(boundary_pattern, identifier):
        pieces.append(match.group(0))
    return pieces


def tokenize_python(identifier, lowercase=False):
    """Split a Python identifier into lowercase word pieces.

    Identifiers containing an underscore are split with
    ``tokenize_snakecase``; all others with ``tokenize_camelcase``.

    NOTE(review): ``lowercase`` is accepted but never consulted -- the pieces
    are lowercased unconditionally. Honouring the flag would change the
    result of calls that rely on the default, so it is left as is.
    """
    splitter = tokenize_snakecase if '_' in identifier else tokenize_camelcase
    return list(map(str.lower, splitter(identifier)))
    

def get_file_variable_token_set(file_text, min_token_length=2, lowercase=True):
    """Collect the distinct identifier word pieces that occur in Python source.

    The text is run through the standard ``tokenize`` module. Every NAME
    token that is not itself a Python keyword is split by ``tokenize_python``
    and the resulting pieces are filtered by length and against
    ``PYTHON_KEYWORDS``.

    Parameters
    ----------
    file_text : str
        Python source code.
    min_token_length : int
        Exclusive lower bound: only pieces strictly longer than this are
        kept (with the default of 2, a piece needs at least 3 characters).
    lowercase : bool
        Forwarded unchanged to ``tokenize_python``.

    Returns
    -------
    set of str

    Raises
    ------
    Whatever ``tokenize.generate_tokens`` raises for text it cannot
    tokenize; the exception is not caught here.
    """
    token_infos = tokenize.generate_tokens(io.StringIO(file_text).readline)
    identifiers = [
        info.string
        for info in token_infos
        # Keywords are NAME tokens too. They have to be rejected on their raw
        # spelling: once lowercased, ``True``/``False``/``None`` no longer
        # match PYTHON_KEYWORDS and used to leak into the result.
        if info.type == tokenize.NAME and not keyword.iskeyword(info.string)
    ]
    return {
        piece
        for identifier in identifiers
        for piece in tokenize_python(identifier, lowercase)
        if len(piece) > min_token_length and piece not in PYTHON_KEYWORDS
    }


def maybe_get_file_variable_token_string(file_text, min_token_length=2):
    """Best-effort, space-separated string of a file's identifier word pieces.

    Parameters
    ----------
    file_text : str
        Python source code.
    min_token_length : int
        Forwarded to ``get_file_variable_token_set`` (it was previously
        accepted but silently ignored).

    Returns
    -------
    str or None
        The pieces joined by single spaces, in sorted order so the result
        does not depend on set iteration order; ``None`` when extraction
        raises.
    """
    try:
        tokens = get_file_variable_token_set(file_text, min_token_length)
    except Exception:
        # Deliberately broad: any failure to extract tokens maps to None.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return None
    return ' '.join(sorted(tokens))

In [8]:
import transformers

In [9]:
# Number of files per repository among the first 10000 rows.
python_files_df.iloc[:10000]['repo_name'].value_counts()

models                           7634
transformers                      744
DeepPavlov                        407
GIZMO-CMZ                         105
faster-rcnn-fcn                    93
                                 ... 
DeepPrivInf2017                     2
BiSeNet-Implementation              1
Machine-Learning-CS539              1
stain-normalization-isbi-2017       1
kNN_SWin                            1
Name: repo_name, Length: 64, dtype: int64

In [10]:
# Function/class header lines from every file of the `transformers` repo.
# Files are joined with a newline (not a space) so that the last line of one
# file is never glued to the first line of the next one.
transformers_sources = python_files_df.loc[python_files_df['repo_name'] == 'transformers', 'content'].dropna()
example_selected_lines = select_lines('\n'.join(transformers_sources))

In [11]:
def get_selected_lines_and_repos(repos, file_contents):
    """Build a (repo, line) table of distinct function/class header lines.

    Parameters
    ----------
    repos : pandas.Series
        Repository name for each file.
    file_contents : pandas.Series
        Source text for each file, aligned with ``repos``; missing values
        are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``repo`` and ``line``: one row per distinct selected line of
        each repository. Repositories without any selected line are omitted.
    """
    selected_lines_by_repo = {}
    for repo in repos.unique():
        repo_sources = file_contents[repos == repo].dropna()
        # Join with a newline, not a space: with a space, a file that lacks a
        # trailing newline gets its last line merged with the first line of
        # the next file, which hides or corrupts header lines.
        repo_lines = np.unique(select_lines('\n'.join(repo_sources)))
        if len(repo_lines) > 0:
            selected_lines_by_repo[repo] = repo_lines
    line_repos = [repo for repo, lines in selected_lines_by_repo.items() for _ in lines]
    all_selected_lines = [line for lines in selected_lines_by_repo.values() for line in lines]
    return pd.DataFrame({'repo': line_repos, 'line': all_selected_lines})

In [12]:
# One row per (repo, distinct header line) over the whole loaded frame.
selected_lines_df = get_selected_lines_and_repos(python_files_df['repo_name'], python_files_df['content'])

In [13]:
import mlutil

In [14]:
%load_ext autoreload

In [15]:
from mlutil.feature_extraction import embeddings 

In [16]:
# Vectorizer configured with the 'microsoft/codebert-base' model name.
# NOTE(review): presumably this loads pretrained weights -- confirm in mlutil.
codebert_vectorizer = embeddings.TransformerVectorizer('microsoft/codebert-base')

In [17]:
# De-duplicate the selected lines (np.unique also sorts them) and convert the
# result back to a plain list. NOTE: this rebinds the name assigned in cell 10.
example_selected_lines = list(np.unique(example_selected_lines))

In [18]:
# Embed every distinct selected line with the CodeBERT vectorizer.
example_line_embeddings = codebert_vectorizer.transform(example_selected_lines)

100%|██████████| 53/53 [00:02<00:00, 22.76it/s]


In [19]:
# Sanity check of the embedding matrix dimensions
# (presumably rows = lines, columns = embedding width -- confirm).
example_line_embeddings.shape

(6708, 768)

In [20]:
#export
from sklearn import cluster, metrics, mixture


def select_centroid_prototype_indices(features, n_prototypes=10, random_state=None):
    """Pick the rows of ``features`` that lie closest to k-means centroids.

    Parameters
    ----------
    features : array-like
        One row per sample.
    n_prototypes : int
        Number of clusters requested; capped at the number of rows.
    random_state : int, RandomState instance or None
        Passed to ``KMeans``. Set it for a reproducible selection; the
        default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        Sorted, de-duplicated row indices. There may be fewer than
        ``n_prototypes`` of them when several centroids share the same
        nearest row. Empty input yields an empty integer array.
    """
    n_samples = len(features)
    if n_samples == 0:
        # Nothing to cluster; KMeans cannot be fitted with zero clusters.
        return np.array([], dtype=int)
    n_clusters = min(n_samples, n_prototypes)
    clusterer = cluster.KMeans(n_clusters, random_state=random_state)
    clusterer.fit(features)
    # Shape (n_clusters, n_samples): distance from each centroid to each row.
    centroid_to_sample = metrics.pairwise.euclidean_distances(clusterer.cluster_centers_, features)
    return np.unique(centroid_to_sample.argmin(axis=1))

In [21]:
# Module-level scratch values.
# NOTE(review): neither name is read by any later visible cell.
n_prototypes = 10 
data = example_line_embeddings

In [22]:
# NOTE(review): `prototype_lines_by_repo_types` is not defined in any visible
# cell, so this raises NameError on a fresh kernel (see the recorded output).
# Define it first or remove this cell.
frozenset(prototype_lines_by_repo_types.values())

NameError: name 'prototype_lines_by_repo_types' is not defined

In [None]:
# NOTE(review): `prototype_embeddings_by_repo` is not defined in any visible
# cell; this cell cannot run on a fresh kernel as written.
prototype_embeddings_by_repo['transformers']

In [None]:
#export
import attr
from sklearn import cluster, metrics, mixture


def select_centroid_prototype_indices(features, n_prototypes=10, random_state=None):
    """Pick the rows of ``features`` that lie closest to k-means centroids.

    Parameters
    ----------
    features : array-like
        One row per sample.
    n_prototypes : int
        Number of clusters requested; capped at the number of rows.
    random_state : int, RandomState instance or None
        Passed to ``KMeans``. Set it for a reproducible selection; the
        default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        Sorted, de-duplicated row indices. There may be fewer than
        ``n_prototypes`` of them when several centroids share the same
        nearest row. Empty input yields an empty integer array.
    """
    n_samples = len(features)
    if n_samples == 0:
        # Nothing to cluster; KMeans cannot be fitted with zero clusters.
        return np.array([], dtype=int)
    n_clusters = min(n_samples, n_prototypes)
    clusterer = cluster.KMeans(n_clusters, random_state=random_state)
    clusterer.fit(features)
    # Shape (n_clusters, n_samples): distance from each centroid to each row.
    centroid_to_sample = metrics.pairwise.euclidean_distances(clusterer.cluster_centers_, features)
    return np.unique(centroid_to_sample.argmin(axis=1))


@attr.s
class PrototypeSelector:
    """Selects, for each label, a handful of representative items.

    Items of one label are embedded with ``vectorizer`` and the items nearest
    to k-means centroids of those embeddings are kept as that label's
    prototypes (see ``select_centroid_prototype_indices``).
    """

    # Object exposing ``transform(list_of_items)``. The row indices of its
    # output are used to index the items, so it must return one row per item,
    # in the same order.
    vectorizer = attr.ib()
    # Upper bound on the number of prototypes kept per label.
    n_prototypes = attr.ib(default=10)

    def fit_prototypes(self, data, labels):
        """Compute prototypes per label and store them in ``self.prototypes``.

        Parameters
        ----------
        data : sequence
            Items to choose prototypes from.
        labels : sequence
            Label of each item, positionally aligned with ``data``.

        After the call ``self.prototypes`` maps each non-missing label to a
        numpy array holding that label's prototype items.
        """
        # Imported locally: no cell of this notebook imports tqdm at module
        # level, so the bare name raised NameError before.
        import tqdm

        self.prototypes = {}
        data = np.array(data)
        labels = pd.Series(labels)
        # unique() preserves first-appearance order (iterating a set does
        # not). Missing labels are dropped: NaN never compares equal, so it
        # would select zero rows and hand the vectorizer an empty list.
        for label in tqdm.tqdm(labels.dropna().unique()):
            label_mask = (labels == label).values
            label_data = data[label_mask]
            label_features = self.vectorizer.transform(list(label_data))
            label_prototype_indices = select_centroid_prototype_indices(label_features, self.n_prototypes)
            self.prototypes[label] = label_data[label_prototype_indices]

In [None]:
# Use the PrototypeSelector class defined in the preceding cell; no
# `prototype_selection` module is imported anywhere in this notebook.
codebert_selector = PrototypeSelector(codebert_vectorizer)

In [None]:
# Compute prototype lines per repository; the result is stored on
# `codebert_selector.prototypes`.
codebert_selector.fit_prototypes(selected_lines_df['line'], selected_lines_df['repo'])

In [None]:
# Mapping: repo name -> array of that repo's prototype lines.
codebert_selector.prototypes