In [1]:
# default_exp python_tokens

In [2]:
# export
import fasttext
import pandas as pd
import tokenize
import io
import keyword
import gensim
from mlutil.feature_extraction import embeddings
from sklearn import metrics

In [3]:
%cd ..

/home/kuba/Projects/github_search


In [4]:
python_files_df = pd.read_csv("data/python_files.csv")

In [5]:
python_files_df.shape

(560183, 5)

In [6]:
python_files_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560183 entries, 0 to 560182
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   owner      560183 non-null  object
 1   repo_name  560183 non-null  object
 2   file_path  560183 non-null  object
 3   content    546484 non-null  object
 4   sha        560183 non-null  object
dtypes: object(5)
memory usage: 21.4+ MB


In [7]:
file_text = python_files_df.content[0]

In [8]:
s = "abc_"

In [9]:
s.strip("_")

'abc'

In [10]:
# export
import re


PYTHON_KEYWORDS = set(keyword.kwlist)


def tokenize_snakecase(identifier):
    return identifier.split("_")


def tokenize_camelcase(identifier):
    matches = re.finditer(
        ".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier
    )
    return [m.group(0) for m in matches]


def tokenize_python(identifier, lowercase=False):
    if "_" in identifier:
        tokens = tokenize_snakecase(identifier)
    else:
        tokens = tokenize_camelcase(identifier)
    return [t.lower() for t in tokens]


def get_file_variable_token_set(file_text, min_token_length=2, lowercase=True):
    token_infos = list(tokenize.generate_tokens(io.StringIO(file_text).readline))
    raw_tokens = [t.string for t in token_infos if t.type == 1]
    all_tokens = (tokenize_python(t, lowercase) for t in raw_tokens)
    all_tokens = [
        token
        for tokens in all_tokens
        for token in tokens
        if len(token) > min_token_length and not token in PYTHON_KEYWORDS
    ]
    return set(all_tokens)


def maybe_get_file_variable_token_string(file_text, min_token_length=2):
    try:
        tokens = get_file_variable_token_set(file_text)
    except:
        return None
    return " ".join(tokens)

In [11]:
tokens = get_file_variable_token_set(python_files_df.content[0])

In [12]:
python_files_df.content[0]

"#!/usr/bin/env python3\nimport subprocess\nimport shlex\nimport re\nimport os\nimport argparse\n\nOKGREEN = '\\033[92m'\nFAIL = '\\033[91m'\nENDC = '\\033[0m'\n\nparser = argparse.ArgumentParser()\nparser.add_argument('--no-gpu', action='store_true')\nparser.add_argument('--gpu-id', type=int)\nparser.add_argument('dirs', nargs='*')\nargs = parser.parse_args()\n\nextra_params = []\nif args.no_gpu:\n    extra_params.append('--no-gpu')\nif args.gpu_id is not None:\n    extra_params += ['--gpu-id', str(args.gpu_id)]\n\ndef failure(message):\n    print('{}failure: {}{}'.format(FAIL, message, ENDC))\ndef success(message):\n    print('{}success: {}{}'.format(OKGREEN, message, ENDC))\n\nlog_file = os.path.join('tests', 'log.txt')\n\ntry:\n    os.remove(log_file)\nexcept FileNotFoundError:\n    pass\n\n\ndef get_best_score(log_file):\n    scores = []\n    with open(log_file) as f:\n        for line in f:\n            score_ = re.search(r' (score|bleu|ter|loss|cer|wer|bleu1)=(.*?) ', line + ' '

In [13]:
" ".join(set(tokens))

'argparse path output dirs max score success append str best stdout basename called subprocess action endc join get float search len file parse remove line nargs found check print decode config okgreen argument split args format listdir type scores none strip fail error write name process log isdir stderr parser shlex message params run extra min open add gpu int group dir failure'

In [14]:
fasttext_gensim_filename = "data/wiki-news-300d-1M.vec"
fasttext_filename = "data/cc.en.300.bin"

In [15]:
%%time
fasttext_model = fasttext.load_model(fasttext_filename)

CPU times: user 1.54 s, sys: 1.5 s, total: 3.04 s
Wall time: 3.04 s




In [16]:
%%time
fasttext_gensim_model = gensim.models.FastText.load_fasttext_format(fasttext_filename)



CPU times: user 37.8 s, sys: 3.38 s, total: 41.1 s
Wall time: 41.2 s


In [17]:
fasttext_model.get_subwords("base_name")

(['<base', 'base_', 'ase_n', 'se_na', 'e_nam', '_name', 'name>'],
 array([3601990, 3142405, 2240913, 3154387, 2909247, 3169229, 2929672]))

In [18]:
%load_ext autoreload
%autoreload 2

In [19]:
kv = embeddings.load_gensim_embedding_model("glove-twitter-50")

In [20]:
kv.wv.vocab

  kv.wv.vocab


{'<user>': <gensim.models.keyedvectors.Vocab at 0x7f9ca899e550>,
 '.': <gensim.models.keyedvectors.Vocab at 0x7f9ca899e9a0>,
 ':': <gensim.models.keyedvectors.Vocab at 0x7f9ca899e940>,
 'rt': <gensim.models.keyedvectors.Vocab at 0x7f9ca899eb20>,
 ',': <gensim.models.keyedvectors.Vocab at 0x7f9ca899ebe0>,
 '<repeat>': <gensim.models.keyedvectors.Vocab at 0x7f9ca899e820>,
 '<hashtag>': <gensim.models.keyedvectors.Vocab at 0x7f9ca899e370>,
 '<number>': <gensim.models.keyedvectors.Vocab at 0x7f9ca899e9d0>,
 '<url>': <gensim.models.keyedvectors.Vocab at 0x7f9ca899e7c0>,
 '!': <gensim.models.keyedvectors.Vocab at 0x7f9ca899ec70>,
 'i': <gensim.models.keyedvectors.Vocab at 0x7f9ca899e670>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7f9ca899e580>,
 '"': <gensim.models.keyedvectors.Vocab at 0x7f9788854e20>,
 'the': <gensim.models.keyedvectors.Vocab at 0x7f9788854d30>,
 '?': <gensim.models.keyedvectors.Vocab at 0x7f9788854fd0>,
 'you': <gensim.models.keyedvectors.Vocab at 0x7f9788854f10>,
 'to

In [21]:
fasttext_gensim_model.wv.vocab

{',': <gensim.models.keyedvectors.Vocab at 0x7f9ca3ef1df0>,
 'the': <gensim.models.keyedvectors.Vocab at 0x7f9ca3ef1ca0>,
 '.': <gensim.models.keyedvectors.Vocab at 0x7f9ca3ef1f10>,
 'and': <gensim.models.keyedvectors.Vocab at 0x7f9ca3ef1f70>,
 'to': <gensim.models.keyedvectors.Vocab at 0x7f9ca3ef1fd0>,
 'of': <gensim.models.keyedvectors.Vocab at 0x7f9beac42070>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7f9beac420d0>,
 '</s>': <gensim.models.keyedvectors.Vocab at 0x7f9beac42130>,
 'in': <gensim.models.keyedvectors.Vocab at 0x7f9beac42190>,
 'is': <gensim.models.keyedvectors.Vocab at 0x7f9beac421f0>,
 ':': <gensim.models.keyedvectors.Vocab at 0x7f9beac42250>,
 'I': <gensim.models.keyedvectors.Vocab at 0x7f9beac422b0>,
 'for': <gensim.models.keyedvectors.Vocab at 0x7f9beac42310>,
 'that': <gensim.models.keyedvectors.Vocab at 0x7f9beac42370>,
 ')': <gensim.models.keyedvectors.Vocab at 0x7f9beac423d0>,
 '"': <gensim.models.keyedvectors.Vocab at 0x7f9beac42430>,
 '(': <gensim.models.key

In [22]:
emb_model = embeddings.AverageWordEmbeddingsVectorizer(fasttext_gensim_model)

In [23]:
fasttext_model.get_subwords("base name")

(['<base', 'base ', 'ase n', 'se na', 'e nam', ' name', 'name>'],
 array([3601990, 2046056, 3827154, 3993574, 2708980, 3665660, 2929672]))

In [24]:
fasttext_gensim_model.wv.similarity("base name", "base_name")

0.3374281

In [25]:
metrics.pairwise.cosine_similarity(
    [(fasttext_model["base_"] + fasttext_model["_name"])], [fasttext_model["base_name"]]
)

array([[0.65326834]], dtype=float32)

In [26]:
metrics.pairwise.cosine_similarity(
    [(fasttext_model["file_"] + fasttext_model["_name"])], [fasttext_model["file_name"]]
)

array([[0.6577994]], dtype=float32)

In [27]:
metrics.pairwise.cosine_similarity(
    [(fasttext_model["file"] + fasttext_model["name"])], [fasttext_model["filename"]]
)

array([[0.65270483]], dtype=float32)

In [28]:
metrics.pairwise.cosine_similarity(
    [fasttext_model["file name"]], [fasttext_model["filename"]]
)

array([[0.15743831]], dtype=float32)

In [29]:
fasttext_model.get_subwords("<base_")

(['<<bas', '<base', 'base_', 'ase_>'],
 array([3560493, 3601990, 3142405, 2915201]))

In [30]:
metrics.pairwise.cosine_similarity(
    [(fasttext_model["base name"])], [fasttext_model["base_name"]]
)

array([[0.33742806]], dtype=float32)

In [31]:
fasttext_gensim_model["gpu_id"]

  fasttext_gensim_model['gpu_id']


array([-4.89002280e-03, -5.09326421e-02, -6.47825655e-04,  1.68999881e-02,
        2.91821212e-02, -7.94018656e-02, -1.58723239e-02, -1.39547074e-02,
        3.30954827e-02, -3.94083597e-02,  6.53056502e-02,  1.74441896e-02,
        2.21977122e-02,  1.83864720e-02, -5.92345037e-02, -2.61607878e-02,
        1.42199378e-02, -1.59157179e-02, -2.17598081e-02,  2.42450852e-02,
        1.18860025e-02,  1.47764860e-02,  1.41005274e-02,  1.46108773e-03,
        6.47664741e-02,  3.70978331e-03, -5.28232493e-02, -7.49736279e-03,
        4.34475690e-02,  8.76194388e-02,  3.18839122e-03, -9.15196026e-04,
        2.14361511e-02,  1.03679765e-02,  2.98034940e-02, -1.99553110e-02,
       -2.31093001e-02,  5.11297323e-02,  7.60219526e-03, -3.07187531e-03,
        2.07061246e-02,  2.18089446e-02, -2.75542717e-02,  1.44086909e-02,
        4.88531925e-02, -1.72042754e-02, -4.72932588e-03,  4.08388600e-02,
       -1.11545548e-02,  5.15579525e-03, -2.47739572e-02,  3.27259377e-02,
       -4.80738692e-02, -

In [32]:
metrics.pairwise.cosine_similarity(
    [fasttext_gensim_model["gpu_"] + fasttext_gensim_model["_id"]],
    [fasttext_gensim_model["gpu_id"]],
)

  metrics.pairwise.cosine_similarity([fasttext_gensim_model['gpu_'] + fasttext_gensim_model['_id']], [fasttext_gensim_model['gpu_id']])


array([[0.16885671]], dtype=float32)

In [33]:
metrics.pairwise.cosine_similarity(
    [fasttext_gensim_model["gpu"]], [fasttext_gensim_model["gpu_id"]]
)

  metrics.pairwise.cosine_similarity([fasttext_gensim_model['gpu']], [fasttext_gensim_model['gpu_id']])


array([[0.09236824]], dtype=float32)

In [34]:
"basename" in fasttext_gensim_model.wv.vocab

True

In [35]:
# fasttext_gensim_model.similar_by_word('base name', restrict_vocab=['basename'])

In [37]:
%%time
python_files_df["tokens"] = python_files_df["content"].apply(
    maybe_get_file_variable_token_string
)

CPU times: user 19min 45s, sys: 2.06 s, total: 19min 47s
Wall time: 19min 48s


In [38]:
python_files_df.to_csv("data/python_files_with_tokens.csv", index=None)

## Tokenization with snake_case and camelCase

In [39]:
sample_indices = [
    440474,
    36781,
    354010,
    178387,
    419719,
    339569,
    152156,
    60910,
    222558,
    193026,
    122137,
    124549,
    89980,
    239861,
    414256,
    338844,
    10879,
    30335,
    21236,
    97763,
    311120,
    17600,
    287534,
    332230,
    506907,
    266700,
    503993,
    550945,
    85899,
    174301,
    556229,
    393373,
    293069,
    49026,
    398339,
    249483,
    379641,
    22215,
    36929,
    553842,
    424590,
    487848,
    109245,
    388546,
    355631,
    28722,
    278171,
    146009,
    547155,
    106539,
    75253,
    384480,
    305051,
    184188,
    492219,
    347782,
    394129,
    460031,
    528396,
    196243,
    131261,
    425323,
    83303,
    370216,
    343974,
    488911,
    2747,
    507266,
    200215,
    58190,
    60755,
    507065,
    427642,
    26555,
    92877,
    134734,
    367518,
    175535,
    169204,
    415873,
    384932,
    273915,
    63875,
    82378,
    140190,
    399600,
    78494,
    44481,
    479709,
    331486,
    86315,
    306864,
    163347,
    192107,
    313507,
    537156,
    390620,
    425218,
    146966,
    460223,
]

In [40]:
example_file_contents = python_files_df["content"].iloc[sample_indices]
print(example_file_contents.iloc[1])

# Copyright 2019 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Local feature aggregation similarity computation.

For more details, please refer to the paper:
"Detect-to-Retrieve: Efficient Regional Aggregation for Image Search",
Proc. CVPR'19 (https://arxiv.org/abs/1812.01584).
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from delf import aggregation_config_pb2

In [41]:
all_tokens = [
    token
    for contents in example_file_contents
    if type(contents) is str
    for token in get_file_variable_token_set(contents, lowercase=False)
]

In [42]:
snakecase_embeddable_tokens = [
    t
    for t in all_tokens
    if all(
        token_part in fasttext_gensim_model.wv.vocab
        for token_part in tokenize_snakecase(t)
    )
]
non_snakecase_embeddable_tokens = [
    t
    for t in all_tokens
    if not all(
        token_part in fasttext_gensim_model.wv.vocab
        for token_part in tokenize_snakecase(t)
    )
]

In [43]:
len(snakecase_embeddable_tokens) / len(all_tokens)

0.9365595770638471

In [44]:
embeddable_tokens = [
    t
    for t in all_tokens
    if all(
        token_part in fasttext_gensim_model.wv.vocab
        for token_part in tokenize_python(t)
    )
]
non_embeddable_tokens = [
    t
    for t in all_tokens
    if not all(
        token_part in fasttext_gensim_model.wv.vocab
        for token_part in tokenize_python(t)
    )
]

In [45]:
len(embeddable_tokens) / len(all_tokens)

0.9365595770638471

In [45]:
from github_search import parsing_imports

In [46]:
# export
from collections import namedtuple


Token = namedtuple("Token", ["name", "type"])


def _get_imports(file_content):
    root = ast.parse(file_content)

    for node in ast.iter_child_nodes(root):
        if isinstance(node, ast.Import):
            module = []
        elif isinstance(node, ast.ImportFrom) and not node.module is None:
            module = node.module.split(".")
        else:
            continue

        for n in node.names:
            yield Import(module, n.name.split("."), n.asname)

NameError: name 'exa' is not defined