In [1]:
import json
import pandas as pd
import tqdm
import numpy as np

In [2]:
%cd ..

/home/kuba/Projects/github_search


In [3]:
python_files_df = pd.read_csv('data/sample_python_files.csv')

In [4]:
python_files_df.head()

Unnamed: 0,owner,repo_name,file_path,content,sha
0,BoiseState,bookdata-tools,bookdata/__init__.py,import os\nimport sys\nfrom pathlib import Pat...,dea642f49fd7e0a90133627e4f96193ca6d9627e
1,BoiseState,bookdata-tools,bookdata/db.py,import os\nimport sys\nimport re\nimport time\...,d369c1dbd5f18ed14e8735033877cb07fd4839f2
2,BoiseState,bookdata-tools,bookdata/dvcpatch.py,"""""""\nSupport code for our custom DVC remote.\n...",c1305b94d20af17361b26505c9799a6c0652b882
3,BoiseState,bookdata-tools,bookdata/graph.py,"""""""\nUtiltiies for loading & working with the ...",f2ab933d0986671030e8343cb6fc8d6255bd04aa
4,BoiseState,bookdata-tools,bookdata/schema.py,"""""""\nData schema information for the book data...",16f3874b8f8748971303648b94e3517ebe924d3f


In [5]:
from github_search import parsing_imports

In [6]:
example_file_content = python_files_df.iloc[0]['content']
print(example_file_content)

import os
import sys
from pathlib import Path
import pathlib
import logging

_simple_format = logging.Formatter('{asctime} [{levelname:7s}] {name} {message}',
                                   datefmt='%Y-%m-%d %H:%M:%S',
                                   style='{')

_initialized = False

data_dir = Path('data')
tgt_dir = Path('target')
bin_dir = tgt_dir / 'release'
bdtool = bin_dir / 'bookdata'


def setup(debug=False):
    global _initialized
    ch = logging.StreamHandler(sys.stderr)
    ch.setLevel(logging.DEBUG if debug else logging.INFO)
    ch.setFormatter(_simple_format)

    root = logging.getLogger()
    root.addHandler(ch)
    root.setLevel(logging.INFO)

    logging.getLogger('dvc').setLevel(logging.ERROR)
    logging.getLogger('lenskit').setLevel(logging.DEBUG)
    logging.getLogger('').setLevel(logging.DEBUG)
    root.debug('log system configured')
    _initialized = True


def script_log(name, debug=False):
    """
    Initialize logging and get a logger for a script.


In [7]:
list(parsing_imports.get_modules(example_file_content))

['os', 'sys', 'pathlib', 'pathlib', 'logging']

In [8]:
def get_modules_list(file_content):
    """Join the names yielded by `parsing_imports.get_modules` into one string.

    The names are separated by single spaces, in the order they are yielded
    (duplicates are kept).
    """
    imported_modules = parsing_imports.get_modules(file_content)
    return ' '.join(imported_modules)

In [9]:
module_lists = []
# Files for which `get_modules` raises SyntaxError are skipped, but they are
# counted so the amount of dropped data is visible instead of silently lost.
n_unparsable_files = 0

for content in tqdm.tqdm(python_files_df['content'].dropna()):
    try:
        module_lists.append(list(parsing_imports.get_modules(content)))
    except SyntaxError:
        n_unparsable_files += 1

print('skipped {} files that raised SyntaxError'.format(n_unparsable_files))

100%|██████████| 34678/34678 [00:39<00:00, 884.89it/s] 


In [10]:
module_lists[:5]

[['os', 'sys', 'pathlib', 'pathlib', 'logging'],
 ['os',
  'sys',
  're',
  'time',
  'logging',
  'hashlib',
  'threading',
  'configparser',
  'pathlib',
  'contextlib',
  'datetime',
  'typing',
  'typing',
  'docopt',
  'natural',
  'pandas',
  'more_itertools',
  'psycopg2',
  'psycopg2',
  'psycopg2',
  'psycopg2',
  'sqlalchemy',
  'sqlparse',
  'git'],
 ['logging', 'urllib', 'hashlib', 'dvc', 'dvc', 'dvc'],
 ['logging', 'pandas', 'numpy', 'graph_tool', 'schema'],
 ['pandas']]

In [11]:
# One space-separated "document" of module names per entry of module_lists,
# which is the input format the vectorizer below tokenizes.
module_import_strings = list(map(' '.join, module_lists))

In [12]:
from sklearn import feature_extraction, decomposition

In [13]:
# binary=True records only whether a module occurs in a file (not how often);
# min_df=5 drops modules that occur in fewer than five files.
# NOTE(review): CountVectorizer lowercases tokens by default, so module names
# that differ only by case (e.g. `PIL` vs `pil`) are merged — confirm intended.
vectorizer = feature_extraction.text.CountVectorizer(min_df=5, binary=True)

In [14]:
occurrence_matrix = vectorizer.fit_transform(module_import_strings)

In [15]:
cooccurrence_matrix = occurrence_matrix.T @ occurrence_matrix

In [16]:
cooccurrence_matrix

<984x984 sparse matrix of type '<class 'numpy.int64'>'
	with 41196 stored elements in Compressed Sparse Column format>

In [17]:
# Fix the seed so the factorisation, and every similarity derived from it,
# is reproducible after Restart Kernel -> Run All.
# NOTE(review): `alpha` is deprecated in newer scikit-learn releases in favour
# of `alpha_W` / `alpha_H` — confirm against the installed version.
nmf = decomposition.NMF(n_components=50, alpha=0.01, random_state=0)

In [18]:
# `.toarray()` gives a plain ndarray; `.todense()` would return the legacy
# np.matrix type, which recent scikit-learn versions refuse as estimator input.
module_vectors = nmf.fit_transform(cooccurrence_matrix.toarray())
# Scale every row to unit L2 norm so that dot products between rows are cosine
# similarities; the small epsilon avoids dividing by zero for all-zero rows.
row_norms = np.linalg.norm(module_vectors, axis=1, keepdims=True)
module_vectors = module_vectors / (row_norms + 1e-12)

In [19]:
vectorizer.get_feature_names()

['__future__',
 '_base',
 '_binary',
 '_caffe',
 '_constants',
 '_ext',
 '_init_paths',
 '_pickle',
 '_version',
 'abc',
 'absl',
 'abstract_kernel',
 'abstract_transformation',
 'accuracy',
 'acquisition',
 'acquisition_functions',
 'action_detection',
 'activations',
 'adet',
 'agent',
 'agents',
 'airnet',
 'aix360',
 'albumentations',
 'alexnet',
 'allennlp',
 'amr',
 'anchor_generator',
 'anchor_head',
 'ann_app_utils',
 'antlr4',
 'anytime_models',
 'apex',
 'approver',
 'architecture',
 'architectures',
 'arcsim',
 'arg_parser',
 'argparse',
 'args',
 'arguments',
 'array',
 'assign_result',
 'assign_sampling',
 'ast',
 'astunparse',
 'asyncio',
 'atexit',
 'attacks',
 'attention',
 'attmodel',
 'attr',
 'augment_model',
 'augment_model_final',
 'augment_model_new',
 'augmentation_transforms',
 'automl',
 'backbone',
 'backbones',
 'base',
 'base64',
 'base_assigner',
 'base_dataset',
 'base_detector',
 'base_model',
 'base_options',
 'base_sampler',
 'base_sens',
 'base_trainer

In [20]:
# Column sums of the binary occurrence matrix: number of files using each module.
module_file_counts = np.array(occurrence_matrix.sum(axis=0))[0]
# Vocabulary indices of the ten most frequent modules, most frequent first.
top_module_idxs = module_file_counts.argsort()[::-1][:10]
modules = vectorizer.get_feature_names()

[modules[i] for i in top_module_idxs]

['numpy',
 'os',
 'tensorflow',
 'torch',
 '__future__',
 'sys',
 'collections',
 'utils',
 'math',
 'time']

In [21]:
example_module_idx = 5

In [22]:
modules[top_module_idxs[example_module_idx]], top_module_idxs[example_module_idx] 

('sys', 820)

In [23]:
# Pick the vector of the example module (index taken from the top-modules list).
example_module_vector = module_vectors[top_module_idxs[example_module_idx]]
# module_vectors rows were divided by their L2 norm earlier, so this dot product
# with every row is the cosine similarity to each module (all-zero rows give 0).
example_module_vector_similarities = example_module_vector @ module_vectors.T

In [24]:
most_similar_vector_idxs = example_module_vector_similarities.argsort()[::-1][:10]

In [25]:
for idx in most_similar_vector_idxs:
    print(modules[idx], round(example_module_vector_similarities[idx], 4))

sys 1.0
sphinx_rtd_theme 0.8645
version 0.8589
score 0.8528
surgery 0.8514
site 0.8055
pointnet_util 0.804
python_backend 0.7943
graphviz 0.7935
tf_util 0.7915


In [26]:
(module_vectors[878] @ module_vectors.T).argmax()

878

In [27]:
module_vectors[100]

array([0.        , 0.        , 0.        , 0.        , 0.67290559,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.11910143, 0.03476049, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.24431886, 0.        , 0.12185708, 0.        , 0.        ,
       0.        , 0.        , 0.05280244, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.61156985, 0.        , 0.        , 0.28365115, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

# Word2Vec format

Functions for writing a numpy matrix, together with its vocabulary, to a file in the Word2Vec text format.

This is useful because files in this format can be loaded with gensim's `KeyedVectors` class.

In [32]:
#export


def _word_vectors_to_word2vec_format_generator(vocabulary, word_vectors):
    """Yield one word2vec text-format line per (word, vector) pair.

    Each line is the word followed by the vector components, all separated by
    single spaces, with components formatted to five decimal places. Lines are
    yielded without a trailing newline.
    """
    for (word, vector) in zip(vocabulary, word_vectors):
        yield word + ' ' + ' '.join('{:.5f}'.format(f) for f in vector)


def store_word_vectors(words, word_vectors, file_name):
    """Write `word_vectors` to `file_name` in word2vec text format.

    The first line is the header "<number of words> <vector dimension>";
    every following line holds one word and its vector.

    Parameters
    ----------
    words : sequence of str
        Vocabulary; `words[i]` labels row `i` of `word_vectors`.
    word_vectors : 2-D array with a `.shape` attribute
        One row per word.
    file_name : str or path-like
        Destination file; it is overwritten if it already exists.

    Raises
    ------
    ValueError
        If the number of words differs from the number of rows, because the
        header would then disagree with the number of lines written.
    """
    if len(words) != word_vectors.shape[0]:
        raise ValueError(
            'got {} words but {} vectors'.format(len(words), word_vectors.shape[0])
        )
    # Explicit UTF-8 keeps the file independent of the platform default encoding.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(str(len(words)) + ' ' + str(word_vectors.shape[1]) + '\n')
        # Write the vectors passed in by the caller, not a notebook-level global.
        for line in _word_vectors_to_word2vec_format_generator(words, word_vectors):
            f.write(line + '\n')

In [33]:
store_word_vectors(modules, module_vectors, 'data/nmf_module_vectors.txt')

In [34]:
import gensim

module_keyed_vectors = gensim.models.KeyedVectors.load_word2vec_format('data/nmf_module_vectors.txt')

In [35]:
module_keyed_vectors.most_similar('torch')

  return (m / dist).astype(REAL)


[('assign_result', 0.8605263829231262),
 ('dsr_model', 0.8605263829231262),
 ('box_head', 0.8605263829231262),
 ('base_assigner', 0.8605263829231262),
 ('signatory', 0.86052006483078),
 ('scale', 0.8596498966217041),
 ('efficientnet_pytorch', 0.8589223027229309),
 ('basepifunet', 0.8559807538986206),
 ('base_sampler', 0.855023205280304),
 ('base_sens', 0.8546276688575745)]