In [1]:
import json
import pandas as pd
import tqdm
import numpy as np

In [2]:
%cd ..

/home/kuba/Projects/github_search


In [3]:
python_files_df = pd.read_csv('data/sample_python_files.csv')

In [4]:
python_files_df.head()

Unnamed: 0,owner,repo_name,file_path,content,sha
0,BoiseState,bookdata-tools,bookdata/__init__.py,import os\nimport sys\nfrom pathlib import Pat...,dea642f49fd7e0a90133627e4f96193ca6d9627e
1,BoiseState,bookdata-tools,bookdata/db.py,import os\nimport sys\nimport re\nimport time\...,d369c1dbd5f18ed14e8735033877cb07fd4839f2
2,BoiseState,bookdata-tools,bookdata/dvcpatch.py,"""""""\nSupport code for our custom DVC remote.\n...",c1305b94d20af17361b26505c9799a6c0652b882
3,BoiseState,bookdata-tools,bookdata/graph.py,"""""""\nUtiltiies for loading & working with the ...",f2ab933d0986671030e8343cb6fc8d6255bd04aa
4,BoiseState,bookdata-tools,bookdata/schema.py,"""""""\nData schema information for the book data...",16f3874b8f8748971303648b94e3517ebe924d3f


In [5]:
from github_search import parsing_imports

In [6]:
example_file_content = python_files_df.iloc[0]['content']
print(example_file_content)

import os
import sys
from pathlib import Path
import pathlib
import logging

_simple_format = logging.Formatter('{asctime} [{levelname:7s}] {name} {message}',
                                   datefmt='%Y-%m-%d %H:%M:%S',
                                   style='{')

_initialized = False

data_dir = Path('data')
tgt_dir = Path('target')
bin_dir = tgt_dir / 'release'
bdtool = bin_dir / 'bookdata'


def setup(debug=False):
    global _initialized
    ch = logging.StreamHandler(sys.stderr)
    ch.setLevel(logging.DEBUG if debug else logging.INFO)
    ch.setFormatter(_simple_format)

    root = logging.getLogger()
    root.addHandler(ch)
    root.setLevel(logging.INFO)

    logging.getLogger('dvc').setLevel(logging.ERROR)
    logging.getLogger('lenskit').setLevel(logging.DEBUG)
    logging.getLogger('').setLevel(logging.DEBUG)
    root.debug('log system configured')
    _initialized = True


def script_log(name, debug=False):
    """
    Initialize logging and get a logger for a script.


In [7]:
list(parsing_imports.get_modules(example_file_content))

['os', 'sys', 'pathlib', 'pathlib', 'logging']

In [8]:
def get_modules_list(file_content):
    return ' '.join(list(parsing_imports.get_modules(file_content)))

In [9]:
module_lists = []


for content in tqdm.tqdm(python_files_df['content'].dropna()):
    try:
        module_lists.append(list(parsing_imports.get_modules(content)))
    except SyntaxError:
        pass

100%|██████████| 34678/34678 [00:39<00:00, 872.84it/s] 


In [10]:
module_lists[:5]

[['os', 'sys', 'pathlib', 'pathlib', 'logging'],
 ['os',
  'sys',
  're',
  'time',
  'logging',
  'hashlib',
  'threading',
  'configparser',
  'pathlib',
  'contextlib',
  'datetime',
  'typing',
  'typing',
  'docopt',
  'natural',
  'pandas',
  'more_itertools',
  'psycopg2',
  'psycopg2',
  'psycopg2',
  'psycopg2',
  'sqlalchemy',
  'sqlparse',
  'git'],
 ['logging', 'urllib', 'hashlib', 'dvc', 'dvc', 'dvc'],
 ['logging', 'pandas', 'numpy', 'graph_tool', 'schema'],
 ['pandas']]

In [11]:
module_import_strings = [
    ' '.join(modules)
    for modules in module_lists
]

In [12]:
from sklearn import feature_extraction, decomposition

In [13]:
vectorizer = feature_extraction.text.CountVectorizer(min_df=3)

In [14]:
occurrence_matrix = vectorizer.fit_transform(module_import_strings)

In [15]:
cooccurrence_matrix = occurrence_matrix.T @ occurrence_matrix

In [16]:
cooccurrence_matrix

<1657x1657 sparse matrix of type '<class 'numpy.int64'>'
	with 54755 stored elements in Compressed Sparse Column format>

In [17]:
nmf = decomposition.NMF(n_components=50)

In [78]:
module_vectors = nmf.fit_transform(cooccurrence_matrix)
module_vectors = module_vectors / (np.linalg.norm(module_vectors, axis=1) + 1e-12)[:,np.newaxis]

In [79]:
vectorizer.get_feature_names()

['__future__',
 '__version__',
 '_base',
 '_binary',
 '_caffe',
 '_constants',
 '_dynet',
 '_ext',
 '_init_paths',
 '_input_manipulations',
 '_pickle',
 '_policies',
 '_preprocessing',
 '_util',
 '_utils',
 '_version',
 'abc',
 'absl',
 'abstract_kernel',
 'abstract_transformation',
 'accuracy',
 'acol',
 'acquisition',
 'acquisition_function',
 'acquisition_functions',
 'action_detection',
 'activations',
 'activition',
 'adaptive_avgmax_pool',
 'addressing',
 'adet',
 'adversarial_losses',
 'agent',
 'agents',
 'ai_models',
 'airfoil_nuft',
 'airnet',
 'aix360',
 'albumentations',
 'alexnet',
 'align',
 'allennlp',
 'amr',
 'anchor',
 'anchor_generator',
 'anchor_head',
 'anchor_heads',
 'anchor_target',
 'ang',
 'angle',
 'ann_app_utils',
 'annotation',
 'antlr4',
 'anytime_models',
 'apex',
 'api',
 'approval_simulation_common',
 'approver',
 'approx_max_iou_assigner',
 'architecture',
 'architectures',
 'archs',
 'arcsim',
 'arg_parser',
 'argparse',
 'args',
 'args_gan',
 'argume

In [80]:
top_module_idxs = np.array(occurrence_matrix.sum(axis=0))[0].argsort()[::-1]
modules = vectorizer.get_feature_names()

[modules[i] for i in top_module_idxs]

['tensorflow',
 'torch',
 '__future__',
 'numpy',
 'os',
 'models',
 'utils',
 'project_settings',
 'common',
 'collections',
 'data_loaders',
 'sys',
 'object_detection',
 'math',
 'official',
 'time',
 'keras',
 'random',
 'argparse',
 'json',
 'pdb',
 'typing',
 'six',
 'mxnet',
 'chainer',
 'logging',
 'matplotlib',
 'allennlp',
 'nltk',
 'absl',
 'sklearn',
 'torchvision',
 're',
 'reagent',
 'gym',
 'scipy',
 'rlpyt',
 'functools',
 'tensorboard',
 'copy',
 'pil',
 'pickle',
 'simulator',
 'pylearn2',
 'cv2',
 'itertools',
 'shutil',
 'ray',
 'unittest',
 'chainerrl',
 'pandas',
 'tqdm',
 'maskrcnn_benchmark',
 'mmdet',
 'abc',
 'evaluation',
 'glob',
 'resnet',
 'util',
 'model',
 'datetime',
 'subprocess',
 'config',
 'tensorboardx',
 'google',
 'theano',
 'nets',
 'tempfile',
 'lib',
 'src',
 'threading',
 'datasets',
 'data',
 'tensorpack',
 'pretrain_classifier',
 'baselines',
 'gym_wmgds',
 'mmcv',
 'multiprocessing',
 'net',
 'pathlib',
 'gp_input_noise',
 'fragile',
 'pyt

In [90]:
example_module_idx = 1

In [82]:
modules[top_module_idxs[example_module_idx]], top_module_idxs[example_module_idx] 

('tensorflow', 1428)

In [91]:
example_module_vector = module_vectors[top_module_idxs[example_module_idx]]
example_module_vector_similarities = example_module_vector @ module_vectors.T

In [92]:
most_similar_vector_idxs = example_module_vector_similarities.argsort()[::-1][:10]

In [93]:
for idx in most_similar_vector_idxs:
    print(idx, modules[idx], round(example_module_vector_similarities[idx], 4))

1511 torch 1.0
304 da 0.9581
544 fpn 0.955
979 odeint_ext 0.9521
505 fc 0.9509
1517 torchdiffeq 0.9507
861 mlp 0.9485
820 mask_head 0.9483
71 assign_result 0.9481
104 base_assigner 0.9481


In [94]:
(module_vectors[878] @ module_vectors.T).argmax()

878

In [96]:
module_vectors[82]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [97]:
np.linalg.norm(example_module_vector)

0.9999999999999963

In [98]:
np.linalg.norm(module_vectors, axis=0)

array([ 4.9285245 ,  2.51998932, 10.06620588, 10.21702182,  6.99901937,
        5.29225722,  6.19729663,  2.47051739,  1.92942442,  3.00202667,
       15.16787264,  1.27572579,  2.61083607,  2.28482105,  1.54045392,
        1.72379295,  1.24017734,  1.23819416,  4.5681187 , 12.89764694,
        7.03096055,  8.29186618,  2.90427253,  6.005073  ,  2.28668638,
        4.40685399,  5.2138281 ,  5.1378346 ,  3.15023835,  4.6809019 ,
        2.21824681,  8.17977789,  8.55777327, 11.20357861,  2.13627984,
        1.94247277,  5.1244109 ,  8.55349797,  2.03115629,  1.73919568,
        3.16441762,  2.90181255,  4.54752257,  1.72331321,  3.81985411,
        1.66714192,  5.78389384,  3.32680571,  7.07231066,  3.97553443])