In [1]:
#default_exp import_exploration

In [2]:
#export

import json
import pandas as pd
import tqdm
import numpy as np
import gensim

from github_search import parsing_imports
from sklearn import feature_extraction, decomposition

In [3]:
%cd ..

/home/kuba/Projects/github_search


In [4]:
%%time
# Load the scraped Python files; dropna() discards every row that has a
# missing value in any column.
python_files_df = pd.read_csv('data/python_files.csv').dropna()

CPU times: user 29.8 s, sys: 2.13 s, total: 32 s
Wall time: 32.8 s


In [5]:
python_files_df.head()

Unnamed: 0,owner,repo_name,file_path,content,sha
1,tensorflow,models,official/common/__init__.py,\n,8b137891791fe96927ad78e64b0aad7bded08bdc
2,tensorflow,models,official/common/distribute_utils.py,# Copyright 2018 The TensorFlow Authors. All R...,7ae8772840f52b7d5c0a492e13659ca58c3730ce
3,tensorflow,models,official/common/distribute_utils_test.py,# Copyright 2018 The TensorFlow Authors. All R...,124c1c6f1c529a559d3af644826de8904d2075b2
4,tensorflow,models,official/common/flags.py,# Lint as: python3\n# Copyright 2020 The Tenso...,ee86aca17b1e92a2aec476e70e23d90647e99ec9
5,tensorflow,models,official/common/registry_imports.py,# Copyright 2020 The TensorFlow Authors. All R...,021034568c4e10e4f46e59fd090b881fe8cd9d8b


In [6]:
python_files_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 888935 entries, 1 to 932381
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   owner      888935 non-null  object
 1   repo_name  888935 non-null  object
 2   file_path  888935 non-null  object
 3   content    888935 non-null  object
 4   sha        888935 non-null  object
dtypes: object(5)
memory usage: 40.7+ MB


In [10]:
# Use the second row by position as the running example: the first row's
# content is only a newline (see the head() output above).
example_file_content = python_files_df.iloc[1]['content']
print(example_file_content)

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper functions for running models in a distributed setting."""

import json
import os
import random
import string

from absl import logging
import tensorflow as tf


def _collective_communication(all_reduce_alg):
  """Return a CollectiveCommunication based on all_reduce_alg.

  Args:
    all_reduce_alg: a string specifying which collective communication to pick,
      or None.

  Retu

In [11]:
list(parsing_imports.get_modules(example_file_content))

['json', 'os', 'random', 'string', 'absl', 'tensorflow']

In [12]:
#export


def get_modules_string(modules):
    """Join the public module names into one space-separated string.

    Names that start with an underscore are treated as private and skipped.
    Empty names are skipped as well; indexing ``mod[0]`` on an empty string
    used to raise ``IndexError``.

    Args:
        modules: iterable of module-name strings.

    Returns:
        str: the remaining names joined by single spaces ('' if none remain).
    """
    public_modules = [mod for mod in modules if mod and not mod.startswith('_')]
    return ' '.join(public_modules)


def get_module_corpus(file_contents):
    """Build one import "document" per parsable source file.

    Every element of ``file_contents`` is passed to
    ``parsing_imports.get_modules``; the resulting module names are turned
    into a single string with ``get_modules_string``.

    Files whose parsing raises ``SyntaxError`` are dropped silently, so the
    returned list can be shorter than ``file_contents`` and its positions do
    not line up with the input.

    Args:
        file_contents: iterable of source-code strings.

    Returns:
        list of str: one space-separated module string per parsed file.
    """
    corpus = []
    for source in tqdm.tqdm(file_contents):
        try:
            imported = list(parsing_imports.get_modules(source))
        except SyntaxError:
            # Unparsable file: leave it out of the corpus.
            continue
        corpus.append(get_modules_string(imported))
    return corpus

In [13]:
module_import_corpus = get_module_corpus(python_files_df['content'].dropna())

100%|██████████| 888935/888935 [17:39<00:00, 838.66it/s]  


In [89]:
# binary=True records whether a module appears in a file rather than how often;
# max_features=5000 keeps only the 5000 most frequent tokens.
# NOTE(review): CountVectorizer's defaults lowercase tokens and ignore
# single-character tokens, so module names differing only by case are merged
# and one-letter names are dropped — confirm this is intended.
vectorizer = feature_extraction.text.CountVectorizer(max_features=5000,binary=True)

In [90]:
%%time
occurrence_matrix = vectorizer.fit_transform(module_import_corpus)

CPU times: user 3.98 s, sys: 526 µs, total: 3.99 s
Wall time: 3.98 s


In [91]:
occurrence_matrix.shape

(856442, 5000)

In [92]:
# Module-by-module co-occurrence counts: because the occurrence matrix is
# binary, entry (i, j) is the number of files importing both module i and
# module j, and the diagonal holds each module's own file count.
cooccurrence_matrix = occurrence_matrix.T @ occurrence_matrix

In [93]:
# Fix the seed so the factorisation (and every vector derived from it) is
# reproducible across kernel restarts.
# NOTE(review): `alpha` is deprecated in newer scikit-learn releases in favour
# of `alpha_W` / `alpha_H` — revisit this argument when upgrading.
nmf = decomposition.NMF(n_components=50, alpha=0.01, random_state=0)

In [94]:
%%time
# Densify with toarray() so NMF receives a plain ndarray; todense() returns an
# np.matrix, which newer scikit-learn releases refuse. The matrix is only
# 5000 x 5000, so holding it densely is cheap.
module_vectors = nmf.fit_transform(cooccurrence_matrix.toarray())

CPU times: user 2min 39s, sys: 1min 13s, total: 3min 52s
Wall time: 21.1 s


In [95]:
# Scale every module vector (row) to unit L2 norm; the 1e-12 term avoids a
# division by zero for rows that are entirely zero.
module_vectors = module_vectors / (np.linalg.norm(module_vectors, axis=1) + 1e-12)[:,np.newaxis]

In [96]:
vectorizer.get_feature_names()

['a00_common_functions',
 'a2_base_model',
 'a2c',
 'a2c_ppo_acktr',
 'a3c',
 'aapi',
 'abc',
 'abps',
 'abs_to_coco',
 'absl',
 'abstract',
 'abstract_feature',
 'abstract_game',
 'abstract_kernel',
 'abstract_transformation',
 'accountant',
 'accuracy',
 'acd',
 'acme',
 'action',
 'action_gap_rl',
 'actions',
 'activation',
 'activation_clustering',
 'activation_functions',
 'activations',
 'actor',
 'adabound',
 'adam',
 'adamod',
 'adamp',
 'adamw',
 'adapter',
 'adapters',
 'adaptive_avgmax_pool',
 'adaptive_input',
 'adaptive_softmax',
 'add_target_dataset',
 'adda',
 'addict',
 'ade',
 'ade20k',
 'adet',
 'adet_checkpoint',
 'adler',
 'admm',
 'adnc',
 'adopty',
 'advanced',
 'adversarial',
 'adversarial_attacks',
 'adversarial_evaluation',
 'adversarial_losses',
 'adversarial_perturbations',
 'adversarial_training',
 'advertorch',
 'ae',
 'aetools',
 'affine',
 'affine_grid',
 'agent',
 'agent_action_proto_pb2',
 'agent_info_proto_pb2',
 'agents',
 'aggregator',
 'aggregators'

In [97]:
# Rank modules by the number of files importing them and keep the ten most common.
module_file_counts = np.asarray(occurrence_matrix.sum(axis=0)).ravel()
top_module_idxs = module_file_counts.argsort()[::-1][:10]

# get_feature_names() is gone from newer scikit-learn releases, replaced by
# get_feature_names_out(); support both so this cell runs on either version.
if hasattr(vectorizer, 'get_feature_names_out'):
    modules = list(vectorizer.get_feature_names_out())
else:
    modules = vectorizer.get_feature_names()

[modules[i] for i in top_module_idxs]

['numpy',
 'os',
 'tensorflow',
 'torch',
 'sys',
 'argparse',
 'math',
 'time',
 'collections',
 'utils']

# Word2Vec format

Functions for writing a NumPy matrix with a specified vocabulary in Word2Vec text format.

This is useful because files in this format can be loaded with gensim's `KeyedVectors` class.

In [98]:
#export


def _word_vectors_to_word2vec_format_generator(vocabulary, word_vectors):
    """Yield one word2vec text-format line per (word, vector) pair.

    Each line is the word followed by its vector components, formatted with
    five decimal places and separated by single spaces. No trailing newline
    is included.

    Args:
        vocabulary: iterable of word strings.
        word_vectors: iterable of numeric vectors, paired with ``vocabulary``
            by position.
    """
    for word, vector in zip(vocabulary, word_vectors):
        yield word + ' ' + ' '.join('{:.5f}'.format(component) for component in vector)


def store_word_vectors(words, word_vectors, file_name):
    """Write ``word_vectors`` to ``file_name`` in word2vec text format.

    The first line holds "<number of words> <vector dimension>"; every
    following line holds one word and its vector.

    Args:
        words: sized sequence of words, one per row of ``word_vectors``.
        word_vectors: 2-D array exposing ``shape``; row ``i`` is the vector
            of ``words[i]``.
        file_name: path of the file to write (opened in 'w' mode).

    Raises:
        ValueError: if the number of words differs from the number of rows,
            since the header would then disagree with the lines written.
    """
    n_words = len(words)
    if word_vectors.shape[0] != n_words:
        raise ValueError(
            'got {} words but {} vectors'.format(n_words, word_vectors.shape[0])
        )
    # Explicit UTF-8 so the file does not depend on the platform's default encoding.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(str(n_words) + ' ' + str(word_vectors.shape[1]) + '\n')
        # Serialise the vectors passed in, not the notebook-level
        # `module_vectors` global the original referred to here.
        for line in _word_vectors_to_word2vec_format_generator(words, word_vectors):
            f.write(line + '\n')

In [99]:
store_word_vectors(modules, module_vectors, 'data/nmf_module_vectors.txt')

In [100]:
module_keyed_vectors = gensim.models.KeyedVectors.load_word2vec_format('data/nmf_module_vectors.txt')

In [101]:
module_keyed_vectors.most_similar('torch')

  return (m / dist).astype(REAL)


[('confidnet', 0.901949942111969),
 ('correlation_package', 0.8937965035438538),
 ('corenet', 0.8932680487632751),
 ('backbone', 0.8890403509140015),
 ('lnets', 0.8880048990249634),
 ('roi_heads', 0.8856840133666992),
 ('deepem', 0.8751028776168823),
 ('functions', 0.8749309182167053),
 ('slimcut', 0.8740584254264832),
 ('resblocks', 0.8708397150039673)]

In [102]:
module_keyed_vectors.most_similar('tensorflow')

[('planet', 0.936939001083374),
 ('tensor2tensor', 0.9350528120994568),
 ('fivo', 0.933452308177948),
 ('magenta', 0.9310630559921265),
 ('open_seq2seq', 0.9295987486839294),
 ('pointcnn', 0.9281453490257263),
 ('dragnn', 0.9206544756889343),
 ('task_adaptation', 0.9179682731628418),
 ('openseq2seq', 0.915902853012085),
 ('texar', 0.9105595946311951)]

In [103]:
module_keyed_vectors.most_similar('keras')

[('resnet50', 0.9902965426445007),
 ('crfnet', 0.9791890382766724),
 ('example_correctness_test_utils', 0.9787250757217407),
 ('dvrk', 0.9767580628395081),
 ('multi_sampler', 0.9744220972061157),
 ('fpn_network', 0.9725858569145203),
 ('pysts', 0.9711136817932129),
 ('other_utils', 0.9696723818778992),
 ('multi', 0.9658380150794983),
 ('mhp_loss', 0.9654721617698669)]

In [104]:
module_keyed_vectors.most_similar('sklearn')

[('test_base', 0.988355278968811),
 ('decisiontree', 0.9822568893432617),
 ('treeinterpreter', 0.9801574349403381),
 ('lale', 0.9794420599937439),
 ('modl', 0.9713461995124817),
 ('modal', 0.9607591032981873),
 ('tpot', 0.9551683068275452),
 ('tdparse', 0.9530482888221741),
 ('rllim', 0.9527280926704407),
 ('readdata', 0.9506838917732239)]

In [105]:
module_keyed_vectors.most_similar('os')

[('python_visual_mpc', 0.8991244435310364),
 ('simple_tokenizer', 0.8910014033317566),
 ('download_gdrive', 0.8768919110298157),
 ('awa_helper', 0.8731268644332886),
 ('netdef_slim', 0.8731244802474976),
 ('notebook_runner', 0.8731184601783752),
 ('register_coco', 0.8730090856552124),
 ('pycodestyle', 0.8702559471130371),
 ('fairseq_lr_scheduler', 0.8690776228904724),
 ('fairseq_optimizer', 0.8654027581214905)]

In [106]:
module_keyed_vectors.most_similar('sys')

[('pycorrector', 0.9400977492332458),
 ('pythainlp', 0.9391024112701416),
 ('qamodel', 0.9381730556488037),
 ('charsetprober', 0.9379682540893555),
 ('helperinclude', 0.9373955130577087),
 ('ansitowin32', 0.9370496869087219),
 ('new_tihtn_planner', 0.9369609951972961),
 ('blockworld', 0.9369609951972961),
 ('codingstatemachine', 0.9369609951972961),
 ('chardistribution', 0.9369609951972961)]