In [2]:
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz

In [3]:
# Extract top n words and full words from each file. Used for a stratified vocab and for documents to feed into tfidf
def extract_words(filename, N):
    print(f"Processing {filename}...")
    df = pd.read_csv(filename)
    # fill in missing values with an empty string
    df = df.fillna("")
    top_words = [word for word, count in Counter(" ".join(df["snippet"]).lower().split()).most_common(N)]
    words = df["snippet"].tolist()
    label = df["language"].tolist()
    return words, top_words, label

#files = ["snippets-bash.csv", "snippets-c.csv", "snippets-cpp.csv", "snippets-csv.csv", 
#         "snippets-dotfile.csv", "snippets-go.csv", "snippets-html.csv", "snippets-java.csv", "snippets-javascript.csv",
#         "snippets-json.csv"]
files = ["snippets-all.csv"]

# Loosely based off number of reserved keywords per language. Trying to upper bound to track most common keywords 
# and other common symbols in each language 
# https://stackoverflow.com/questions/4980766/reserved-keywords-count-by-programming-language
N = 200

vocabulary = []
snippets = []
labels = []
for file in files:
    snippet_list, vocab, label = extract_words(file, N)
    vocabulary += vocab
    snippets += snippet_list
    labels += label

vocabulary = list(set(vocabulary))

Processing snippets-all.csv...


In [4]:
print(snippets[:5])
print(vocabulary[:5])

['test/files/normalise.jpg.png\ntest/files/normalise-resized.jpg\npackage-lock.json\n/package.json\n*.mongodb\n', 'a computer network, with no transfer of a copy, is not conveying.\n\n  An interactive user interface displays "Appropriate Legal Notices"\nto the extent that it includes a convenient and prominently visible\nfeature that (1) displays an appropriate copyright notice, and (2)\n', 'the predecessor has it or can get it with reasonable efforts.\n\n  You may not impose any further restrictions on the exercise of the\nrights granted or affirmed under this License.  For example, you may\nnot impose a license fee, royalty, or other charge for exercise of\n', '  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY\nAPPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT\nHOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY\nOF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,\nTHE IMPLIED WARRANTIES OF MERCHANTABILI

In [5]:
vectorizer = TfidfVectorizer(vocabulary=vocabulary)
X = vectorizer.fit_transform(snippets)
# Lots of garbage in our features, e.g. '2', 'sherlock', '"",', 'carolina,south', '"仄仄平平仄，平平仄仄平"'
# Could be improved by creating smarter tokens, or engineering the vocabulary a bit more. 
print(vectorizer.get_feature_names())

[')', 'license.', 'new', 'function', 'will', 'your', 'test', 'in', '=>', 'license', '|', 'must', 'with', '0;', 'no', 'to', 'one', 'under', '})', '*/', '1', 'number', 'default', 'c', 'bool', 'source', '#define', '///', '<', 'end', 'name:', 'all', 'int', '2,', 'use', 'which', '0)', 'break;', '{', 'from', 'private', '/*', 'type:', 'when', 'or', '==', 'namespace', 'func', 'get', 'class', '()', 'should', '->', '+=', 'null,', 'we', '#include', '\\', 'false', 'typedef', 'see', 'the', '3', '?', 'list', 'msgid', '!=', 'null', 'software', 'export', '&', 'return', 'on', '/**', 'struct', '],', 'i', 'have', 'resolved', 'boolean', '||', 'for', 'nil', 'false;', 'a', 'may', 'data', 'char', '};', 'file', '0', '@param', 'without', 'err', 'is', '#:', 'version', 'at', '1,', 'value', 'an', 'const', 'and', 'set', '"author":', 'true;', 'true,', '},', '"type":', 'not', 'final', 'distributed', '"', 'void', '1;', ',', '@override', ':=', '&&', 'public', 'null)', 'can', 'code', 'module:', 'let', 'only', 'type', '

In [6]:
languages = list(set(labels))
encoding = { language: number for language, number in zip(languages, range(len(languages)))}
# One-hot encode the language
y = pd.get_dummies(map(lambda x: encoding[x], labels))
print(X)
print(y)

print(X.shape[0])
print(len(y))

  (0, 6)	1.0
  (1, 186)	0.44328038094254124
  (1, 160)	0.2605086464445217
  (1, 159)	0.22133967429620074
  (1, 140)	0.1744958356981105
  (1, 137)	0.2746496819198951
  (1, 109)	0.20601576540346922
  (1, 102)	0.37632548332860516
  (1, 100)	0.45693483173087457
  (1, 94)	0.17770093565570846
  (1, 61)	0.1448822384138086
  (1, 15)	0.16341492692682041
  (1, 14)	0.2540222823849807
  (1, 12)	0.21002905477423364
  (2, 181)	0.17987030893528658
  (2, 180)	0.31237124472408573
  (2, 177)	0.11086403613928053
  (2, 159)	0.28709925810907
  (2, 144)	0.15657807047140845
  (2, 140)	0.22633820679164887
  (2, 121)	0.16029909004883336
  (2, 109)	0.2672226458911143
  (2, 85)	0.34535837350404613
  (2, 81)	0.22386171309966488
  (2, 72)	0.14513948997563972
  :	:
  (4849923, 109)	0.41871786947058554
  (4849923, 89)	0.9081163723807725
  (4849925, 109)	0.9690864324161217
  (4849925, 49)	0.24672147556909907
  (4849926, 109)	0.9690864324161217
  (4849926, 49)	0.24672147556909907
  (4849928, 89)	1.0
  (4849931, 109)	0

In [7]:
# Save data in csv files
save_npz("X", X)
y.to_pickle("y.pcl")