In [11]:
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
# Extract top n words and full words from each file. Used for a stratified vocab and for documents to feed into tfidf
def extract_words(filename, N):
    print(f"Processing {filename}...")
    df = pd.read_csv(filename)
    # fill in missing values with an empty string
    df = df.fillna("")
    top_words = [word for word, count in Counter(" ".join(df["snippet"]).lower().split()).most_common(N)]
    words = df["snippet"].tolist()
    label = df["language"].tolist()
    return words, top_words, label

files = ["snippets-bash.csv", "snippets-c.csv", "snippets-cpp.csv", "snippets-csv.csv", 
         "snippets-dotfile.csv", "snippets-go.csv", "snippets-html.csv", "snippets-java.csv", "snippets-javascript.csv",
         "snippets-json.csv"]

# Loosely based off number of reserved keywords per language. Trying to upper bound to track most common keywords 
# and other common symbols in each language 
# https://stackoverflow.com/questions/4980766/reserved-keywords-count-by-programming-language
N = 200

vocabulary = []
snippets = []
labels = []
for file in files:
    snippet_list, vocab, label = extract_words(file, N)
    vocabulary += vocab
    snippets += snippet_list
    labels += label

vocabulary = list(set(vocabulary))

Processing snippets-bash.csv...
Processing snippets-c.csv...
Processing snippets-cpp.csv...
Processing snippets-csv.csv...
Processing snippets-dotfile.csv...
Processing snippets-go.csv...
Processing snippets-html.csv...
Processing snippets-java.csv...
Processing snippets-javascript.csv...
Processing snippets-json.csv...


In [39]:
print(snippets[:5])
print(vocabulary[:5])

['if [ "x${ghprbPullId}" == "x" ]\nthen\n\texit 1\nfi\n\n', 'rm -f ../arduino-*.tar.xz\nrm -f ../arduino-*.zip\n\nant -Djava.net.preferIPv4Stack=true -Dplatform=linux32 $@ clean dist\nmv linux/arduino-*-linux32.tar.xz ../\n', '        COMPREPLY=($(compgen -W "pre un acc accepted unaccepted rej rejected all" -- ${cur}))\n        return 0\n     ;;\n     --accept-all)\n        return 0\n', '          --out=pprint --out=yaml --out=overstatestage --out=json --out=raw \\\n          --out=highstate --out=key --out=txt --no-color --out-indent= "\n    if [ ${COMP_CWORD} -gt 2 ]; then\n        pprev="${COMP_WORDS[COMP_CWORD-2]}"\n    fi\n', '    fi\n    if [ "${prev}" = "=" ] && [[ ${pprev} == --* ]]; then\n       prev="${pprev}"\n    fi\n\n']
['root', 'apilevel-"', 'boolean', 'of', '[],']


In [40]:
vectorizer = TfidfVectorizer(vocabulary=vocabulary)
X = vectorizer.fit_transform(snippets)
# Lots of garbage in our features, e.g. '2', 'sherlock', '"",', 'carolina,south', '"仄仄平平仄，平平仄仄平"'
# Could be improved by creating smarter tokens, or engineering the vocabulary a bit more. 
print(vectorizer.get_feature_names())

['root',
 'apilevel-"',
 'boolean',
 'of',
 '[],',
 '</a>',
 "edit.\\n'",
 '2',
 '<script>',
 'linguist-generated=true',
 '0x80',
 'runquery("(i',
 'void',
 'width:',
 'tr',
 '"readonly":',
 '<p',
 'background-color:',
 '<tr',
 '|',
 'castles',
 'compreply=(',
 'sherlock',
 '"dark_plus":',
 'compiled',
 '16,',
 'few',
 'type="text/css"',
 'use',
 'static',
 '2,',
 'foo/baz/qux,',
 'except',
 '/>',
 'imprisonment;',
 '1)',
 "don't",
 'entropy',
 'gradle',
 '"",',
 'dependencies',
 '</span><span',
 'github,',
 'favorably,somewhat',
 'std::cout',
 '=',
 'nullptr;',
 'then',
 '</div>',
 '@returns',
 '(const',
 'config',
 '<li',
 'notice',
 'apilevel-">',
 '$fname',
 '0x65,',
 '"images":',
 '"string",',
 '0.000,',
 'yarn-error.log*',
 'goto',
 '1b',
 '__gitcomp',
 'case',
 'generating',
 'should',
 'charset="utf-8">',
 'make',
 'carolina,south',
 'uint32',
 'class="el"',
 'public',
 '"^2.0.0"',
 'windows',
 'be',
 'into',
 '*.log',
 '"identifier",',
 '@test',
 '#',
 'check',
 '<input',
 '"g

In [None]:
languages = list(set(labels))
encoding = { language: number for language, number in zip(languages, range(len(languages)))}