In [1]:
import numpy as np
import os
import re

from collections import defaultdict

## Programming Language Identification
Uses a character-level convolutional-recurrent model (a 1D convolution followed by an LSTM) to identify the programming language of individual lines of code.

In [9]:
# Config

# Integer IDs for the supported languages; used as keys of the tables below.
PYTHON = 0
JAVA = 1
C = 2
CPP = 3

# Languages to use for the identification task.
# The position of a language in this list is its index in the label vectors
# and in the model's output.
# Note that Python vs. C++ is a fairly easy task.
LANGS_TO_USE = [PYTHON, CPP]

# Root directory of a repo containing large amounts of code for each language
# (searched recursively)
CODE_DIRS = {
    PYTHON: "./data/django",
    JAVA: "./data/ohc",
    C: "./data/freebsd",
    CPP: "./data/caffe"
}

# Filename extensions (without the leading dot) for each language
EXTS = {
    PYTHON: {"py"},
    JAVA: {"java"},
    C: {"c", "h"},
    CPP: {"cpp", "cc"}
}

# Maximum number of characters of source text to load for each language
N_CHARS = {
    PYTHON: 1000000,
    JAVA: 1000000,
    C: 1000000,
    CPP: 1000000
}

# Full names of languages; used for printing
LANG_NAMES = {
    PYTHON: "Python",
    JAVA: "Java",
    C: "C",
    CPP: "C++"
}

MAX_LINES = 20000  # Max non-empty lines used PER language (the counter restarts for each one)
LENGTH_PERCENTILE = 95  # Percentile of the line lengths to use as the max line length

In [10]:
def getAllCodeByExt(path, extensions, maxChars=float("inf")):
    """Given a set of extensions, returns a string consisting of
    at most maxChars characters from the concatenation of all files
    under the given path with any of the given extensions.

    path: directory to search; subdirectories are searched recursively.
    extensions: collection of extensions without the leading dot, e.g. {"py"}.
    maxChars: character budget; the default (infinity) means "no limit".

    Directory entries are visited in sorted order so that the same text is
    returned on every run. The remaining budget is passed down to each
    subdirectory, so traversal stops as soon as enough text has been
    collected instead of reading a whole subtree and truncating afterwards.
    """
    pieces = []
    nCollected = 0
    for name in sorted(os.listdir(path)):
        if nCollected >= maxChars:
            break  # budget exhausted; don't touch any more files
        newPath = os.path.join(path, name)
        if os.path.isfile(newPath):
            _, ext = os.path.splitext(newPath)
            if ext[1:] not in extensions:
                continue
            with open(newPath) as f:
                piece = f.read()
        elif os.path.isdir(newPath):
            piece = getAllCodeByExt(newPath, extensions, maxChars - nCollected)
        else:
            # Broken symlinks, sockets, etc. are neither files nor directories
            continue
        pieces.append(piece)
        nCollected += len(piece)
    # Join once at the end rather than growing a string inside the loop
    result = "".join(pieces)
    if len(result) > maxChars:
        # Never reached with the default infinite budget, so the float is never
        # used as a slice bound
        result = result[:maxChars]
    return result

# Code preprocessors

def preprocessPythonCode(code):
    """Normalizes Python source text with a series of regex substitutions.

    In order: trailing whitespace and blank lines are removed, triple-quoted
    blocks become "<comment>", runs of "#" comments become "<comment>",
    remaining string literals become "<string>", and every digit becomes "#".

    This is a regex heuristic, not a tokenizer: e.g. a "#" inside a string
    literal is still treated as the start of a comment.
    """
    # "\s" also matches newlines, so this both strips trailing whitespace and
    # collapses consecutive blank lines
    code = re.sub(r"\s*\n", "\n", code)
    # Either triple-quote style; ".*?" (not ".+?") so an empty block is one match
    code = re.sub(r"\"\"\".*?\"\"\"|\'\'\'.*?\'\'\'", "<comment>", code, flags=re.DOTALL)
    # ".*?" so that a bare "#" line counts as a comment too
    code = re.sub(r"(\#.*?\n)+", "<comment>\n", code)
    # Single-line literals; allows empty strings and backslash-escaped quotes so
    # that quotes are paired correctly
    code = re.sub(r"\"(?:\\.|[^\"\\\n])*\"|\'(?:\\.|[^\'\\\n])*\'", "<string>", code)
    code = re.sub(r"\d", "#", code)
    return code

def preprocessCLikeCode(code):
    """Normalizes C-like (C, C++, Java) source text with regex substitutions.

    In order: trailing whitespace and blank lines are removed, block comments
    and line comments become "<comment>", "#..." lines become "<macro>",
    string literals become "<string>", character literals become "<char>",
    and every digit becomes "#".

    This is a regex heuristic, not a tokenizer: e.g. "//" inside a string
    literal is still treated as the start of a comment.
    """
    # "\s" also matches newlines, so this both strips trailing whitespace and
    # collapses consecutive blank lines
    code = re.sub(r"\s*\n", "\n", code)
    # ".*?" (not ".+?") so that "/**/" is one complete comment instead of
    # swallowing everything up to the next "*/"
    code = re.sub(r"/\*.*?\*/", "<comment>", code, flags=re.DOTALL)
    code = re.sub(r"//.*?\n", "<comment>\n", code)
    code = re.sub(r"\#.*?\n", "<macro>\n", code)
    # Single-line literals; allows empty strings and backslash-escaped quotes
    code = re.sub(r"\"(?:\\.|[^\"\\\n])*\"", "<string>", code)
    code = re.sub(r"\'(?:\\.|[^\'\\\n])+\'", "<char>", code)
    code = re.sub(r"\d", "#", code)
    return code

# Maps each language ID to the function that normalizes its source text.
# Java, C and C++ all share the C-like preprocessor.
preprocessors = dict.fromkeys((JAVA, C, CPP), preprocessCLikeCode)
preprocessors[PYTHON] = preprocessPythonCode

# Code retrieval

def getCode(language, preprocess=False):
    """Loads the source text configured for a language.

    Reads up to N_CHARS[language] characters from the files under
    CODE_DIRS[language] whose extension is in EXTS[language]. When
    preprocess is true, the text is passed through that language's entry in
    `preprocessors` before being returned; otherwise it is returned as read.
    """
    rawCode = getAllCodeByExt(CODE_DIRS[language], EXTS[language], N_CHARS[language])
    if not preprocess:
        return rawCode
    return preprocessors[language](rawCode)

# Data processing

def lineToOneHot(line, indexing):
    """Given a line of text and an indexing from characters to indices,
    returns a sequence of one-hot vectors encoding the line.

    The result is an integer array of shape (len(line), len(indexing)); row i
    has a 1 in column indexing[line[i]]. The width is taken from the indexing
    itself rather than from a notebook-level global, so the function works
    regardless of cell execution order. This assumes the indices are exactly
    0 .. len(indexing) - 1, which is how the notebook builds its indexing.

    Characters missing from the indexing (e.g. symbols never seen in the
    training codebases) are encoded as an all-zero row instead of raising
    KeyError. An empty line yields an array of shape (0, len(indexing)), which
    is still two-dimensional and can therefore be padded by cropToLength.
    """
    oneHot = np.zeros((len(line), len(indexing)), dtype=int)
    for pos, c in enumerate(line):
        idx = indexing.get(c)
        if idx is not None:
            oneHot[pos, idx] = 1
    return oneHot

def oneHotToLine(oneHot, invIndexing):
    """Decodes a sequence of one-hot vectors back into text.

    For every row of oneHot, the position of its largest entry is looked up
    in invIndexing (index -> character); the characters are joined in row
    order and returned as a single string.
    """
    hotPositions = np.argmax(oneHot, axis=1)
    chars = (invIndexing[pos] for pos in hotPositions)
    return "".join(chars)

def cropToLength(seq, length):
    """Returns seq with exactly `length` rows.

    Longer sequences are cut down to their first `length` rows; shorter ones
    get rows of zeros appended at the end. A sequence that already has
    `length` rows is returned unchanged.
    """
    nRows = seq.shape[0]
    if nRows > length:
        return seq[:length, :]
    if nRows < length:
        # Pad only at the end of the first axis; the second axis is untouched
        padding = [(0, length - nRows), (0, 0)]
        return np.pad(seq, padding, "constant")
    return seq

# Model utilities
    
def queryModel(model, line, indexing):
    """Given a Keras model, a line, and a char-index indexing, determine the language
    scores for the line.

    The line is one-hot encoded, cropped or zero-padded to the notebook-level
    `targetLength` (which must already be defined when this is called), and
    passed to model.predict as a batch of one.

    Returns a list of (language name, score) pairs, one per entry of
    LANGS_TO_USE, sorted from highest to lowest score.
    """
    seq = cropToLength(lineToOneHot(line, indexing), targetLength)
    output = model.predict(np.array([seq]))[0]
    scores = [(LANG_NAMES[lang], output[idx]) for idx, lang in enumerate(LANGS_TO_USE)]
    # Index into the pair instead of unpacking it in the lambda signature:
    # tuple parameters are a syntax error on Python 3
    sortedScores = sorted(scores, key=lambda pair: -pair[1])
    return sortedScores

In [11]:
# Set up and analyze codebases

# Set preprocess=True in getCode to preprocess the code
codeBases = {lang: getCode(lang) for lang in LANGS_TO_USE}
nLangs = len(LANGS_TO_USE)

# Every distinct character seen in any codebase. set().union(...) always
# yields a set, even when only one language is configured (a reduce() without
# an initializer would hand back that language's raw text instead).
allChars = set().union(*codeBases.values())
nChars = len(allChars)
print("Using %d chars." % nChars)

# Sort the characters so that the char -> index mapping is identical on every run
indexing = {c: idx for idx, c in enumerate(sorted(allChars))}
invIndexing = {idx: c for c, idx in indexing.items()}

# Construct data arrays

XList, yList = [], []
for idx, lang in enumerate(LANGS_TO_USE):
    # One-hot label: a 1 at this language's position in LANGS_TO_USE
    labelVec = np.array([int(i == idx) for i in range(nLangs)])
    text = codeBases[lang]
    nLines = 0  # restarted for every language, so MAX_LINES is a per-language cap
    for line in text.split("\n"):
        lineVec = lineToOneHot(line, indexing)
        if lineVec.size > 0:  # skip empty lines
            XList.append(lineVec)
            yList.append(labelVec)
            nLines += 1
        if nLines == MAX_LINES:
            break
    print("Using %d lines of %s." % (nLines, LANG_NAMES[lang]))

print("Total: %d lines from %d codebases." % (len(XList), nLangs))

# Crop/pad every line to one common length so the samples stack into one array
lengths = [vec.shape[0] for vec in XList]
targetLength = int(np.percentile(lengths, LENGTH_PERCENTILE))
print("Using %dth percentile line length = %d" % (LENGTH_PERCENTILE, targetLength))
XList = [cropToLength(vec, targetLength) for vec in XList]

X = np.array(XList)
y = np.array(yList)

Using 185 chars.
Using 20000 lines of Python.
Using 20000 lines of C++.
Total: 40000 lines from 2 codebases.
Using 95th percentile line length = 77


In [14]:
# Model setup, training, and eval

from keras.models import Sequential
from keras.layers import Dense, Convolution1D, MaxPooling1D, LSTM, Dropout
from keras.optimizers import Adam, RMSprop

# Layers are added one at a time, in the order the data flows through them.
model = Sequential()

# Convolution over windows of 7 characters, meant to pick up
# language-specific character sequences such as "class" or "import".
model.add(Convolution1D(64, 7, input_shape=(targetLength, nChars), activation="relu"))
model.add(MaxPooling1D(pool_length=7))
model.add(Dropout(0.5))

# A single-direction LSTM over the pooled features. The original author chose
# not to make it bidirectional, on the expectation that character ordering
# carries no semantic value for this task.
model.add(LSTM(256))
model.add(Dropout(0.5))

# Classifier head: one softmax output per language in LANGS_TO_USE
model.add(Dense(128, activation="relu"))
model.add(Dense(nLangs, activation="softmax"))

optimizer = Adam()
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
# 10% of the samples are held out for validation
model.fit(X, y, nb_epoch=3, validation_split=0.1)

Train on 36000 samples, validate on 4000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x231ad7950>

In [21]:
# Prints lines in a file, along with the model's predicted language and confidence level.

testCodePath = "./data/test_code/pythonTest.py"

# Use a context manager so the file handle is closed once the text is read
with open(testCodePath) as f:
    testCode = f.read()
testLines = testCode.split("\n")
# Width of the longest line; used to left-align the predictions in one column
maxLength = max(len(line) for line in testLines)
for line in testLines:
    if len(line) == 0:
        # Empty lines are not scored; print a blank line to keep the layout
        print("")
    else:
        scores = queryModel(model, line, indexing)
        bestLang, bestScore = scores[0]  # scores are sorted best-first
        print("%-*s%s with %.02f confidence" % (maxLength, line, bestLang, 100 * bestScore))

#!/usr/bin/env python                                                                   Python with 99.94 confidence
"""                                                                                     Python with 99.91 confidence
classify.py is an out-of-the-box image classifer callable from the command line.        Python with 99.98 confidence

By default it configures and runs the Caffe reference ImageNet model.                   Python with 99.82 confidence
"""                                                                                     Python with 99.91 confidence
import numpy as np                                                                      Python with 99.97 confidence
import os                                                                               Python with 99.99 confidence
import sys                                                                              Python with 99.99 confidence
import argparse                                                