In [1]:
import numpy as np
import glob
import csv
import re
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import TransformerMixin
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

## Reading in Data
Used glob to read a bunch of files with a similar path according to file extension. This allowed me to get just the languages I wanted in my data set. 

In [2]:
def read_prog_files(loc):
    files = glob.glob(loc, recursive=True)
    texts = []
    for file in files:
        with open(file) as f:
            texts.append(f.read())
    return texts

Read in each type of file and concatenated data into x and y lists.

In [3]:
file_extensions = ['gcc', 'c', 'csharp', 'sbcl', 'clojure', 'java', 'javascript', 'ocaml', 'perl', 'hack', 'php', 'python3', 'jruby', 'yarv', 'scala', 'racket']
ext_dict = {'jruby': 'ruby', 'csharp': 'c#', 'hack': 'php', 'sbcl': 'common lisp', 'ocaml': 'ocaml', 'python3': 'python', 'php': 'php', 'perl': 'perl', 'racket': 'scheme', 'c': 'c', 'javascript': 'javascript', 'gcc': 'c', 'yarv': 'ruby', 'java': 'java', 'clojure': 'clojure', 'scala': 'scala'}
X = []
y = []

for ext in file_extensions:
    x_texts = read_prog_files('/Users/kathrynjackson/Code/homework/programming-language-classifier/benchmarksgame-2014-08-31/benchmarksgame/bench/**/*.{}'.format(ext))
    X += x_texts
    y += (len(x_texts) * [ext_dict[ext]])
    
print(len(X), len(y))
len(set(y))

552 552


13

## Train-Test Split
Used the train_test_split method from sklearn to split data set into 60/40 for training and testing the classifier.

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, train_size=0.6, random_state=890)
len(X_train)

331

I wanted to includ the one '.c' file in my training data, not my test data, so I made sure the length of my training data was 16.

In [5]:
len(np.unique(y_train))

13

## Feature Extraction: Count Vectorizer
I used scikit-learn's count vectorizer to extract features from the data. I wanted words, white spaces, and puctuaion, but not numbers, which I assumed would be less language specific and more project specific. There are a lot of features in this model.

In [6]:
cv = CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]+')
cv.fit(X_train)
cv.transform(X_train)
print(len(cv.get_feature_names()))

5860


## Naive Bayes Classifier

In [7]:
baye_pipe = Pipeline([('vectorizer', CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]')),
                      ('classifier', MultinomialNB())])

baye_pipe.fit(X_train, y_train)
baye_pipe.named_steps['vectorizer'].transform(X_train)

baye_pipe.score(X_test, y_test)

0.92760180995475117

## Decision Tree Classifier

In [8]:
tree_pipe = Pipeline([('vectorizer', CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]')),
                      ('transformer', TfidfTransformer()),
                      ('classifier', DecisionTreeClassifier())])

tree_pipe.fit(X_train, y_train)
tree_pipe.named_steps['vectorizer'].transform(X_train)

print(tree_pipe.score(X_train, y_train))
print(tree_pipe.score(X_test, y_test))

1.0
0.93665158371


## Random Forest Classifier
This meta estimator performs worse than the single decision tree classifier.

In [9]:
forest_pipe = Pipeline([('vectorizer', CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]')),
#                       ('transformer', TfidfTransformer()),
                        ('classifier', RandomForestClassifier())])

forest_pipe.fit(X_train, y_train)
forest_pipe.named_steps['vectorizer'].transform(X_train)

print(forest_pipe.score(X_train, y_train))
print(forest_pipe.score(X_test, y_test))

1.0
0.963800904977


## Testing
I read in the test files using the same function used earlier. All three classifiers performed poorly on the test data.

In [10]:
testy_X = read_prog_files('/Users/kathrynjackson/Code/homework/assignments-master/week5/polyglot/test/*')
testy_y = []
with open('/Users/kathrynjackson/Code/homework/assignments-master/week5/polyglot/test.csv') as test_targets:
    lines = csv.reader(test_targets)
    for line in lines:
        testy_y.append(line[1])


In [11]:
print(tree_pipe.score(testy_X, testy_y))
print(tree_pipe.predict(testy_X))

0.03125
['common lisp' 'javascript' 'javascript' 'javascript' 'ruby' 'ruby' 'ruby'
 'java' 'javascript' 'php' 'common lisp' 'common lisp' 'common lisp'
 'common lisp' 'java' 'java' 'php' 'scala' 'php' 'php' 'php' 'php'
 'common lisp' 'php' 'ocaml' 'ocaml' 'common lisp' 'python' 'python'
 'common lisp' 'python' 'common lisp']


In [12]:
print(forest_pipe.score(testy_X, testy_y))
print(forest_pipe.predict(testy_X))

0.0625
['clojure' 'javascript' 'javascript' 'ruby' 'ruby' 'ruby' 'ruby' 'c'
 'python' 'ruby' 'javascript' 'python' 'python' 'scheme' 'c#' 'c#' 'ruby'
 'java' 'c' 'javascript' 'javascript' 'python' 'javascript' 'javascript'
 'ocaml' 'ruby' 'ruby' 'python' 'python' 'javascript' 'c#' 'javascript']


In [13]:
print(baye_pipe.score(testy_X, testy_y))
print(baye_pipe.predict(testy_X))

0.03125
['clojure' 'javascript' 'scala' 'scala' 'ruby' 'ruby' 'ruby' 'java' 'scala'
 'scala' 'scheme' 'clojure' 'scheme' 'scheme' 'c' 'c' 'scala' 'scala' 'php'
 'php' 'c' 'php' 'clojure' 'php' 'ocaml' 'ocaml' 'java' 'python' 'python'
 'python' 'python' 'javascript']


In [14]:
print(testy_y)

['clojure', 'clojure', 'clojure', 'clojure', 'python', 'python', 'python', 'python', 'javascript', 'javascript', 'javascript', 'javascript', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'scheme', 'scheme', 'scheme', 'java', 'java', 'scala', 'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml']


## Build Your Own
I used several methods to try to improve the classifier, but it's still not very good. First, I built a custom featurizer using the following functions. I ended up writing a function that takes a regular expression so that I could try different things quickly.

In [15]:
def caps_to_non(text):
    cap_letters = re.findall(r'[A-Z]', text)
    non_caps = re.findall(r'[a-z]', text)
    return len(cap_letters) / len(non_caps)
    
    
def percent_occurence_of_parenthesis(text):
    pars = re.findall(r'\(|\)', text)
    return len(pars) / len(text)


def percent_occurence_of_curly(text):
    curls = re.findall(r'\{|\}', text)
    return len(curls) / len(text)


def percent_occurence_of_space(text):
    spaces = re.findall(r'\s', text)
    return len(spaces) / len(text)

    
def occurence_of_this_pattern(reg_ex):
    
    def feature_fn(text):
        occ = re.findall(r'{}'.format(reg_ex), text)
        return len(occ)

    return feature_fn


class FunctionFeaturizer(TransformerMixin):
    def __init__(self, *featurizers):
        self.featurizers = featurizers
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        fvs = []
        for text in X:
            fv = [f(text) for f in self.featurizers]
            fvs.append(fv)
        return np.array(fvs)

After instantiating my featurizer class, I combined it with the CountVectorizer using the sklearn class FeatureUnion. I passed a regular expression to the count vectorizer that tokenizes words, all kinds of white space, and different punctuation. I am using the decision tree classifier, which scored highest in my previous trials, in addition to the linear support vector classifier. Using the TfidfTransformer seems to make the score worse.<br>
<br>
Without random state argurments, the resulting classifier predicts correctly anywhere from 3.5% to 15.6% of the time, but usually hits between 9% - 12.5%. I don't have any other ideas for how to improve it.

In [16]:
featurizer = FunctionFeaturizer(caps_to_non,
                                percent_occurence_of_parenthesis,
                                percent_occurence_of_curly,
                                percent_occurence_of_space,
                                occurence_of_this_pattern('&\w'),
                                occurence_of_this_pattern('\$\w'),
                                occurence_of_this_pattern('[A-Za-z]+[A-Z]'))

cv = CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]', lowercase=False)

feature_extractors = FeatureUnion([('my featurizer', featurizer), ('cv', cv)])

my_classifier = Pipeline([
                    ('featurizer', feature_extractors),
#                     ('transformer', TfidfTransformer()),
                    ('classifier', DecisionTreeClassifier(criterion='entropy', min_samples_split=1, random_state=1067)),
                    ('linsvc', LinearSVC(random_state=13)),
                    ])

my_classifier.fit(X_train, y_train)
my_classifier.named_steps['featurizer'].transform(X_train)
print(my_classifier.score(X_test, y_test))

print(my_classifier.score(testy_X, testy_y))
print(my_classifier.predict(testy_X))



0.954751131222
0.125
['clojure' 'javascript' 'scala' 'ruby' 'ruby' 'ruby' 'javascript'
 'common lisp' 'javascript' 'javascript' 'javascript' 'scala' 'javascript'
 'ocaml' 'javascript' 'javascript' 'scala' 'scala' 'php' 'ruby' 'php'
 'ruby' 'ruby' 'php' 'ocaml' 'javascript' 'javascript' 'javascript' 'ruby'
 'javascript' 'ruby' 'javascript']




In [17]:
print(testy_y)

['clojure', 'clojure', 'clojure', 'clojure', 'python', 'python', 'python', 'python', 'javascript', 'javascript', 'javascript', 'javascript', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'scheme', 'scheme', 'scheme', 'java', 'java', 'scala', 'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml']


In [18]:
print(classification_report(my_classifier.predict(testy_X), testy_y))
print(confusion_matrix(testy_y, my_classifier.predict(testy_X)))

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


             precision    recall  f1-score   support

    clojure       0.25      1.00      0.40         1
common lisp       0.00      0.00      0.00         1
    haskell       0.00      0.00      0.00         0
       java       0.00      0.00      0.00         0
 javascript       0.75      0.23      0.35        13
      ocaml       0.00      0.00      0.00         2
        php       0.00      0.00      0.00         3
     python       0.00      0.00      0.00         0
       ruby       0.00      0.00      0.00         8
      scala       0.00      0.00      0.00         4
     scheme       0.00      0.00      0.00         0
        tcl       0.00      0.00      0.00         0

avg / total       0.31      0.12      0.16        32

[[1 0 0 0 1 0 0 0 1 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 2 0 0]
 [0 0 0 0 0 0 0 0 2 0 0 0]
 [0 0 0 0 3 0 0 0 0 1 0 0]
 [0 0 0 0 1 0 0 0 1 0 0 0]
 [0 0 0 0 2 0 0 0 1 0 0 0]
 [0 1 0 0 1 0 0 0 2 0 0 0]
 [0 0 0 0 2 1 0 0 0 0 0 0]
 [0 0 0 0 0 



The commented code is full of unsuccessful attempts to modify the CountVectorizer. I tried passing my own specific vocabulary. I tried different regular expressions. I tried analizing 2 and 3 character n-grams rather than words. None of these methods made a significant difference.

In [19]:
# my_vocab = ['function', '{', '}', '\n', '\t', ':', ';', 'def', ',', '->',
#             '(', ')', 'call', 'lambda', 'set', '@', '>', '<', '.', '[',
#             ']', 'var', 'elif', 'else', 'else if', 'then', 'in',
#             'switch', 'IfTrue', 'IfFalse', 'unless', 'not', 'elsif',
#             'given', 'end', 'match', '(if', '(otherwise', 'progn', 'begin',
#             'cond', 'then begin', 'with', 'when', 'foreach', 'for each',
#             'for_each', 'for (', '$i++', '$i', '$', 'do', 'until', 'loop',
#             'let loop', 'for-each', 'done', '.iter', 'catch', 'except',
#             'longjmp', 'setjmp', 'finally', 'throw', 'die', 'eval', '$@',
#             'rescue', 'ensure', 'handler-', 'check-', 'guard', 'try:',
#             'catchError', 'last', 'break', 'return-from',
#             'loop-finish', 'go', 'goto', 'next', 'func', 'void', 'int main',
#             'main', 'public', 'defun', 'setf', 'define', '&', '*', '/',
#             'require', ' = ', 'import', '__init__']
# cv = CountVectorizer(vocabulary=my_vocab)

# cv = CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]', lowercase=False)
# cv = CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|[^\w\d\s]+')

# cv = CountVectorizer(analyzer='char', ngram_range=(2,3))

# cv = CountVectorizer(lowercase=False)


In [20]:
def language_guesser(snippet):
    return my_classifier.predict([snippet])

In [21]:
language_guesser('''def an_imaginary_function:\n    return dict = {'a': 'B'}''')



array(['javascript'], 
      dtype='<U11')