In [1]:
import numpy as np
import glob
import csv
import re
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import TransformerMixin
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

## Reading in Data
Used glob to read a bunch of files with a similar path according to file extension. This allowed me to get just the languages I wanted in my data set. 

In [2]:
def read_prog_files(loc):
    files = glob.glob(loc, recursive=True)
    texts = []
    for file in files:
        with open(file) as f:
            texts.append(f.read())
    return texts

Read in each type of file and concatenated data into x and y lists.

In [17]:
file_extensions = ['gcc', 'c', 'csharp', 'sbcl', 'clojure', 'java', 'javascript', 'ocaml', 'perl', 'hack', 'php', 'python3', 'jruby', 'yarv', 'scala', 'racket']
language_by_extension = {
    'jruby': 'ruby',
    'csharp': 'c#',
    'hack': 'php',
    'sbcl': 'common lisp',
    'ocaml': 'ocaml',
    'python3': 'python',
    'php': 'php',
    'perl': 'perl',
    'racket': 'scheme',
    'c': 'c',
    'javascript': 'javascript',
    'gcc': 'c',
    'yarv': 'ruby',
    'java': 'java',
    'clojure': 'clojure',
    'scala': 'scala'
}
X = []
y = []

for ext in file_extensions:
    x_texts = read_prog_files('/Users/kathrynjackson/Code/iron-yard/homework/programming-language-classifier/benchmarksgame-2014-08-31/benchmarksgame/bench/**/*.{}'.format(ext))
    X += x_texts
    y += (len(x_texts) * [ext_dict[ext]])
    print("{} texts with file extension {}: {}".format(language_by_extension[ext], ext, len(x_texts)))

print("\n")
print("number of texts", len(X))
print("number of targets", len(y))
print("number of potential targets", len(set(y)))

c texts with file extension gcc: 58
c texts with file extension c: 1
c# texts with file extension csharp: 41
common lisp texts with file extension sbcl: 34
clojure texts with file extension clojure: 38
java texts with file extension java: 51
javascript texts with file extension javascript: 25
ocaml texts with file extension ocaml: 34
perl texts with file extension perl: 34
php texts with file extension hack: 26
php texts with file extension php: 29
python texts with file extension python3: 36
ruby texts with file extension jruby: 34
ruby texts with file extension yarv: 39
scala texts with file extension scala: 43
scheme texts with file extension racket: 29


number of texts 552
number of targets 552
number of potential targets 13


## Train-Test Split
Used the train_test_split method from sklearn to split data set into 60/40 for training and testing the classifier.

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, train_size=0.6, random_state=890)

## Feature Extraction: Count Vectorizer
I used scikit-learn's count vectorizer to extract features from the data. I wanted words, white spaces, and puctuaion, but not numbers, which I assumed would be less language specific and more project specific. There are a lot of features in this model. I trained several models with this set of features. Each model's score represents its mean accuracy.

In [44]:
cv = CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]+')
cv.fit(X_train)
cv.transform(X_train)
print("number of features: ", len(cv.get_feature_names()))

number of features:  5860


### Naive Bayes Classifier

In [49]:
baye_pipe = Pipeline([('vectorizer', CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]')),
                      ('classifier', MultinomialNB())])

baye_pipe.fit(X_train, y_train)
baye_pipe.named_steps['vectorizer'].transform(X_train)

score = baye_pipe.score(X_test, y_test)
print("mean accuracy: ", score)

mean accuracy:  0.927601809955


### Decision Tree Classifier
The accuracy of this model varies, but stays around 90%.

In [58]:
tree_pipe = Pipeline([('vectorizer', CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]')),
                      ('transformer', TfidfTransformer()),
                      ('classifier', DecisionTreeClassifier())])

tree_pipe.fit(X_train, y_train)
tree_pipe.named_steps['vectorizer'].transform(X_train)

score = tree_pipe.score(X_test, y_test)
print("mean accuracy: ", score)

mean accuracy:  0.89592760181


### Random Forest Classifier
This meta estimator is the most accurate I tested.

In [61]:
forest_pipe = Pipeline([('vectorizer', CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]')),
#                       ('transformer', TfidfTransformer()),
                        ('classifier', RandomForestClassifier())])

forest_pipe.fit(X_train, y_train)
forest_pipe.named_steps['vectorizer'].transform(X_train)

score = forest_pipe.score(X_test, y_test)
print("mean accuracy: ", score)

mean accuracy:  0.972850678733


## Testing
I read in the test files using the same function used earlier. All three classifiers performed poorly on the test data. The model that performed the best in training performed the worst with the testing data.

In [87]:
# test files stored in test_X
test_X = []
test_files = list(range(1,33))

for file in test_files:
    test = read_prog_files('/Users/kathrynjackson/Code/iron-yard/homework/assignments-master/week5/polyglot/test/{}'.format(file))
    test_X += test

# test targets stored in test_y
test_y = []
with open('/Users/kathrynjackson/Code/iron-yard/homework/assignments-master/week5/polyglot/test.csv') as test_targets:
    lines = csv.reader(test_targets)
    for line in lines:
        test_y.append(line[1])

### Test: Decision Tree 

In [88]:
score = tree_pipe.score(testy_X, testy_y)
prediction = tree_pipe.predict(testy_X)

print("mean accuracy: ", score, "\n")
print("PREDICTION :: ACTUAL")
for pair in zip(prediction, test_y):
    print("{} :: {}".format(pair[0], pair [1]))

mean accuracy:  0.5625 

PREDICTION :: ACTUAL
scheme :: clojure
javascript :: clojure
javascript :: clojure
scheme :: clojure
python :: python
python :: python
javascript :: python
python :: python
javascript :: javascript
javascript :: javascript
javascript :: javascript
javascript :: javascript
ruby :: ruby
ruby :: ruby
ruby :: ruby
java :: haskell
javascript :: haskell
php :: haskell
javascript :: scheme
javascript :: scheme
scheme :: scheme
java :: java
java :: java
php :: scala
javascript :: scala
php :: tcl
php :: tcl
php :: php
php :: php
php :: php
ocaml :: ocaml
ocaml :: ocaml


### Test: Random Forest

In [89]:
score = forest_pipe.score(testy_X, testy_y)
prediction = forest_pipe.predict(testy_X)

print("mean accuracy: ", score, "\n")
print("PREDICTION :: ACTUAL")
for pair in zip(prediction, test_y):
    print("{} :: {}".format(pair[0], pair [1]))

mean accuracy:  0.28125 

PREDICTION :: ACTUAL
ocaml :: clojure
ruby :: clojure
ruby :: clojure
ruby :: clojure
ocaml :: python
python :: python
ruby :: python
ruby :: python
javascript :: javascript
ruby :: javascript
scala :: javascript
javascript :: javascript
ruby :: ruby
ruby :: ruby
ruby :: ruby
ruby :: haskell
ruby :: haskell
perl :: haskell
javascript :: scheme
javascript :: scheme
common lisp :: scheme
javascript :: java
javascript :: java
javascript :: scala
scala :: scala
php :: tcl
javascript :: tcl
php :: php
javascript :: php
javascript :: php
ocaml :: ocaml
javascript :: ocaml


### Test: Naive Bayes

In [90]:
score = baye_pipe.score(testy_X, testy_y)
prediction = baye_pipe.predict(testy_X)

print("mean accuracy: ", score, "\n")
print("PREDICTION :: ACTUAL")
for pair in zip(prediction, test_y):
    print("{} :: {}".format(pair[0], pair [1]))

mean accuracy:  0.65625 

PREDICTION :: ACTUAL
clojure :: clojure
clojure :: clojure
clojure :: clojure
java :: clojure
python :: python
python :: python
python :: python
python :: python
javascript :: javascript
javascript :: javascript
scala :: javascript
scala :: javascript
ruby :: ruby
ruby :: ruby
ruby :: ruby
java :: haskell
scala :: haskell
scala :: haskell
scheme :: scheme
scheme :: scheme
scheme :: scheme
c :: java
c :: java
scala :: scala
scala :: scala
php :: tcl
php :: tcl
c :: php
php :: php
php :: php
ocaml :: ocaml
ocaml :: ocaml


## Build Your Own
I used several methods to try to improve the classifier, but it's still not very good. First, I built a custom featurizer using the following functions. I ended up writing a function that takes a regular expression so that I could try different things quickly.

In [91]:
def caps_to_non(text):
    cap_letters = re.findall(r'[A-Z]', text)
    non_caps = re.findall(r'[a-z]', text)
    return len(cap_letters) / len(non_caps)
    
    
def percent_occurence_of_parenthesis(text):
    pars = re.findall(r'\(|\)', text)
    return len(pars) / len(text)


def percent_occurence_of_curly(text):
    curls = re.findall(r'\{|\}', text)
    return len(curls) / len(text)


def percent_occurence_of_space(text):
    spaces = re.findall(r'\s', text)
    return len(spaces) / len(text)

    
def occurence_of_this_pattern(reg_ex):
    
    def feature_fn(text):
        occ = re.findall(r'{}'.format(reg_ex), text)
        return len(occ)

    return feature_fn


class FunctionFeaturizer(TransformerMixin):
    def __init__(self, *featurizers):
        self.featurizers = featurizers
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        fvs = []
        for text in X:
            fv = [f(text) for f in self.featurizers]
            fvs.append(fv)
        return np.array(fvs)

After instantiating my featurizer class, I combined it with the CountVectorizer using the sklearn class FeatureUnion. I passed a regular expression to the count vectorizer that tokenizes words, all kinds of white space, and different punctuation. I am using the decision tree classifier, which scored highest in my previous trials, in addition to the linear support vector classifier. Using the TfidfTransformer seems to make the score worse.<br>
<br>
Without random state argurments, the resulting classifier predicts correctly anywhere from 3.5% to 15.6% of the time, but usually hits between 9% - 12.5%. I don't have any other ideas for how to improve it.

In [97]:
featurizer = FunctionFeaturizer(caps_to_non,
                                percent_occurence_of_parenthesis,
                                percent_occurence_of_curly,
                                percent_occurence_of_space,
                                occurence_of_this_pattern('&\w'),
                                occurence_of_this_pattern('\$\w'),
                                occurence_of_this_pattern('[A-Za-z]+[A-Z]'))

cv = CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]', lowercase=False)

feature_extractors = FeatureUnion([('my featurizer', featurizer), ('cv', cv)])

my_classifier = Pipeline([
                    ('featurizer', feature_extractors),
                    ('transformer', TfidfTransformer()),
#                     ('classifier', DecisionTreeClassifier(random_state=1067)),
                    ('linsvc', LinearSVC(random_state=13)),
                    ])

my_classifier.fit(X_train, y_train)
my_classifier.named_steps['featurizer'].transform(X_train)
train_score = my_classifier.score(X_test, y_test)

test_score = my_classifier.score(testy_X, testy_y)
prediction = my_classifier.predict(testy_X)

print("TRAINING")
print("mean accuracy: ", train_score, "\n")
print("TESTING")
print("mean accuracy: ", test_score, "\n")
print("PREDICTION :: ACTUAL")
for pair in zip(prediction, test_y):
    print("{} :: {}".format(pair[0], pair [1]))

TRAINING
mean accuracy:  0.868778280543 

TESTING
mean accuracy:  0.5625 

PREDICTION :: ACTUAL
clojure :: clojure
clojure :: clojure
clojure :: clojure
java :: clojure
ruby :: python
python :: python
python :: python
python :: python
javascript :: javascript
javascript :: javascript
scala :: javascript
scala :: javascript
ruby :: ruby
ruby :: ruby
ruby :: ruby
java :: haskell
clojure :: haskell
scala :: haskell
common lisp :: scheme
scheme :: scheme
scheme :: scheme
c :: java
c :: java
scala :: scala
scala :: scala
php :: tcl
php :: tcl
c :: php
php :: php
php :: php
c# :: ocaml
ocaml :: ocaml


In [47]:
print(classification_report(my_classifier.predict(testy_X), testy_y))
print(confusion_matrix(testy_y, my_classifier.predict(testy_X)))

             precision    recall  f1-score   support

          c       0.00      0.00      0.00         3
         c#       0.00      0.00      0.00         1
    clojure       0.75      0.75      0.75         4
common lisp       0.00      0.00      0.00         1
    haskell       0.00      0.00      0.00         0
       java       0.00      0.00      0.00         2
 javascript       0.50      1.00      0.67         2
      ocaml       0.50      1.00      0.67         1
        php       0.67      0.50      0.57         4
     python       0.75      1.00      0.86         3
       ruby       1.00      0.75      0.86         4
      scala       1.00      0.40      0.57         5
     scheme       0.67      1.00      0.80         2
        tcl       0.00      0.00      0.00         0

avg / total       0.62      0.56      0.55        32

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 3 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 1 0 0 0 0 0 1

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


The commented code is full of unsuccessful attempts to modify the CountVectorizer. I tried passing my own specific vocabulary. I tried different regular expressions. I tried analizing 2 and 3 character n-grams rather than words. None of these methods made a significant difference.

In [20]:
# my_vocab = ['function', '{', '}', '\n', '\t', ':', ';', 'def', ',', '->',
#             '(', ')', 'call', 'lambda', 'set', '@', '>', '<', '.', '[',
#             ']', 'var', 'elif', 'else', 'else if', 'then', 'in',
#             'switch', 'IfTrue', 'IfFalse', 'unless', 'not', 'elsif',
#             'given', 'end', 'match', '(if', '(otherwise', 'progn', 'begin',
#             'cond', 'then begin', 'with', 'when', 'foreach', 'for each',
#             'for_each', 'for (', '$i++', '$i', '$', 'do', 'until', 'loop',
#             'let loop', 'for-each', 'done', '.iter', 'catch', 'except',
#             'longjmp', 'setjmp', 'finally', 'throw', 'die', 'eval', '$@',
#             'rescue', 'ensure', 'handler-', 'check-', 'guard', 'try:',
#             'catchError', 'last', 'break', 'return-from',
#             'loop-finish', 'go', 'goto', 'next', 'func', 'void', 'int main',
#             'main', 'public', 'defun', 'setf', 'define', '&', '*', '/',
#             'require', ' = ', 'import', '__init__']
# cv = CountVectorizer(vocabulary=my_vocab)

# cv = CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]', lowercase=False)
# cv = CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|[^\w\d\s]+')

# cv = CountVectorizer(analyzer='char', ngram_range=(2,3))

# cv = CountVectorizer(lowercase=False)


In [48]:
def language_guesser(snippet):
    return my_classifier.predict([snippet])

In [49]:
language_guesser('''def an_imaginary_function:\n    return dict = {'a': 'B'}''')

array(['python'], 
      dtype='<U11')