In [138]:
import pandas as pd
import numpy as np
import glob
import re
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline, make_union, FeatureUnion
from sklearn.preprocessing import PolynomialFeatures, normalize

## Reading in Data
Used glob to read a bunch of files with a similar path according to file extension. This allowed me to get just the languages I wanted in my data set. 

In [139]:
def read_prog_files(file_ext):
    files = glob.glob('/Users/kathrynjackson/Code/homework/programming-language-classifier/benchmarksgame-2014-08-31/benchmarksgame/bench/**/*.{}'.format(file_ext), recursive=True)
    texts = []
    for file in files:
        with open(file) as f:
            texts.append(f.read())
    return texts

Read in each type of file and concatenated data into x and y lists.

In [152]:
file_extensions = ['gcc', 'c', 'csharp', 'sbcl', 'clojure', 'java', 'javascript', 'ocaml', 'perl', 'hack', 'php', 'python3', 'jruby', 'yarv', 'scala', 'racket']
ext_dict = {'jruby': 'ruby', 'csharp': 'c#', 'hack': 'php', 'sbcl': 'common lisp', 'ocaml': 'ocaml', 'python3': 'python', 'php': 'php', 'perl': 'perl', 'racket': 'scheme', 'c': 'c', 'javascript': 'javascript', 'gcc': 'c', 'yarv': 'ruby', 'java': 'java', 'clojure': 'clojure', 'scala': 'scala'}
X = []
y = []

for ext in file_extensions:
    x_texts = read_prog_files(ext)
    X += x_texts
    y += (len(x_texts) * [ext_dict[ext]])
    
print(len(X), len(y))
len(set(y))

552 552


13

## Train-Test Split
Used the train_test_split method from sklearn to split data set into 60/40 for training and testing the classifier.

In [153]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, train_size=0.6, random_state=890)
len(X_train)

331

I wanted to includ the one '.c' file in my training data, not my test data, so I made sure the length of my training data was 16.

In [154]:
len(np.unique(y_train))

13

## Feature Extraction: Count Vectorizer
I used scikit-learn's count vectorizer to extract features from the data. I wanted words, white spaces, and puctuaion, but not numbers, which I assumed would be less language specific and more project specific. There are a lot of features in this model.

In [387]:
cv = CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|[^\w\d\s]+')
cv.fit(X_train)
cv.transform(X_train)
print(len(cv.get_feature_names()))
print(cv.get_feature_names()[20:40])
# cv.vocabulary_
# tuple_list = [(best_dict[key], key) for key in best_dict]
# sorted(tuple_list)

5857
["!='\\", '!=(', '!==', '!}', '!~', '"', '""', '"""', '""".', '""";', '""">.*\\', '"")', '""))', '"")))', '"");', '"",', '"".', '"";', '"#{', '"#{$']


## Naive Bayes Classifier

In [156]:
baye_pipe = Pipeline([('vectorizer', CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]')),
                      ('classifier', MultinomialNB())])

baye_pipe.fit(X_train, y_train)
baye_pipe.named_steps['vectorizer'].transform(X_train)

baye_pipe.score(X_test, y_test)

0.92760180995475117

## Decision Tree Classifier

In [157]:
from sklearn.tree import DecisionTreeClassifier

tree_pipe = Pipeline([('vectorizer', CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]')),
#                       ('transformer', TfidfTransformer()),
                      ('classifier', DecisionTreeClassifier())])

tree_pipe.fit(X_train, y_train)
tree_pipe.named_steps['vectorizer'].transform(X_train)

print(tree_pipe.score(X_train, y_train))
print(tree_pipe.score(X_test, y_test))

1.0
0.968325791855


## Random Forest Classifier
This meta estimator performs worse than the single decision tree classifier.

In [158]:
from sklearn.ensemble import RandomForestClassifier

forest_pipe = Pipeline([('vectorizer', CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]')),
#                       ('transformer', TfidfTransformer()),
                        ('classifier', RandomForestClassifier())])

forest_pipe.fit(X_train, y_train)
forest_pipe.named_steps['vectorizer'].transform(X_train)

print(forest_pipe.score(X_train, y_train))
print(forest_pipe.score(X_test, y_test))

1.0
0.981900452489


## Testing

In [159]:
def read_test_files():
    files = glob.glob('/Users/kathrynjackson/Code/homework/assignments-master/week5/polyglot/test/*', recursive=True)
    tests = []
    for file in files:
        with open(file) as f:
            tests.append(f.read())
    return tests

In [160]:
import csv

testy_X = read_test_files()
testy_y = []
with open('/Users/kathrynjackson/Code/homework/assignments-master/week5/polyglot/test.csv') as test_targets:
    lines = csv.reader(test_targets)
    for line in lines:
        testy_y.append(line[1])

In [162]:
print(tree_pipe.score(testy_X, testy_y))
print(tree_pipe.predict(testy_X))

0.03125
['python' 'ocaml' 'javascript' 'javascript' 'ruby' 'ocaml' 'c' 'python'
 'python' 'python' 'ocaml' 'ocaml' 'ocaml' 'ocaml' 'java' 'javascript'
 'scala' 'scala' 'ocaml' 'python' 'php' 'ocaml' 'ocaml' 'php' 'python'
 'ocaml' 'python' 'python' 'python' 'python' 'python' 'javascript']


In [163]:
print(forest_pipe.score(testy_X, testy_y))
print(forest_pipe.predict(testy_X))

0.03125
['ocaml' 'ruby' 'ocaml' 'javascript' 'ruby' 'ruby' 'ruby' 'ruby' 'ruby'
 'ruby' 'ocaml' 'php' 'ruby' 'scheme' 'perl' 'ruby' 'python' 'c#' 'ruby'
 'perl' 'php' 'perl' 'ruby' 'ocaml' 'ocaml' 'c' 'c' 'python' 'python'
 'perl' 'python' 'javascript']


In [164]:
print(baye_pipe.score(testy_X, testy_y))
print(baye_pipe.predict(testy_X))

0.03125
['clojure' 'javascript' 'scala' 'scala' 'ruby' 'ruby' 'ruby' 'java' 'scala'
 'scala' 'scheme' 'clojure' 'scheme' 'scheme' 'c' 'c' 'scala' 'scala' 'php'
 'php' 'c' 'php' 'clojure' 'php' 'ocaml' 'ocaml' 'java' 'python' 'python'
 'python' 'python' 'javascript']


In [165]:
testy_y

['clojure',
 'clojure',
 'clojure',
 'clojure',
 'python',
 'python',
 'python',
 'python',
 'javascript',
 'javascript',
 'javascript',
 'javascript',
 'ruby',
 'ruby',
 'ruby',
 'haskell',
 'haskell',
 'haskell',
 'scheme',
 'scheme',
 'scheme',
 'java',
 'java',
 'scala',
 'scala',
 'tcl',
 'tcl',
 'php',
 'php',
 'php',
 'ocaml',
 'ocaml']

## Build Your Own

In [460]:
from sklearn.base import TransformerMixin

def caps_to_non(text):
    cap_letters = re.findall(r'[A-Z]', text)
    non_caps = re.findall(r'[a-z]', text)
    return len(cap_letters) / len(non_caps)
    
    
#                                 percent_occurence_of_parenthesis,
#                                 percent_occurence_of_curly,
#                                 percent_occurence_of_space,

# def next_to_last_character(text):
#     return text[-2]

def percent_occurence_of_parenthesis(text):
    pars = re.findall(r'\(|\)', text)
    return len(pars) / len(text)

def percent_occurence_of_curly(text):
    curls = re.findall(r'\{|\}', text)
    return len(curls) / len(text)

def percent_occurence_of_space(text):
    spaces = re.findall(r'\s', text)
    return len(spaces) / len(text)


def has_end(text):
    if re.search(r'end', text):
        return 1
    else:
        return 0

def occurence_of_this_pattern(reg_ex):
    
    def feature_fn(text):
        occ = re.findall(r'{}'.format(reg_ex), text)
        return len(occ)

    return feature_fn


class FunctionFeaturizer(TransformerMixin):
    def __init__(self, *featurizers):
        self.featurizers = featurizers
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        fvs = []
        for text in X:
            fv = [f(text) for f in self.featurizers]
            fvs.append(fv)
        return np.array(fvs)

In [461]:
# bag_plus_featurizer = make_union(
#     CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]'),
#     FunctionFeaturizer(number_of_capital_letters,
#                                 occurence_of_parenthesis,
#                                 occurence_of_curly,
#                                 occurence_of_space,
#                                 occurence_of_s_function,
#                                 occurence_of_punctuation,
#                                 new_line_per_char))

# bag_plus_featurizer.fit_transform(X[:10])
# baggy_pipe = Pipeline([('my_featurizer', bag_plus_featurizer()),
#                         ('my classifier', DecisionTreeClassifier())])
# baggy_pipe.fit(X_train, y_train)

In [462]:
# for text in X_train:
#     print(percent_occurence_of_punctuation(text))

In [492]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import binarize
from sklearn.svm import LinearSVC

featurizer = FunctionFeaturizer(caps_to_non,
                                percent_occurence_of_parenthesis,
                                percent_occurence_of_curly,
                                percent_occurence_of_space,
                                occurence_of_this_pattern('&\w'),
                                occurence_of_this_pattern('\$\w'),
                                occurence_of_this_pattern('[A-Za-z]+[A-Z]'))

# my_vocab = ['function', '{', '}', '\n', '\t', ':', ';', 'def', ',', '->',
#             '(', ')', 'call', 'lambda', 'set', '@', '>', '<', '.', '[',
#             ']', 'var', 'elif', 'else', 'else if', 'then', 'in',
#             'switch', 'IfTrue', 'IfFalse', 'unless', 'not', 'elsif',
#             'given', 'end', 'match', '(if', '(otherwise', 'progn', 'begin',
#             'cond', 'then begin', 'with', 'when', 'foreach', 'for each',
#             'for_each', 'for (', '$i++', '$i', '$', 'do', 'until', 'loop',
#             'let loop', 'for-each', 'done', '.iter', 'catch', 'except',
#             'longjmp', 'setjmp', 'finally', 'throw', 'die', 'eval', '$@',
#             'rescue', 'ensure', 'handler-', 'check-', 'guard', 'try:',
#             'catchError', 'last', 'break', 'return-from',
#             'loop-finish', 'go', 'goto', 'next', 'func', 'void', 'int main',
#             'main', 'public', 'defun', 'setf', 'define', '&', '*', '/',
#             'require', ' = ', 'import', '__init__']
# cv = CountVectorizer(vocabulary=my_vocab)

cv = CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]', lowercase=False)
# cv = CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|[^\w\d\s]+')

# cv = CountVectorizer(analyzer='char', ngram_range=(2,3))

# cv = CountVectorizer(lowercase=False)

feature_extractors = FeatureUnion([('my featurizer', featurizer), ('cv', cv)])

spca = PCA()


foresty = Pipeline([
                    ('featurizer', feature_extractors),
#                     ('transformer', TfidfTransformer()),
                    ('classifier', DecisionTreeClassifier()),
                    ('linsvc', LinearSVC()),
                    ])

foresty.fit(X_train, y_train)
foresty.named_steps['featurizer'].transform(X_train)
print(foresty.score(X_test, y_test))

print(foresty.score(testy_X, testy_y))
print(foresty.predict(testy_X))


# featurizer.fit(X_train)
# featurizer.transform(X_train)
# tree = DecisionTreeClassifier()
# tree.fit(featurizer.transform(X_train), y_train)
# tree.score(featurizer.transform(X_test), y_test)
# tree.predict(featurizer.transform(testy_X), testy_y)



0.941176470588
0.125
['clojure' 'javascript' 'ocaml' 'javascript' 'ruby' 'ruby' 'ruby' 'ocaml'
 'javascript' 'ocaml' 'javascript' 'javascript' 'javascript' 'php' 'java'
 'scala' 'php' 'scala' 'php' 'php' 'java' 'php' 'javascript' 'php' 'ocaml'
 'ocaml' 'clojure' 'javascript' 'scala' 'scala' 'scala' 'javascript']




In [469]:
# a_tree_pipe.predict(testy_X)

In [305]:
print(testy_y)

['clojure', 'clojure', 'clojure', 'clojure', 'python', 'python', 'python', 'python', 'javascript', 'javascript', 'javascript', 'javascript', 'ruby', 'ruby', 'ruby', 'haskell', 'haskell', 'haskell', 'scheme', 'scheme', 'scheme', 'java', 'java', 'scala', 'scala', 'tcl', 'tcl', 'php', 'php', 'php', 'ocaml', 'ocaml']


In [306]:
class MyTokenizer():
    def __init__(self):
        pass
    
    def __call__(self, text):
        return [do something for t in other tokenizer(text)]
    
vect = CountVectorizer(tokenizer=MyTokenizer())

SyntaxError: invalid syntax (<ipython-input-306-4794128bd099>, line 6)

In [341]:
my_vocab = ['function', '{', '}', '\n', '\t', ':', ';', 'def ', ',', '->',
            '(', ')', 'call', 'lambda', 'set', '@', '>', '<', '.', '[',
            ']', 'var', 'if', 'elif', 'else', 'else if', 'then', 'in',
            'switch', 'IfTrue', 'IfFalse', 'unless', 'not', 'elsif',
            'given', 'end', 'match', '(if', '(otherwise', 'progn', 'begin',
            'cond', 'then begin', 'with', 'when', 'foreach', 'for each',
            'for_each', 'for (', '$i++', '$i', '$', 'do', 'until', 'loop',
            'let loop', 'for-each', 'done', '.iter', 'catch', 'except',
            'longjmp', 'setjmp', 'finally', 'throw', 'die', 'eval', '$@',
            'rescue', 'ensure', 'handler-', 'check-', 'guard', 'try:',
            'catchError', 'last', 'break', 'return-from',
            'loop-finish', 'go', 'goto', 'next', 'func', 'void', 'int main',
            'main', 'public', 'defun', 'setf', 'define', '&', '*', '/',
            'require', ' = ', '--']
cv = CountVectorizer(vocabulary=my_vocab)

cv.fit(X_train)
cv.transform(X_train)
cv.vocabulary_

{'\t': 4,
 '\n': 3,
 ' = ': 94,
 '$': 51,
 '$@': 67,
 '$i': 50,
 '$i++': 49,
 '&': 90,
 '(': 10,
 '(if': 37,
 '(otherwise': 38,
 ')': 11,
 '*': 91,
 ',': 8,
 '--': 95,
 '->': 9,
 '.': 18,
 '.iter': 58,
 '/': 92,
 ':': 5,
 ';': 6,
 '<': 17,
 '>': 16,
 '@': 15,
 'IfFalse': 30,
 'IfTrue': 29,
 '[': 19,
 ']': 20,
 'begin': 40,
 'break': 76,
 'call': 12,
 'catch': 59,
 'catchError': 74,
 'check-': 71,
 'cond': 41,
 'def ': 7,
 'define': 89,
 'defun': 87,
 'die': 65,
 'do': 52,
 'done': 57,
 'elif': 23,
 'else': 24,
 'else if': 25,
 'elsif': 33,
 'end': 35,
 'ensure': 69,
 'eval': 66,
 'except': 60,
 'finally': 63,
 'for (': 48,
 'for each': 46,
 'for-each': 56,
 'for_each': 47,
 'foreach': 45,
 'func': 82,
 'function': 0,
 'given': 34,
 'go': 79,
 'goto': 80,
 'guard': 72,
 'handler-': 70,
 'if': 22,
 'in': 27,
 'int main': 84,
 'lambda': 13,
 'last': 75,
 'let loop': 55,
 'longjmp': 61,
 'loop': 54,
 'loop-finish': 78,
 'main': 85,
 'match': 36,
 'next': 81,
 'not': 32,
 'progn': 39,
 'pub

In [345]:
from sklearn.ensemble import RandomForestClassifier

forest_pipe = Pipeline([('vectorizer', CountVectorizer(vocabulary=my_vocab)),
                      ('transformer', TfidfTransformer()),
                        ('classifier', RandomForestClassifier())])

forest_pipe.fit(X_train, y_train)
forest_pipe.named_steps['vectorizer'].transform(X_train)

print(forest_pipe.score(X_train, y_train))
print(forest_pipe.score(X_test, y_test))

1.0
0.918552036199


## And now for something completely different

In [504]:
filenames = ['/Users/kathrynjackson/Code/homework/programming-language-classifier/benchmarksgame-2014-08-31/benchmarksgame/bench/binarytrees/binarytrees.gcc',
            '/Users/kathrynjackson/Code/homework/programming-language-classifier/benchmarksgame-2014-08-31/benchmarksgame/bench/binarytrees/binarytrees.gcc-2.gcc',
            '/Users/kathrynjackson/Code/homework/programming-language-classifier/benchmarksgame-2014-08-31/benchmarksgame/bench/binarytrees/binarytrees.jruby',
            '/Users/kathrynjackson/Code/homework/programming-language-classifier/benchmarksgame-2014-08-31/benchmarksgame/bench/binarytrees/binarytrees.jruby-3.jruby',
            '/Users/kathrynjackson/Code/homework/programming-language-classifier/benchmarksgame-2014-08-31/benchmarksgame/bench/binarytrees/binarytrees.clojure']

vectorizer = CountVectorizer(input='filename', token_pattern=r'[a-zA-Z]{2,}|[^\w\d\s]')

dtm = vectorizer.fit_transform(filenames)  # a sparse matrix

vocab = vectorizer.get_feature_names()  # a list

In [505]:
dtm.toarray()
vocab = np.array(vocab)

In [513]:
par_idx = list(vocab).index('(')
dtm[4, par_idx]

87