In [24]:
import pandas as pd
import numpy as np
import glob
import re
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, normalize

## Reading in Data
Used glob to read a bunch of files with a similar path according to file extension. This allowed me to get just the languages I wanted in my data set. 

In [25]:
def read_prog_files(file_ext):
    files = glob.glob('/Users/kathrynjackson/Code/homework/programming-language-classifier/benchmarksgame-2014-08-31/benchmarksgame/bench/**/*.{}'.format(file_ext), recursive=True)
    texts = []
    for file in files:
        with open(file) as f:
            texts.append(f.read())
    return texts, len(texts) * [file_ext]

Read in each type of file and concatenated data into x and y lists.

In [89]:
file_extensions = ['gcc', 'c', 'csharp', 'sbcl', 'clojure', 'java', 'javascript', 'ocaml', 'perl', 'hack', 'php', 'python3', 'jruby', 'yarv', 'scala', 'racket']
X = []
y = []

for ext in file_extensions:
    x_texts, y_exts = read_prog_files(ext)
    X += x_texts
    y += y_exts
    
print(len(X), len(y))

552 552


## Train-Test Split
Used the train_test_split method from sklearn to split data set into 60/40 for training and testing the classifier.

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, train_size=0.6, random_state=898)

I wanted to includ the one '.c' file in my training data, not my test data, so I made sure the length of my training data was 16.

In [91]:
len(np.unique(y_train))

16

## Feature Extraction: Count Vectorizer
I used scikit-learn's count vectorizer to extract features from the data. I wanted words, white spaces, and puctuaion, but not numbers, which I assumed would be less language specific and more project specific. There are a lot of features in this model.

In [98]:
cv = CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]')
cv.fit(X_train)
cv.transform(X_train)
print(len(cv.get_feature_names()))
print(cv.get_feature_names()[20:40])

4552
['<', '=', '>', '?', '@', '[', '\\', ']', '^', '`', 'aa', 'aaa', 'aaron', 'aatacaaaaattagccgggcgtggtggcgcgcgcctgtaat', 'ability', 'able', 'abort', 'about', 'above', 'abs']


## Naive Bayes Classifier

In [99]:
baye_pipe = Pipeline([('vectorizer', CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]')),
                      ('classifier', MultinomialNB())])

baye_pipe.fit(X_train, y_train)
baye_pipe.named_steps['vectorizer'].transform(X_train)

baye_pipe.score(X_test, y_test)

0.73303167420814475

## Decision Tree Classifier

In [112]:
from sklearn.tree import DecisionTreeClassifier

tree_pipe = Pipeline([('vectorizer', CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]')),
#                       ('transformer', TfidfTransformer()),
                      ('classifier', DecisionTreeClassifier())])

tree_pipe.fit(X_train, y_train)
tree_pipe.named_steps['vectorizer'].transform(X_train)

print(tree_pipe.score(X_train, y_train))
print(tree_pipe.score(X_test, y_test))

0.990936555891
0.859728506787


## Random Forest Classifier
This meta estimator performs worse than the single decision tree classifier.

In [121]:
from sklearn.ensemble import RandomForestClassifier

forest_pipe = Pipeline([('vectorizer', CountVectorizer(token_pattern=r'[a-zA-Z]{2,}|\s|[^\w\d\s]')),
#                       ('transformer', TfidfTransformer()),
                        ('classifier', RandomForestClassifier())])

forest_pipe.fit(X_train, y_train)
forest_pipe.named_steps['vectorizer'].transform(X_train)

print(forest_pipe.score(X_train, y_train))
print(forest_pipe.score(X_test, y_test))

0.981873111782
0.823529411765
