In [100]:
#handles the stored webscrapped data
import pickle
import glob

#testing the files
from csv import reader

#handles the machine learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.grid_search import GridSearchCV

# Used to flatten dictionary
from operator import add
from itertools import chain
from functools import reduce

# Panda to make things look pretty
import pandas as pd

# MultinomialNB With TFIDF 

MultinomialNB, or MNB, is a supervised learning algorithm that looks at probability at how often an item occurs and estimates the % change of that. TFID creats a score based on how often an item occurs. By summing the total score of a particular phrase we are able to start pointing to what the prediction could be.

In [2]:
#import pickle file of langauge data
lang_semantics = pickle.load(open("language_data/data.p", "rb"))
# for key in lang_semantics:
#     print(len(lang_semantics[key]))

In [3]:
# Flatten the dict_lsit
lang_X = []
lang_y = []
for key in lang_semantics:
    for item in lang_semantics[key]:
        lang_X.append(item)
        lang_y.append(key)

In [10]:
lang_X_train, lang_X_test, lang_y_train, lang_y_test = train_test_split(lang_X, lang_y)

In [5]:
pipeline_map = [('bag_of_words', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('bayes', MultinomialNB())]

In [6]:
pipeline = Pipeline(pipeline_map)

In [7]:
hashpipe = pipeline.fit(lang_X_train, lang_y_train)

In [49]:
print('MNB train score: ', hashpipe.score(lang_X_train, lang_y_train))
print('MNB test score: ', hashpipe.score(lang_X_test, lang_y_test))

MNB train score:  0.843673067454
MNB test score:  0.847970479705


### Probability of Predictions - MultinomialNB With TFIDF  

In [23]:
probability = {}
temp_list = []
for item in set(lang_y):
    predictions = pipeline.predict(lang_X_test)
    predicted = list(predictions).count(item)
    total = len(lang_X_test)
    print('{0} : {1} / {2} = {3:.2%}'.format(item, predicted, total, predicted / total))
    temp_list.append(predicted/total)
probability['MNB'] = temp_list
    

Haskell : 94 / 1355 = 6.94%
Python : 208 / 1355 = 15.35%
HicEst : 0 / 1355 = 0.00%
Scheme : 15 / 1355 = 1.11%
Java : 123 / 1355 = 9.08%
PHP : 29 / 1355 = 2.14%
Scala : 89 / 1355 = 6.57%
C# : 64 / 1355 = 4.72%
Clojure : 32 / 1355 = 2.36%
Perl : 138 / 1355 = 10.18%
Ruby : 163 / 1355 = 12.03%
C++ : 102 / 1355 = 7.53%
OCaml : 77 / 1355 = 5.68%
C : 148 / 1355 = 10.92%
JavaScript : 73 / 1355 = 5.39%


# Support Vector Machine With TFIDF

SVM uses a 3D space instead of 2D to map out the features.

In [19]:
from sklearn.linear_model import SGDClassifier

In [119]:
pipeline1 = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, n_iter=2, random_state=42)),
])

In [120]:
weezer = pipeline1.fit(lang_X_train, lang_y_train)
temp_list=[]
for item in set(lang_y):
    predictions1 = pipeline1.predict(lang_X_test)
    predicted = list(predictions1).count(item)
    total = len(lang_X_test)
    print('{0} : {1} / {2} = {3:.2%}'.format(item, predicted, total, predicted / total))
    temp_list.append(predicted/total)
probability['SVM'] = temp_list

Haskell : 91 / 1355 = 6.72%
Python : 151 / 1355 = 11.14%
HicEst : 17 / 1355 = 1.25%
Scheme : 34 / 1355 = 2.51%
Java : 110 / 1355 = 8.12%
PHP : 51 / 1355 = 3.76%
Scala : 98 / 1355 = 7.23%
C# : 58 / 1355 = 4.28%
Clojure : 57 / 1355 = 4.21%
Perl : 129 / 1355 = 9.52%
Ruby : 144 / 1355 = 10.63%
C++ : 99 / 1355 = 7.31%
OCaml : 83 / 1355 = 6.13%
C : 134 / 1355 = 9.89%
JavaScript : 99 / 1355 = 7.31%


In [121]:
print('SVM train score: ', weezer.score(lang_X_train, lang_y_train))
print('SVM test score: ', weezer.score(lang_X_test, lang_y_test))

SVM train score:  0.955440669621
SVM test score:  0.883394833948


# Grid Search

Help determine the best parameters for searching

In [112]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}
gs_clf = GridSearchCV(weezer, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(lang_X_train, lang_y_train)
gs_predict = gs_clf.predict(lang_X_test)
temp_list=[]
for item in set(lang_y):
    predicted = list(gs_predict).count(item)
    total = len(lang_X_test)
    print('{0} : {1} / {2} = {3:.2%}'.format(item, predicted, total, predicted / total))
    temp_list.append(predicted/total)
probability['GS'] = temp_list

Haskell : 95 / 1355 = 7.01%
Python : 146 / 1355 = 10.77%
HicEst : 13 / 1355 = 0.96%
Scheme : 33 / 1355 = 2.44%
Java : 112 / 1355 = 8.27%
PHP : 50 / 1355 = 3.69%
Scala : 99 / 1355 = 7.31%
C# : 64 / 1355 = 4.72%
Clojure : 60 / 1355 = 4.43%
Perl : 135 / 1355 = 9.96%
Ruby : 138 / 1355 = 10.18%
C++ : 100 / 1355 = 7.38%
OCaml : 84 / 1355 = 6.20%
C : 130 / 1355 = 9.59%
JavaScript : 96 / 1355 = 7.08%


In [113]:
temp_list=[]
for item in set(lang_y):
    predicted = list(gs_predict).count(item)
    total = len(lang_X_test)
    print('{0} : {1} / {2} = {3:.2%}'.format(item, predicted, total, predicted / total))
    temp_list.append(predicted/total)
probability['GS'] = temp_list

Haskell : 95 / 1355 = 7.01%
Python : 146 / 1355 = 10.77%
HicEst : 13 / 1355 = 0.96%
Scheme : 33 / 1355 = 2.44%
Java : 112 / 1355 = 8.27%
PHP : 50 / 1355 = 3.69%
Scala : 99 / 1355 = 7.31%
C# : 64 / 1355 = 4.72%
Clojure : 60 / 1355 = 4.43%
Perl : 135 / 1355 = 9.96%
Ruby : 138 / 1355 = 10.18%
C++ : 100 / 1355 = 7.38%
OCaml : 84 / 1355 = 6.20%
C : 130 / 1355 = 9.59%
JavaScript : 96 / 1355 = 7.08%


In [114]:
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 2)


## Grid Search W/Updated Parameters

In [115]:
parameters = {'vect__ngram_range': [(1, 2)],
              'tfidf__use_idf': (True, True),
              'clf__alpha': (.001, .001),
}
gs_clf_updated = GridSearchCV(weezer, parameters, n_jobs=-1)
gs_clf_updated = gs_clf_updated.fit(lang_X_train, lang_y_train)
gs_predict_updated = gs_clf_updated.predict(lang_X_test)
temp_list=[]
for item in set(lang_y):
    predicted = list(gs_predict_updated).count(item)
    total = len(lang_X_test)
    print('{0} : {1} / {2} = {3:.2%}'.format(item, predicted, total, predicted / total))
    temp_list.append(predicted/total)
probability['GS_Updated'] = temp_list

Haskell : 95 / 1355 = 7.01%
Python : 146 / 1355 = 10.77%
HicEst : 13 / 1355 = 0.96%
Scheme : 33 / 1355 = 2.44%
Java : 112 / 1355 = 8.27%
PHP : 50 / 1355 = 3.69%
Scala : 99 / 1355 = 7.31%
C# : 64 / 1355 = 4.72%
Clojure : 60 / 1355 = 4.43%
Perl : 135 / 1355 = 9.96%
Ruby : 138 / 1355 = 10.18%
C++ : 100 / 1355 = 7.38%
OCaml : 84 / 1355 = 6.20%
C : 130 / 1355 = 9.59%
JavaScript : 96 / 1355 = 7.08%


# Metric Reports

In [73]:
print("MultinomialNB With TFIDF")
print(metrics.classification_report(hashpipe.predict(lang_X_test), lang_y_test))
print("----------------------------------------------------")
print("Support Vector Machine With TFIDF")
print(metrics.classification_report(weezer.predict(lang_X_test), lang_y_test))
print("----------------------------------------------------")
print("Support Vector Machine With TFIDF")
print(metrics.classification_report(gs_clf.predict(lang_X_test), lang_y_test))
print("----------------------------------------------------")

MultinomialNB With TFIDF
             precision    recall  f1-score   support

          C       0.97      0.76      0.85       148
         C#       0.86      0.92      0.89        64
        C++       0.91      0.99      0.95       102
    Clojure       0.46      1.00      0.63        32
    Haskell       0.91      0.93      0.92        94
     HicEst       0.00      0.00      0.00         0
       Java       0.95      0.82      0.88       123
 JavaScript       0.82      0.96      0.89        73
      OCaml       0.89      0.99      0.94        77
        PHP       0.42      1.00      0.59        29
       Perl       0.98      0.86      0.92       138
     Python       0.94      0.67      0.78       208
       Ruby       0.95      0.79      0.86       163
      Scala       0.88      0.89      0.88        89
     Scheme       0.47      1.00      0.64        15

avg / total       0.90      0.85      0.86      1355

----------------------------------------------------
Support Vector Mac

  'recall', 'true', average, warn_for)


             precision    recall  f1-score   support

          C       0.97      0.88      0.92       130
         C#       0.87      0.94      0.90        64
        C++       0.89      0.99      0.94       100
    Clojure       0.81      0.95      0.88        60
    Haskell       0.92      0.93      0.92        95
     HicEst       0.68      1.00      0.81        13
       Java       0.91      0.86      0.88       112
 JavaScript       0.93      0.82      0.87        96
      OCaml       0.91      0.92      0.91        84
        PHP       0.70      0.96      0.81        50
       Perl       0.97      0.87      0.92       135
     Python       0.88      0.90      0.89       146
       Ruby       0.90      0.88      0.89       138
      Scala       0.91      0.83      0.87        99
     Scheme       0.94      0.91      0.92        33

avg / total       0.90      0.90      0.90      1355

----------------------------------------------------


# Probability Comparison

In [122]:
df = pd.DataFrame(probability)
df['lang'] = set(lang_y)
df.index = df.pop('lang')
df

Unnamed: 0_level_0,GS,GS_Updated,MNB,SVM
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Haskell,0.070111,0.070111,0.069373,0.067159
Python,0.107749,0.107749,0.153506,0.111439
HicEst,0.009594,0.009594,0.0,0.012546
Scheme,0.024354,0.024354,0.01107,0.025092
Java,0.082657,0.082657,0.090775,0.081181
PHP,0.0369,0.0369,0.021402,0.037638
Scala,0.073063,0.073063,0.065683,0.072325
C#,0.047232,0.047232,0.047232,0.042804
Clojure,0.04428,0.04428,0.023616,0.042066
Perl,0.099631,0.099631,0.101845,0.095203


# Test Files Test

In [123]:
with open('test_data/test.csv') as test_csv:
    test_files = reader(test_csv)
    
    guess_dict = {}
    three_class = ['Correct_Answer','MNB', 'SVM', 'GS']
    mnb_temp = []
    svm_temp = []
    gs_temp = []
    ca_temp = []
    for line in test_files:
        with open('test_data/test/'+line[0]) as test:
            mnb = hashpipe.predict([test.read()])
            svm = weezer.predict([test.read()])
            grid = gs_clf.predict([test.read()])
            mnb_temp.append(mnb[0])
            svm_temp.append(svm[0])
            gs_temp.append(grid[0])
            ca_temp.append(line[1])
df_test = pd.DataFrame()
df_test['Correct_Answer'] = ca_temp
df_test['MNB'] = mnb_temp
df_test['SVM'] = svm_temp
df_test['GS'] = gs_temp

In [125]:
df_test

Unnamed: 0,Correct_Answer,MNB,SVM,GS
0,clojure,Perl,Ruby,Ruby
1,clojure,Java,Ruby,Ruby
2,clojure,Ruby,Ruby,Ruby
3,clojure,Ruby,Ruby,Ruby
4,python,Python,Ruby,Ruby
5,python,Python,Ruby,Ruby
6,python,Python,Ruby,Ruby
7,python,Python,Ruby,Ruby
8,javascript,JavaScript,Ruby,Ruby
9,javascript,JavaScript,Ruby,Ruby
