In [1]:
import os
import sys

# Directory holding the project's `data` module. The default is the original
# hard-coded location; set the PEPPRED_SRC environment variable to run the
# notebook from a different checkout without editing this cell.
PEPPRED_SRC = os.environ.get('PEPPRED_SRC', '/home/jonas/peppred/src/')
# Guard the append so re-running this cell does not pile up duplicate entries.
if PEPPRED_SRC not in sys.path:
    sys.path.append(PEPPRED_SRC)
from data import get_data, transform_data

In [95]:
# Heavily inspired by http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html#sphx-glr-auto-examples-text-document-classification-20newsgroups-py

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
# Begin Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC
# End Classifiers
from sklearn.metrics import accuracy_score

In [40]:
# Two explicit stages: count character bigrams ('char_wb' keeps n-grams inside
# word boundaries), then re-weight the raw counts with TF-IDF.
n_gram_range = (2, 2)
vectorizer = CountVectorizer(ngram_range=n_gram_range, analyzer='char_wb')
transformer = TfidfTransformer()

# Parallel lists: one sequence string and one class label per record.
data = transform_data(get_data())
examples = [str(record['sequence']) for record in data]
labels = [record['class'] for record in data]

counts = vectorizer.fit_transform(examples)
tfidf = transformer.fit_transform(counts)
# Bare last expression so the notebook displays the sparse-matrix summary.
tfidf

<2654x452 sparse matrix of type '<class 'numpy.float64'>'
	with 469678 stored elements in Compressed Sparse Row format>

In [55]:
# TfidfVectorizer combines the CountVectorizer + TfidfTransformer steps in a single estimator.
data = transform_data(get_data())
examples = [str(seq['sequence']) for seq in data]
labels = [item['class'] for item in data]
# Hold out 10% of the examples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(examples, labels, test_size=0.1, random_state=99)
n_gram_range = (2, 2)
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=n_gram_range)
# Fit the vocabulary / IDF weights on the training split only, then reuse them
# unchanged for the test split so no test information leaks into the features.
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)
# Mapping from integer feature index to original token string (doesn't work with a hashing vectorizer).
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# when the installed version has it, and keep the result a plain list either way.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

In [99]:
def benchmark_classifiers(classifiers, x_train, y_train, x_test, y_test):
    """Fit every classifier on the training split and score it on the test split.

    Args:
        classifiers: iterable of ``(estimator, name)`` pairs; each estimator
            must provide ``fit(x, y)`` and ``predict(x)``. Estimators are
            fitted in place.
        x_train, y_train: training features and labels passed to ``fit``.
        x_test, y_test: held-out features passed to ``predict`` and the
            labels the predictions are compared against.

    Returns:
        dict mapping each name to the value returned by ``accuracy_score``.
        Each ``name: accuracy`` line is also printed as it is computed.
    """
    results = {}
    for model, label in classifiers:
        model.fit(x_train, y_train)
        results[label] = accuracy_score(y_test, model.predict(x_test))
        print(f"{label}: {results[label]}")
    return results
        
# (estimator, display name) pairs consumed by benchmark_classifiers.
classifiers = [
    (KNeighborsClassifier(n_neighbors=12), "kNN"),
    (RidgeClassifier(tol=1e-2, solver='lsqr'), 'Ridge regression'),
    # `n_iter` was removed in scikit-learn 0.21. `max_iter` together with
    # `tol=None` disables the tolerance-based early stop, so the estimator
    # still makes the same fixed number of passes over the training data.
    (Perceptron(max_iter=20, tol=None), 'Perceptron'),
    (PassiveAggressiveClassifier(max_iter=10, tol=None), 'Passive Agressive Classifier'),
    (RandomForestClassifier(n_estimators=200), 'Random Forest'),
    (MultinomialNB(alpha=0.01), 'Multinomial Naive Bayes'),
    (BernoulliNB(alpha=0.01), "Bernoulli Naive Bayes"),
    (LinearSVC(penalty='l2', tol=1e-3), "SVM"),
]
# Bare last expression so the notebook also displays the returned score dict.
benchmark_classifiers(classifiers, x_train, y_train, x_test, y_test)



kNN: 0.7556390977443609
Ridge regression: 0.8195488721804511
Perceptron: 0.8308270676691729
Passive Agressive Classifier: 0.8120300751879699
Random Forest: 0.8345864661654135
Multinomial Naive Bayes: 0.793233082706767
Bernoulli Naive Bayes: 0.6578947368421053
SVM: 0.8195488721804511


{'Bernoulli Naive Bayes': 0.65789473684210531,
 'Multinomial Naive Bayes': 0.79323308270676696,
 'Passive Agressive Classifier': 0.81203007518796988,
 'Perceptron': 0.83082706766917291,
 'Random Forest': 0.83458646616541354,
 'Ridge regression': 0.81954887218045114,
 'SVM': 0.81954887218045114,
 'kNN': 0.75563909774436089}