### Classify RDP file using SciKit Learn Algorithms
* Override Tokenizer in TfidfVectorizer to produce kmers
* Override Tokenizer in TfidfVectorizer to produce minhash signatures from kmers
* Test both tf-idf and binary, sparse matrices

* kmer_length = 15, as larger lengths produce more unique tokens and large matrices
* Test matrices against common text classification algorithms including:
 * 'MultinomialNB','SGDClassifier','RandomForestClassifier','LinearSVC','LogisticRegression'
* In each instance perform 10-fold cross validation using StratifiedKFold
* Use random_state = 42 whenever possbile


### Load the RDP Test file

In [1]:
def read_fasta_file(file_path):
    X = []
    y = []
    with open(file_path, 'r') as dat:
        for line in dat.readlines():
            #print(line)
            if line[0] == '>':
                g_start = line.find("g__")
                g_end = line.find(";", g_start)
                genus = line[g_start:g_end]
                y.append(genus)
            else:
                X.append(line)
    return X, y

In [2]:
import pandas as pd

# Read in the fasta file
X, y = read_fasta_file('D:/StrandPy/Data/RDP_All_Clean.strand')
#Remove low count genera that halt cross validation
data = {'X': X,'y':y}
df = pd.DataFrame(data)

#Inspect counts for each class
vcts = df.y.value_counts()
low_vcts = vcts[vcts < 20].index.values

#Remove low count genera. Any classes < the fold count will halt cross validation
df = df[~df.y.isin(low_vcts)]

# Create X and y
X = df.X.values
y = df.y.values

print('File Size: ' + str(len(X)))
print('Longest Sequence Chars:' + str(len(max(X, key=len))))

File Size: 4786
Longest Sequence Chars:1834


### Create a tokenizer with no minhashing

In [57]:
kmer_length=15

def kmer_generator(text):
    kmer_count = len(text) - (kmer_length - 1)
    for i in range(0, kmer_count):
        yield text[i:i + kmer_length]

### Create a TfidfVectorizer that uses the naive kmer_generator implementation

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
        
tfidf = TfidfVectorizer(tokenizer=kmer_generator, binary=False, use_idf=True)
X_tfidf = tfidf.fit_transform(X)

print('Total unique kmers: ', len(tfidf.get_feature_names()))
print('X_tfidf shape:, ', X_tfidf.shape)
print('y shape: ', y.shape)

Total unique kmers:  590930
X_tfidf shape:,  (4786, 590930)
y shape:  (4786,)


In [59]:
import sys
print('Size of our sparse X_tfidf: ', sys.getsizeof(X_tfidf))

Size of our sparse X_tfidf:  56


In [60]:
X_tfidf

<4786x590930 sparse matrix of type '<class 'numpy.float64'>'
	with 6964226 stored elements in Compressed Sparse Row format>

### Now Test our sparse matrix against a bunch of models

In [61]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
import time

def stratified_cross_validate(model, X, y, cv):
    start = time.time()
    cv_results = cross_validate(model, X, y, cv=cv, scoring="accuracy", n_jobs=-1)
    elapsed_time = (time.time() - start) 
    print(cv_results)
    print(' ')
    print('Mean Accuracy: ', cv_results['test_score'].mean())
    print('Wall Time: ', elapsed_time)
    return

In [62]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=10, random_state=42)

models = [
    MultinomialNB(),
    SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None),
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=42),
    LinearSVC(random_state=42),
    LogisticRegression(random_state=42)   
]

model_names = ['MultinomialNB','SGDClassifier','RandomForestClassifier','LinearSVC','LogisticRegression']

for model, model_name in zip(models,model_names):
    print(model_name)
    print('--------------------------------')
    stratified_cross_validate(model,X_tfidf,y,cv)
    print(' ')

MultinomialNB
--------------------------------
{'fit_time': array([6.90475702, 7.01076293, 6.84175682, 6.45990849, 6.93575382,
       6.56391168, 7.06874108, 6.90975547, 3.27616954, 3.08183956]), 'score_time': array([0.71744728, 0.97903275, 0.90545058, 1.24929428, 0.98145032,
       0.7589643 , 0.92803264, 0.74245143, 0.33001399, 0.32001662]), 'test_score': array([0.86692759, 0.86706349, 0.86788618, 0.87679671, 0.88075314,
       0.86919831, 0.89079229, 0.88720174, 0.88427948, 0.87665198])}
 
Mean Accuracy:  0.8767550913710789
Wall Time:  11.474984884262085
 
SGDClassifier
--------------------------------
{'fit_time': array([43.45372415, 44.07971931, 44.01722527, 44.11097193, 43.51723051,
       43.67147565, 43.55710268, 44.1943531 , 12.28191304, 12.29127908]), 'score_time': array([0.73437047, 0.69882703, 0.72316742, 0.68257689, 0.68749523,
       0.65624619, 0.67187572, 0.64480805, 0.31299281, 0.32799745]), 'test_score': array([0.98825832, 0.99603175, 0.98780488, 0.99794661, 0.9937238

### Create a tokenizer that uses minhashing

In [41]:
kmer_length=15
minhash_length = 250

def kmer_generator(text):
    kmer_count = len(text) - (kmer_length - 1)
    for i in range(0, kmer_count):
        yield text[i:i + kmer_length]
        
def create_minhash_signature(text):
    # Break sequence into a unique set of hashed kmers
    kmer_hashes = list(set(hash(kmer) for kmer in kmer_generator(text)))
    kmer_hashes.sort()

    if minhash_length==None:
        #Return all of the kmer_hashes as the signature 
        signature = kmer_hashes
    else:
        #Take the first minhash_length hashes to create the minhash signature 
        signature = kmer_hashes[:minhash_length]

    return signature

### Create a TfidfVectorizer that uses the minhash implementation

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
        
tfidf = TfidfVectorizer(tokenizer=create_minhash_signature, binary=False, use_idf=True)
X_tfidf = tfidf.fit_transform(X)

print('Total unique minhashes: ', len(tfidf.get_feature_names()))
print('X_tfidf shape:, ', X_tfidf.shape)
print('y shape: ', y.shape)

Total unique minhashes:  101870
X_tfidf shape:,  (4786, 101870)
y shape:  (4786,)


In [43]:
import sys
print('Size of our sparse X_tfidf: ', sys.getsizeof(X_tfidf))

Size of our sparse X_tfidf:  56


In [44]:
X_tfidf

<4786x101870 sparse matrix of type '<class 'numpy.float64'>'
	with 1196500 stored elements in Compressed Sparse Row format>

### Now Test our sparse MinHash matrix against a bunch of models

In [55]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
import time

def stratified_cross_validate(model, X, y, cv):
    start = time.time()
    cv_results = cross_validate(model, X, y, cv=cv, scoring="accuracy", n_jobs=-1)
    elapsed_time = (time.time() - start) 
    print(cv_results)
    print(' ')
    print('Mean Accuracy: ', cv_results['test_score'].mean())
    print('Wall Time: ', elapsed_time)
    return

In [56]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=10, random_state=42)

models = [
    MultinomialNB(),
    SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None),
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=42),
    LinearSVC(random_state=42),
    LogisticRegression(random_state=42)   
]

model_names = ['MultinomialNB','SGDClassifier','RandomForestClassifier','LinearSVC','LogisticRegression']

for model, model_name in zip(models,model_names):
    print(model_name)
    print('--------------------------------')
    stratified_cross_validate(model,X_tfidf,y,cv)
    print(' ')

MultinomialNB
--------------------------------
{'fit_time': array([0.89999413, 0.99600077, 0.87200952, 0.87200952, 0.91000915,
       0.98100114, 1.01599026, 1.01900077, 0.49891543, 0.47329736]), 'score_time': array([0.12200212, 0.10199022, 0.14098859, 0.13598609, 0.11399889,
       0.10198879, 0.09299755, 0.08999133, 0.06250215, 0.04687166]), 'test_score': array([0.89236791, 0.88492063, 0.88617886, 0.90349076, 0.89539749,
       0.89029536, 0.90792291, 0.90455531, 0.90393013, 0.88986784])}
 
Mean Accuracy:  0.8958927209872168
Wall Time:  1.767406702041626
 
SGDClassifier
--------------------------------
{'fit_time': array([3.99300385, 4.00099683, 4.01599455, 4.07999969, 4.02299476,
       4.00499701, 4.0339942 , 4.09973288, 1.49460292, 1.55561543]), 'score_time': array([0.10900593, 0.11000896, 0.10800433, 0.08510661, 0.10400939,
       0.10899878, 0.10000539, 0.05571437, 0.04999876, 0.05950284]), 'test_score': array([0.98825832, 0.99603175, 0.98780488, 0.99794661, 0.99372385,
       0

### Create a Binary TfidfVectorizer that uses the minhash implementation

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
        
tfidf = TfidfVectorizer(tokenizer=create_minhash_signature, binary=True, use_idf=False, norm=None)
X_tfidf = tfidf.fit_transform(X)

print('Total unique minhashes: ', len(tfidf.get_feature_names()))
print('X_tfidf shape:, ', X_tfidf.shape)
print('y shape: ', y.shape)

Total unique minhashes:  101870
X_tfidf shape:,  (4786, 101870)
y shape:  (4786,)


In [64]:
import sys
print('Size of our sparse X_tfidf: ', sys.getsizeof(X_tfidf))

Size of our sparse X_tfidf:  56


In [65]:
X_tfidf

<4786x101870 sparse matrix of type '<class 'numpy.float64'>'
	with 1196500 stored elements in Compressed Sparse Row format>

### Now Test our sparse, binary MinHash matrix against a bunch of models

In [70]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
import time

def stratified_cross_validate(model, X, y, cv):
    start = time.time()
    cv_results = cross_validate(model, X, y, cv=cv, scoring="accuracy", n_jobs=-1)
    elapsed_time = (time.time() - start) 
    print(cv_results)
    print(' ')
    print('Mean Accuracy: ', cv_results['test_score'].mean())
    print('Wall Time: ', elapsed_time)
    return

In [71]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=10, random_state=42)

models = [
    MultinomialNB(),
    SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None),
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=42),
    LinearSVC(random_state=42),
    LogisticRegression(random_state=42)   
]

model_names = ['MultinomialNB','SGDClassifier','RandomForestClassifier','LinearSVC','LogisticRegression']

for model, model_name in zip(models,model_names):
    print(model_name)
    print('--------------------------------')
    stratified_cross_validate(model,X_tfidf,y,cv)
    print(' ')

MultinomialNB
--------------------------------
{'fit_time': array([0.90298414, 0.90298414, 0.88735485, 0.88735485, 0.88735485,
       0.88735485, 0.90298223, 0.92529416, 0.4060266 , 0.39039683]), 'score_time': array([0.10936332, 0.10936332, 0.1093719 , 0.1093719 , 0.1093719 ,
       0.1093719 , 0.09374332, 0.09375405, 0.04685569, 0.06248546]), 'test_score': array([0.97455969, 0.97619048, 0.96341463, 0.97741273, 0.9707113 ,
       0.96835443, 0.98072805, 0.97830803, 0.98253275, 0.969163  ])}
 
Mean Accuracy:  0.9741375079790956
Wall Time:  3.730381488800049
 
SGDClassifier
--------------------------------
{'fit_time': array([4.08680749, 4.02349257, 4.00786662, 4.1019001 , 4.16441011,
       4.13315654, 4.15740633, 4.22555017, 1.4250977 , 1.40947151]), 'score_time': array([0.09222054, 0.09404564, 0.10967159, 0.10938478, 0.10378766,
       0.11939359, 0.10378766, 0.06252503, 0.04687715, 0.04686904]), 'test_score': array([0.99021526, 0.99404762, 0.99186992, 0.99794661, 0.99372385,
       0