# Final Model: Aggregated Probability
After trying several approaches in the sandbox notebooks, I settled on 
the following approach given a time constraint: The aggregated probabilities
of a nearest neighbors classifier on sequence permutations (TFIDF scores) and 
and a random forest classifier on the one-hot encoded portion of the training 
data. The following is a distillation of the best scoring process in the sandbox

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from itertools import permutations
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

### Preparing The Data
Before being able to fit any models, some feature engineering needs to be performed on the training observations, and the training labels need to be collapsed down to a usable form.

In [2]:
values = pd.read_csv("train_values.csv")
labels = pd.read_csv("train_labels.csv")

In [3]:
# create the main label series

lab_ids = labels.columns[1:]

# get numpy matrix of lab_id one-hot values
lab_matrix = labels.drop(columns=['sequence_id']).values

# get array of indices to map back to lab_ids
lab_col_indices = np.asarray(lab_matrix == 1).nonzero()[1]

labels['lab_id'] = lab_ids[lab_col_indices]
y = labels[['sequence_id', 'lab_id']]
y.head()

Unnamed: 0,sequence_id,lab_id
0,9ZIMC,RYUA3GVO
1,5SAQC,RYUA3GVO
2,E7QRO,RYUA3GVO
3,CT5FP,RYUA3GVO
4,7PTD8,RYUA3GVO


In [5]:
# verify that all labels are correct
def correct_label(row_n):
    return labels.iloc[row_n][labels['lab_id'].iloc[row_n]] == 1

assert all(list(map(correct_label, range(labels.shape[0]))))

In order to effectively use RandomizedSearchCV, some of the observations where classes had very few members needed to be 
oversampled

In [6]:
# these are the classes with less than 5 members
# need all to have at least 5 for randomized search cv
print(y['lab_id'].value_counts().tail(19))

# create resampling map
resample_map = y['lab_id'].value_counts().tail(19)
resample_map = list(zip(resample_map.index, resample_map.values))

# temporarily add 'lab_id' back to
over_samp = pd.concat([values, y['lab_id']], axis=1)

W2DYAZID    4
03GRNN7N    4
RZPGGEG4    4
68OY1RK5    4
VDSDXJ71    4
1KZHNVYR    4
UMOD7PGG    4
PXT3AJ7C    4
8N5EPD5C    4
WM3Q8LBC    4
INDCDVP0    4
XCWSW5T9    4
YCD71LRY    4
LGTP4O86    4
G2P73NZ0    3
58BSUZQB    3
WB78G3XF    2
0L3Y6ZB2    1
ON9AXMKF    1
Name: lab_id, dtype: int64


In [7]:
def oversample(df, resamp_map, min_samps=5):
    """
    Randomly oversample rows that belong to classes with less than N
    members
    
    df: pandas DataFrame object
    resamp_map: reference values for classes and resample amount
    """
    new_df = df.copy()
    for class_, members in resamp_map:
        for i in range(min_samps - members):
            new_row = new_df.loc[new_df['lab_id'] == class_].sample(n=1)
            new_df = new_df.append(new_row)
    
    return new_df

Now each class has a minimum of 5 members

In [8]:
over_samp = oversample(over_samp, resample_map)
over_samp['lab_id'].value_counts().tail(20)

A4BM0B6A    5
4RHLX089    5
5Z4CMIY5    5
V4A28VLV    5
QNKGHIRB    5
YCD71LRY    5
MH0GC0GY    5
LUHRMKEB    5
2L336TQL    5
58BSUZQB    5
SBQXQOPV    5
RZPGGEG4    5
1KZHNVYR    5
TU2W2LCB    5
L905DK46    5
UYLJZRPN    5
DJW5U56I    5
WM3Q8LBC    5
0L3Y6ZB2    5
W2DYAZID    5
Name: lab_id, dtype: int64

Now that the data is oversampled for use with RandomizedSearchCV, a function can be created that will take train and test data, and output the appropriate data set for each model.

In [9]:
# first, ngrams for sequences must be made
N_GRAMS = 4

# create the 'vocabulary' for the different nucleotides
n_tides = set(''.join(over_samp['sequence'].values))

# create a list of subsequences for features
subseqs = list(''.join(p) for p in permutations(n_tides, r=N_GRAMS))
subseqs[:5]

['CNGT', 'CNGA', 'CNTG', 'CNTA', 'CNAG']

In [20]:
def separate(df, ss, transform=False, normalizer=None):
    """
    Splits train or testing data into one-hot
    encoded data and tokenized sequences
    
    df: The pandas dataframe to be split
    
    ss: The subsequences to tokenize from
    
    transform: whether to transform 
               tokens as opposed to fitting
               
    score_func: function to apply to token counts
    
    Returns:
    
    one_hot: pandas DataFrame object with one-hot columns
    
    sub_sequences: pandas DataFrame object with subsequences vectorized
    """
    sub_sequences = pd.DataFrame(index=df.index)
    
    for seq in ss:
        sub_sequences[seq] = df["sequence"].str.count(seq)
    
    one_hot = df.drop(columns=["sequence", "sequence_id"])
    
    if not transform:
        # call fit_transform on normalizer
        sub_sequences = normalizer.fit_transform(sub_sequences)
    else:
        sub_sequences = normalizer.transform(sub_sequences)
        
    return one_hot, sub_sequences
        

In [23]:
# create transformer and label encoder

tfidf = TfidfTransformer()
le = LabelEncoder()

# set y first and encode
y = over_samp['lab_id']
y = le.fit_transform(y)

over_samp = over_samp.drop(columns=['lab_id'])

# create one-hot X and sequence X
Xoh, Xsq = separate(over_samp, subseqs, normalizer=tfidf)

print(Xoh.shape)
print(Xsq.shape)
print(y.shape)

(63046, 39)
(63046, 120)
(63046,)


### Fitting Models
The most successful models were produced by a cross-validated randomized search through a nearest neighbors classifier and a 
random forest classifier. These are two separate searches.

In [25]:
# contruct rfc with random search
rfc = RandomForestClassifier(random_state=0)

params = {"n_estimators": range(50, 150, 25),
          "criterion": ["gini", "entropy"],
          "max_depth": range(5, 20, 5),
          "min_samples_leaf": range(1, 10),
          "max_features": ["sqrt", "log2"]}

rs = RandomizedSearchCV(rfc, params, random_state=0, verbose=1, n_jobs=-1, n_iter=5)
rfc_search = rs.fit(Xoh, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  4.0min finished


In [27]:
# construct the neighbors model
nn_classifier = KNeighborsClassifier(algorithm="brute", metric="cosine", leaf_size=40)

params = {"n_neighbors": range(1,40),
          "weights": ["uniform", "distance"],
          "leaf_size": range(20, 50, 10)}

rs_nn = RandomizedSearchCV(nn_classifier, params, random_state=0, 
                         verbose=1, n_jobs=-1, n_iter=5)
nn_search = rs_nn.fit(Xsq, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed: 14.6min finished


In [31]:
# best estimators
print(rfc_search.best_estimator_)
print("=" * 50)
print(nn_search.best_estimator_)

RandomForestClassifier(max_depth=15, max_features='sqrt', min_samples_leaf=3,
                       random_state=0)
KNeighborsClassifier(algorithm='brute', metric='cosine', n_neighbors=12,
                     weights='distance')


### Creating Submission Data
At this point, the new models need to be applied to the 
test data. This will allow the creation of an aggregated 
probability submission.

In [32]:
# format testing data
test = pd.read_csv("test_values.csv")

# split data
Xoh_test, Xsq_test = separate(test, subseqs, 
                              transform=True, normalizer=tfidf)
print(Xoh_test.shape)
print(Xsq_test.shape)

(18816, 39)
(18816, 120)


In [39]:
# predict probabilities for each test set
rfc_probs = rfc_search.best_estimator_.predict_proba(Xoh_test)
nn_probs = nn_search.best_estimator_.predict_proba(Xsq_test)

In [40]:
# create aggregated probabilities
agg_probs = (rfc_probs + nn_probs) / 2

In [42]:
# prepare submission
sub_format = pd.read_csv("submission_format_3TFRxH6.csv", index_col='sequence_id')

assert sub_format.shape == agg_probs.shape
assert (le.classes_ == sub_format.columns).all()

In [43]:
agg_submission = pd.DataFrame(data=agg_probs,
                              columns=le.classes_,
                              index=sub_format.index)
agg_submission.to_csv("aggregated_submission_final.csv")