# HOT Region Classification

In [15]:
# File names and parameters
# FASTA inputs: sequences read from input_hot are labelled 1, those from input_lot are labelled 0.
input_hot = 'hot_regions_05_any.fa'
input_lot = 'lot_regions_05_any.fa'
# Window width used when cutting sequences into k-mers; also reused as the
# word n-gram width handed to CountVectorizer.
kmer_length = 5
# Despite the name this is a fraction, not a percent: 0.05 keeps 5% of each class.
sampling_percentage = 0.05
# Shared random_state for DataFrame.sample, resample and train_test_split.
seed = 35

In [2]:
from Bio.SeqIO.FastaIO import SimpleFastaParser

# Reading both FASTA files into plain lists of sequence strings
def _read_fasta_sequences(path):
    """Return the sequence string of every record in the FASTA file at `path`, in file order."""
    with open(path) as handle:
        # SimpleFastaParser yields (title, sequence) pairs; the titles are not used.
        return [record_sequence for _record_title, record_sequence in SimpleFastaParser(handle)]

hot_sequences = _read_fasta_sequences(input_hot)
lot_sequences = _read_fasta_sequences(input_lot)

In [3]:
import pandas as pd

# One single-column frame per class, tagged with its class label (hot = 1, lot = 0)
df_hot = pd.DataFrame(hot_sequences, columns=['sequence']).assign(label=1)
df_lot = pd.DataFrame(lot_sequences, columns=['sequence']).assign(label=0)

# Stacking both classes and showing how many sequences each label has
df1 = pd.concat([df_hot, df_lot])
df1['label'].value_counts()

1    129356
0    123795
Name: label, dtype: int64

In [4]:
from sklearn.utils import resample

# Sub-sampling each class to `sampling_percentage` of its rows, then downsampling
# whichever class ended up larger so that both classes have the same number of rows.
hot_subset = df_hot.sample(n=int(df_hot.shape[0]*sampling_percentage), random_state=seed)
lot_subset = df_lot.sample(n=int(df_lot.shape[0]*sampling_percentage), random_state=seed)

# The majority class is picked by size instead of being assumed to be the hot set:
# resample(replace=False) raises if asked for more rows than the frame holds, which
# is what would happen with input files where the lot set is the larger one.
# On a tie the hot set is treated as the majority.
if hot_subset.shape[0] >= lot_subset.shape[0]:
    df_majority, df_minority = hot_subset, lot_subset
else:
    df_majority, df_minority = lot_subset, hot_subset

df_majority_downsampled = resample(df_majority, replace=False, n_samples=df_minority.shape[0], random_state=seed)

df_resample = pd.concat([df_majority_downsampled, df_minority])
df_resample.label.value_counts()

1    6189
0    6189
Name: label, dtype: int64

In [5]:
# Helper function for extracting kmers from a sequence
def kmers(seq, length):
    """Return every overlapping window of `length` characters in `seq`, lower-cased.

    Windows are taken left to right with a stride of one character, so for
    1 <= length <= len(seq) the result has len(seq) - length + 1 entries.
    A sequence shorter than `length` yields an empty list.
    """
    windows = []
    last_start = len(seq) - length
    for start in range(last_start + 1):
        windows.append(seq[start:start + length].lower())
    return windows

# Swapping the raw sequence column for its list of kmers and renumbering rows from 0
df_resample = (
    df_resample
    .assign(kmers=df_resample['sequence'].map(lambda raw_seq: kmers(raw_seq, kmer_length)))
    .drop(columns='sequence')
    .reset_index(drop=True)
)
df_resample.head()

Unnamed: 0,label,kmers
0,1,"[tcaaa, caaaa, aaaaa, aaaag, aaagc, aagcc, agc..."
1,1,"[cacta, actac, ctacc, taccc, accca, cccag, cca..."
2,1,"[gatct, atctc, tctca, ctcat, tcatg, catgt, atg..."
3,1,"[acaga, cagaa, agaaa, gaaac, aaacc, aaccc, acc..."
4,1,"[cggtg, ggtgt, gtgtc, tgtca, gtcat, tcatt, cat..."


In [6]:
# Converting each list of kmers into one space-separated "sentence" for CountVectorizer.
# A comprehension replaces the former index loop so that no loop variable is left
# behind in the notebook namespace: the old loop variable was called `x`, the same
# name that holds the feature matrix, so re-running this cell would overwrite it.
kmer_sentences = [' '.join(kmer_list) for kmer_list in df_resample['kmers']]

# Grabbing labels from dataframe by column name, so the result does not depend on column order
labels = df_resample['label'].values

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MaxAbsScaler

# Vectorizing the kmer sentences.
# ngram_range=(kmer_length, kmer_length) makes each feature a run of `kmer_length`
# consecutive k-mer tokens rather than a single k-mer.
# NOTE(review): confirm that word n-grams of k-mers are what is intended here.
c = CountVectorizer(ngram_range=(kmer_length,kmer_length))
x = c.fit_transform(kmer_sentences)

# Row-wise L1 normalisation, then per-feature scaling by the maximum absolute value.
x_normalized = normalize(x, norm='l1')
scaler = MaxAbsScaler()
x_scaled = scaler.fit_transform(x_normalized)

# Preview of the first five rows. Only those rows are converted to a dense array;
# calling .toarray() on the full sparse matrix would allocate rows x features floats
# just to display five of them.
pd.DataFrame(x_scaled[:5].toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,260655,260656,260657,260658,260659,260660,260661,260662,260663,260664
0,0.020915,0.0,0.0,0.104527,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_validate

# Linear-kernel SVM. `gamma` is a coefficient for the 'rbf', 'poly' and 'sigmoid'
# kernels only, so the gamma=0.05 that used to be passed here had no effect on a
# linear kernel and has been dropped. Re-add it if the kernel is ever changed.
clf = SVC(C=1.0, kernel='linear')

# Making train and test sets (for singly trained models only)
X_train, X_test, Y_train, Y_test = train_test_split(x_scaled, labels, test_size = 0.15, random_state=seed)

In [9]:
# # Singly trained SVM model (disabled: uncomment this cell to fit clf on X_train and score it on X_test)

# clf.fit(X_train, Y_train)
# Y_pred = clf.predict(X_test)
# print('Accuracy using SVM =', accuracy_score(Y_test, Y_pred))

# # Print Classification Report
# print(classification_report(Y_test, Y_pred))

# import matplotlib.pyplot as plt
# import seaborn as sns

# # Plot Confusion matrix heatmap
# cf = confusion_matrix(Y_test, Y_pred)
# ax = plt.subplot()
# sns.heatmap(cf, annot=True, fmt='d', ax=ax, linewidths=5, cmap="Blues", center=0)
# ax.set_xlabel('Predicted Labels')
# ax.set_ylabel('True Labels')
# ax.set_title('Confusion Matrix for SVM')
# ax.xaxis.set_ticklabels(['Negative', 'Positive'])
# ax.yaxis.set_ticklabels(['Negative', 'Positive'])
# plt.show()

In [10]:
# Cross Validated SVM Model
# 5-fold cross-validation of `clf` on the count matrix `x`, reporting the mean of each metric.
# NOTE(review): `x_scaled` is built earlier but `x` is what gets evaluated here - confirm that is intended.

scores = cross_validate(
    clf,
    x,
    labels,
    cv=5,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
)

# (format string, key in `scores`) for each line of the report
summary_rows = (
    ("\t%0.2f accuracy", 'test_accuracy'),
    ("\t%0.2f precision", 'test_precision'),
    ("\t%0.2f recall", 'test_recall'),
    ("\t%0.2f f1", 'test_f1'),
)
print("Cross validation produces an average of:")
for row_template, score_key in summary_rows:
    print(row_template % scores[score_key].mean())

Cross validation produces an average of:
	0.75 accuracy
	0.74 precision
	0.76 recall
	0.75 f1


In [11]:
from sklearn.ensemble import RandomForestClassifier

# Seeding the forest with the notebook-wide `seed` so that its randomised tree
# construction, and therefore the reported scores, are repeatable across runs.
clf = RandomForestClassifier(n_estimators = 300, random_state=seed)

In [12]:
# # Singly trained Random Forest model (disabled: uncomment this cell to fit clf on X_train and score it on X_test)
# clf.fit(X_train, Y_train)
# Y_pred = clf.predict(X_test)
# print('Accuracy using Random Forest =', accuracy_score(Y_test, Y_pred))

# # Print Classification Report
# print(classification_report(Y_test, Y_pred))

In [13]:
# Cross Validated Random Forest Model
# 5-fold cross-validation of `clf` on the count matrix `x`, reporting the mean of each metric.
# NOTE(review): `x_scaled` is built earlier but `x` is what gets evaluated here - confirm that is intended.

scores = cross_validate(
    clf,
    x,
    labels,
    cv=5,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
)

# (format string, key in `scores`) for each line of the report
summary_rows = (
    ("\t%0.2f accuracy", 'test_accuracy'),
    ("\t%0.2f precision", 'test_precision'),
    ("\t%0.2f recall", 'test_recall'),
    ("\t%0.2f f1", 'test_f1'),
)
print("Cross validation produces an average of:")
for row_template, score_key in summary_rows:
    print(row_template % scores[score_key].mean())

Cross validation produces an average of:
	0.69 accuracy
	0.81 precision
	0.50 recall
	0.62 f1
