# Make a reasonable train test split
We want to hold out certain annotations from the test set to see how the model would generalize to unannotated miRNAs

In [1]:
import pandas as pd
import numpy
import re
import joblib
from sklearn.metrics import classification_report, confusion_matrix

I think just chromosomes 1, 8, and 14 would be good. That way, you don't have to annotate negative examples, and you don't have to worry about paralogs. 

In [3]:
df = pd.read_csv('SVM_training/all_tissues_mapped_dataset_15_refined_miRGeneDB_total_features_updated.csv')

In [4]:
df['chrom'] = [re.search('(?<=chr)[^:]*', cn).group(0) for cn in df['clusterName']]

In [5]:
chrom_list = ['2', '5', '7', '19']

In [6]:
df[df['chrom'].isin(chrom_list)]

Unnamed: 0,realMicRNA,realMicRNAName,clusterName,seqCount,readCountSum,exactMatchRatio,headUnstableLength,tailUnstableLength,head_minus3_templateNucleotide,head_minus3_TemplateNucleotide_percentage,...,hairpin_count,binding_count,interiorLoopCount,apicalLoop_size,stem_length,mFE,count_bindings_in_miRNA,UGU_UGUG_motif,pair_state,chrom
96,1,hsa-miR-199a-3p,mapped_mirna_ERR038410:miRCluster_471_24:chr19...,558,52782,0.728999,2,4,T,0.0,...,1,33,7,3,33,-42.4,18,No,Yes,19
97,1,hsa-miR-199a-5p,mapped_mirna_ERR038410:miRCluster_472_23:chr19...,20,150,0.766667,0,1,C,0.0,...,1,33,7,3,33,-42.4,18,No,Yes,19
98,1,hsa-miR-24-3p,mapped_mirna_ERR038410:miRCluster_474_23:chr19...,152,4972,0.207361,1,3,C,0.0,...,1,30,7,6,30,-38.1,17,Yes,No,19
99,1,hsa-miR-27a-3p,mapped_mirna_ERR038410:miRCluster_476_21:chr19...,107,1341,0.379567,0,6,G,0.0,...,1,32,5,11,32,-47.9,18,No,No,19
100,1,hsa-miR-23a-3p,mapped_mirna_ERR038410:miRCluster_477_24:chr19...,189,3446,0.489553,1,7,C,0.0,...,1,29,6,10,29,-35.7,15,Yes,No,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24058,-1,Null,mapped_nonMirna_SRR944034:miRCluster_4012_23:c...,3,12,0.166667,0,7,A,0.0,...,2,25,4,4,25,-18.2,11,No,No,7
24059,-1,Null,mapped_nonMirna_SRR944034:miRCluster_4231_22:c...,4,61,1.000000,1,0,G,0.0,...,3,25,2,11,25,-32.7,18,Yes,No,7
24060,-1,Null,mapped_nonMirna_SRR944034:miRCluster_4246_19:c...,3,20,0.100000,3,0,G,1.0,...,1,34,9,6,34,-52.9,12,No,No,7
24061,-1,Null,mapped_nonMirna_SRR944034:miRCluster_4253_18:c...,18,276,0.210145,0,4,C,0.0,...,1,11,2,4,11,-5.6,11,No,No,7


In [7]:
test = df[df['chrom'].isin(chrom_list)].drop('chrom', axis=1)
train = df[~df['chrom'].isin(chrom_list)].drop('chrom', axis=1)

Francois says: don't mix seen and unseen annotations in the test set. Just do unseen annotations in the test set.

In [78]:
test.to_csv('SVM_training/hold_out_2_5_7_19_test.csv', index=None)

In [79]:
train.to_csv('SVM_training/hold_out_2_5_7_19_train.csv', index=None)

# Load the model

In [2]:
sc, clf, selectFeatureNameList = joblib.load('SVM_training/hold_out_2_5_7_19.pkl')

# Preprocess the data as it's preprocessed in the SVM training script (model_building.py)

In [8]:
test.drop(test.columns[[1, 2]], axis=1, inplace=True)
test = pd.get_dummies(test)

In [9]:
featureListSelected = []                                                                                                                                                                                                                                              
with open('SVM_training/selected_features.txt', 'r') as inf:                                                                                                                                                                                                                                        
    line = inf.readline()                                                                                                                                                                                                                                             
    line = inf.readline()                                                                                                                                                                                                                                             
    while line != '':                                                                                                                                                                                                                                                 
        featureListSelected.append((float(line.strip().split('\t')[1]), int(                                                                                                                                                                                          
            line.strip().split('\t')[2]), line.strip().split('\t')[0]))                                                                                                                                                                                               
        line = inf.readline()

In [10]:
totalfeatureList = test.columns.values.tolist()                                                                                                                                                                                                                       
subIndexList = [totalfeatureList.index(                                                                                                                                                                                                                               
    item[2]) for item in featureListSelected]                                                                                                                                                                                                                         
X_test = test.iloc[:, subIndexList].values                                                                                                                                                                                                                            
y_test = test.iloc[:, 0].values                                                                                                                                                                                                                                       
X_test_std = sc.transform(X_test)

In [11]:
hold_out_predictions = clf.predict(X_test_std)

In [12]:
print(classification_report(y_test, hold_out_predictions))

              precision    recall  f1-score   support

          -1       0.98      0.99      0.98      2655
           1       0.98      0.98      0.98      2310

    accuracy                           0.98      4965
   macro avg       0.98      0.98      0.98      4965
weighted avg       0.98      0.98      0.98      4965



In [31]:
set([re.search('chr[0-9]*', name).group(0) for name in train['clusterName']])

{'chr',
 'chr10',
 'chr11',
 'chr12',
 'chr13',
 'chr15',
 'chr16',
 'chr17',
 'chr18',
 'chr19',
 'chr2',
 'chr20',
 'chr21',
 'chr22',
 'chr3',
 'chr4',
 'chr5',
 'chr6',
 'chr7',
 'chr9'}

In [32]:
selectFeatureNameList

[(0.37, 58, 'count_bindings_in_miRNA'),
 (0.168, 2, 'exactMatchRatio'),
 (0.146, 101, 'pair_state_No'),
 (0.146, 57, 'mFE'),
 (0.12, 5, 'head_minus3_TemplateNucleotide_percentage'),
 (0.112, 52, 'hairpin_count'),
 (0.109, 56, 'stem_length'),
 (0.095, 50, 'distanceToloop'),
 (0.079, 51, 'percentage_PairedInMiRNA'),
 (0.079, 3, 'headUnstableLength'),
 (0.078, 102, 'pair_state_Yes'),
 (0.064, 26, 'tail_plus2_A_percentage'),
 (0.064, 10, 'head_minus2_TemplateNucleotide_percentage'),
 (0.06, 53, 'binding_count'),
 (0.059, 21, 'tail_plus1_A_percentage'),
 (0.038, 97, 'armType_loop'),
 (0.038, 31, 'tail_plus3_A_percentage'),
 (0.037, 40, 'tail_plus5_TemplateNucleotide_percentage'),
 (0.034, 20, 'tail_plus1_TemplateNucleotide_percentage'),
 (0.033, 54, 'interiorLoopCount'),
 (0.033, 15, 'head_minus1_TemplateNucleotide_percentage')]