# SVM for miRNA classification

In [178]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import numpy as np

In [145]:
# Load the tab-separated feature table into a DataFrame.
feature_table_path = 'feature_df_no_other.tsv'
df = pd.read_csv(feature_table_path, delimiter='\t')

In [146]:
# Distinct labels present in the target column (displayed as the cell's value).
{label for label in df['rna_type']}

{'lncRNA', 'miRNA', 'miscRNA', 'rRNA', 'snoRNA'}

In [15]:
# List every column that holds fewer than five distinct values, together
# with those values, to spot constant or near-constant features.
for column_name, column_values in df.items():
    distinct_values = set(column_values)
    if len(distinct_values) < 5:
        print(column_name, distinct_values)

realMicRNA {'Null'}
realMicRNAName {'Null'}
head_minus3_templateNucleotide {'G', 'T', 'C', 'A'}
head_minus3_A_percentage {0.0, 1.0, 0.8, 0.10526315789473684}
head_minus3_T_percentage {0.0, 1.0}
head_minus3_C_percentage {0.0, 1.0}
head_minus3_G_percentage {0.0, 0.8947368421052632, 0.2, 1.0}
head_minus2_templateNucleotide {'G', 'T', 'A', 'C'}
head_minus2_A_percentage {0.0, 1.0}
head_minus2_G_percentage {0.0, 0.8125, 1.0}
head_minus1_templateNucleotide {'G', 'T', 'A', 'C'}
head_minus1_C_percentage {0.0, 1.0, 0.5}
head_minus1_G_percentage {0.0, 1.0, 0.5, 0.25}
tail_plus1_templateNucleotide {'G', 'T', 'A', 'C'}
tail_plus2_templateNucleotide {'G', 'T', 'C', 'A'}
tail_plus3_templateNucleotide {'G', 'T', 'A', 'C'}
tail_plus4_templateNucleotide {'G', 'T', 'A', 'C'}
tail_plus4_A_percentage {0.0, 1.0, 0.35714285714285715}
tail_plus4_C_percentage {0.0, 1.0, 0.67, 0.6428571428571429}
tail_plus4_G_percentage {0.0, 1.0, 0.6666666666666666, 0.7037037037037037}
tail_plus5_templateNucleotide {'G', 'T', 

Remove junk columns: the cluster name (`clusterName`), any column whose name contains `template`, and any column that has only one unique value.

In [147]:
# Keep a column only if it is not the cluster name, is not one of the
# 'template' columns, and takes more than one distinct value.
kept_columns = [
    name
    for name in df.columns
    if name != 'clusterName'
    and 'template' not in name
    and len(set(df[name])) > 1
]
df_clean_columns = df[kept_columns]

In [148]:
# Rebind `df` to the filtered frame; this is the same object as
# df_clean_columns, not a copy.
df = df_clean_columns

In [149]:
# Select the columns stored with object dtype (this includes 'rna_type').
object_columns = [name for name, dtype in df.dtypes.items() if dtype == 'O']
categorical_df = df[object_columns]

In [150]:
# Names of the object-dtype predictor columns: everything in
# categorical_df except the 'rna_type' target.
categorical_vars = categorical_df.drop(columns='rna_type').columns

In [151]:
# One-hot encode the categorical predictors and attach the indicator columns
# to the remaining (non-categorical) columns, matching rows by index.
#
# The frame is rebuilt from df_clean_columns rather than from `df` itself:
# the previous form (`df = df.drop(...)`) raised a KeyError when the cell was
# run a second time, because the categorical columns had already been dropped.
dummy_columns = pd.get_dummies(df_clean_columns[categorical_vars])
df = df_clean_columns.drop(categorical_vars, axis=1).merge(
    dummy_columns, left_index=True, right_index=True
)

Hold out a test set, then cross-validate on the training set as the authors did in their paper

In [157]:
# Feature matrix: every column except the 'rna_type' target.
df_feat = df.drop(columns=['rna_type'])

In [158]:
# Target vector: the 'rna_type' label of each row.
df_target = df.loc[:, 'rna_type']

In [3]:
import numpy as np

In [5]:
# Small 2x2 integer array [[1, 2], [3, 4]] used as a scratch value.
test = np.arange(1, 5).reshape(2, 2)

In [7]:
# Write the scratch array to disk under the name 'testing' in NumPy's binary format.
np.save(file='testing', arr=test)

In [159]:
# Flatten the target into a 1-D array, then hold out 20% of the rows for
# testing; the fixed random_state keeps the split reproducible.
target_labels = np.ravel(df_target)
X_train, X_test, y_train, y_test = train_test_split(
    df_feat,
    target_labels,
    test_size=0.20,
    random_state=101,
)

In [160]:
# Baseline: an SVC with default hyper-parameters fitted on the training split.
# fit() returns the estimator itself, so the chained call binds the fitted model.
model = svm.SVC().fit(X_train, y_train)
model

SVC()

In [174]:
# Candidate values shared by C and gamma: eight powers of ten, 1e-4 through 1e3.
param_range = np.logspace(-4, 3, num=8)
param_grid = dict(C=param_range, gamma=param_range, kernel=['rbf'])

# Exhaustive search over the grid; refit=True retrains the best setting so
# that `grid` can be used directly for prediction afterwards.
# NOTE(review): no feature scaling is visible before this point — confirm
# whether that is intended for an RBF-kernel SVM.
grid = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid, refit=True, verbose=3)

# Run the search; the fitted GridSearchCV is the cell's displayed value.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV 1/5] END .............C=0.0001, gamma=0.0001, kernel=rbf; total time=   0.8s
[CV 2/5] END .............C=0.0001, gamma=0.0001, kernel=rbf; total time=   0.8s
[CV 3/5] END .............C=0.0001, gamma=0.0001, kernel=rbf; total time=   0.7s
[CV 4/5] END .............C=0.0001, gamma=0.0001, kernel=rbf; total time=   0.8s
[CV 5/5] END .............C=0.0001, gamma=0.0001, kernel=rbf; total time=   0.7s
[CV 1/5] END ..............C=0.0001, gamma=0.001, kernel=rbf; total time=   0.7s
[CV 2/5] END ..............C=0.0001, gamma=0.001, kernel=rbf; total time=   0.7s
[CV 3/5] END ..............C=0.0001, gamma=0.001, kernel=rbf; total time=   0.7s
[CV 4/5] END ..............C=0.0001, gamma=0.001, kernel=rbf; total time=   0.7s
[CV 5/5] END ..............C=0.0001, gamma=0.001, kernel=rbf; total time=   0.7s
[CV 1/5] END ...............C=0.0001, gamma=0.01, kernel=rbf; total time=   0.7s
[CV 2/5] END ...............C=0.0001, gamma=0.0

GridSearchCV(estimator=SVC(),
             param_grid={'C': array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'gamma': array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'kernel': ['rbf']},
             verbose=3)

In [175]:
# Show the winning hyper-parameter combination, then the estimator refitted
# with it, one per line.
print(grid.best_params_, grid.best_estimator_, sep='\n')

{'C': 1.0, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(gamma=0.0001)


In [176]:
# Predict the held-out rows with the refitted best estimator from the search.
grid_predictions = grid.best_estimator_.predict(X_test)

In [179]:
# Per-class precision / recall / F1 on the held-out split.
# zero_division=0 reports an explicit 0 for classes that are never predicted,
# instead of emitting an undefined-metric warning for each of them.
print(classification_report(y_test, grid_predictions, zero_division=0))

              precision    recall  f1-score   support

      lncRNA       0.41      0.31      0.35       358
       miRNA       0.59      0.88      0.70       441
     miscRNA       0.00      0.00      0.00         7
        rRNA       0.00      0.00      0.00         3
      snoRNA       0.00      0.00      0.00       115

    accuracy                           0.54       924
   macro avg       0.20      0.24      0.21       924
weighted avg       0.44      0.54      0.47       924



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Confusion matrix for the held-out split. The previous call passed no
# arguments and raised a TypeError; confusion_matrix needs the true and the
# predicted labels.
# Rows are true classes, columns are predicted classes, both in sorted order.
class_labels = np.union1d(y_test, grid_predictions)
print(
    pd.DataFrame(
        confusion_matrix(y_test, grid_predictions, labels=class_labels),
        index=class_labels,
        columns=class_labels,
    )
)