<a href="https://colab.research.google.com/github/katarinagresova/DSIB01_2020/blob/main/Kmer_Logit_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [9]:
# Install Biopython, which supplies the `Bio.SeqIO` FASTA parser imported below.
!pip install biopython

Collecting biopython
[?25l  Downloading https://files.pythonhosted.org/packages/76/02/8b606c4aa92ff61b5eda71d23b499ab1de57d5e818be33f77b01a6f435a8/biopython-1.78-cp36-cp36m-manylinux1_x86_64.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 5.3MB/s 
Installing collected packages: biopython
Successfully installed biopython-1.78


In [32]:
import csv
import itertools
from Bio import SeqIO
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import metrics

Mount Google Drive to access the FASTA files from the lecture that are stored there.

In [2]:
# Colab-only step: expose Google Drive under /content/drive/ so the FASTA
# inputs referenced later in the notebook can be read from it.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# K-mer preparation

In [3]:
def get_all_possible_kmers(k, bases):
  """Enumerate every k-mer that can be built from the given alphabet.

  Args:
    k: Length of each k-mer.
    bases: Iterable of symbols (strings) to combine.

  Returns:
    A list of strings, one per element of the k-fold Cartesian product of
    `bases`, in the order yielded by `itertools.product` (the last position
    varies fastest).
  """
  kmers = []
  for combination in itertools.product(bases, repeat=k):
    kmers.append(''.join(combination))
  return kmers

In [4]:
def get_kmer_frequencies(seq, possible_kmers):
  """Count overlapping k-mers in `seq` and normalise by the sequence length.

  Args:
    seq: The sequence to scan (a `str`, or any object whose `str()` is the
      sequence text, such as the `record.seq` passed by `create_kmer_file`).
    possible_kmers: List of the k-mers to report. All entries are expected to
      have the same length; `k` is taken from the first entry.

  Returns:
    A list of floats, one per k-mer in `possible_kmers` (in that order),
    holding `count / len(seq)`. Note that the denominator is the sequence
    length, not the number of k-mer windows. Windows that are not in
    `possible_kmers` (for example ones containing an ambiguous base such as
    'N') are ignored rather than raising `KeyError`. An empty sequence yields
    all zeros, and an empty `possible_kmers` yields an empty list.
  """
  if not possible_kmers:
    # Nothing to report, and there is no first entry to take `k` from.
    return []

  k = len(possible_kmers[0])
  # Work on a plain string so slicing and dict lookups do not depend on how
  # the incoming sequence type implements hashing and equality.
  seq = str(seq)
  seq_len = len(seq)
  kmer_counts = {kmer : 0 for kmer in possible_kmers}

  for i in range(seq_len - k + 1):
    kmer = seq[i:i+k]
    # Skip windows outside the requested k-mer set instead of failing.
    if kmer in kmer_counts:
      kmer_counts[kmer] += 1

  if seq_len == 0:
    # Avoid dividing by zero; an empty sequence contains no k-mers.
    return [0.0] * len(kmer_counts)

  return [count / seq_len for count in kmer_counts.values()]

In [12]:
def create_kmer_file(positive_seqs_file, negative_seqs_file, out_file, k=3):
  """Write a labelled k-mer frequency table for two FASTA files as CSV.

  The header row is 'Categ' followed by every k-mer over G, C, A, T. Each
  FASTA record then becomes one row: the class label followed by the values
  returned by `get_kmer_frequencies`. Records from `positive_seqs_file` are
  written first with label '1', then records from `negative_seqs_file` with
  label '0'.

  Args:
    positive_seqs_file: Path to the FASTA file of positive examples.
    negative_seqs_file: Path to the FASTA file of negative examples.
    out_file: Path of the CSV file to create (overwritten if it exists).
    k: K-mer length; defaults to 3.
  """
  alphabet = ['G', 'C', 'A', 'T']
  kmer_columns = get_all_possible_kmers(k, alphabet)
  header = ['Categ'] + kmer_columns

  # Positives are emitted before negatives, matching the label order below.
  labelled_inputs = (
    ('1', positive_seqs_file),
    ('0', negative_seqs_file),
  )

  with open(out_file, 'w', newline='') as handle:
    csv_out = csv.writer(handle)
    csv_out.writerow(header)

    for label, fasta_path in labelled_inputs:
      for record in SeqIO.parse(fasta_path, "fasta"):
        row = [label] + get_kmer_frequencies(record.seq, kmer_columns)
        csv_out.writerow(row)

In [13]:
# Input FASTA files are read from the mounted Google Drive; the k-mer table
# is written to a relative path, i.e. the notebook's working directory.
positive_seqs_file = '/content/drive/My Drive/DSIB01/hw3/PUM2_10K_window-50_with_sequences.fa'
negative_seqs_file = '/content/drive/My Drive/DSIB01/hw3/neg_11_5K_window-50_with_sequences.fa'
out_file = 'kmers.csv'
# No `k` is passed, so the function's default of k=3 is used.
create_kmer_file(positive_seqs_file, negative_seqs_file, out_file)

In [25]:
def load_data(data_file):
  """Read a CSV table and split it into features and labels.

  Args:
    data_file: Path or file-like object accepted by `pandas.read_csv`. The
      first column holds the class label; all remaining columns are features.

  Returns:
    A tuple `(x, y)` where `x` is a DataFrame of every column except the
    first and `y` is a Series holding the first column.
  """
  table = pd.read_csv(data_file)
  features, labels = table.iloc[:, 1:], table.iloc[:, 0]
  return features, labels

In [None]:
# Read the k-mer table back from disk and hold out 20% of the rows as a test
# set. `random_state` is fixed so the split is the same on every run.
x, y = load_data(out_file)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Logistic regression model

In [28]:
# Fit a logistic-regression classifier with scikit-learn's default settings
# on the training split.
logisticRegr = LogisticRegression()
logisticRegr.fit(x_train, y_train)

# Predict labels for the held-out rows.
# NOTE: `predictions` is reassigned by the SVM cell further down, so the
# metrics cell that follows reports this model only if it runs before that one.
predictions = logisticRegr.predict(x_test)

In [31]:
# Report accuracy, precision and recall (in that order) of the current
# `predictions` against the held-out labels.
print(metrics.accuracy_score(y_test, predictions))
print(metrics.precision_score(y_test, predictions))
print(metrics.recall_score(y_test, predictions))

0.8397480755773268
0.84185303514377
0.802130898021309


# SVM

In [33]:
# Fit a linear support-vector classifier on the training split.
# The solver draws random numbers, so the seed is fixed to make the fitted
# model and the metrics below repeatable from run to run. The value matches
# the seed already used for the train/test split.
linearSvc = LinearSVC(random_state=42)
linearSvc.fit(x_train, y_train)

# Predict labels for the held-out rows. This reuses the `predictions` name
# from the logistic-regression section, replacing those earlier predictions.
predictions = linearSvc.predict(x_test)

In [34]:
# Report accuracy, precision and recall (in that order) of the current
# `predictions` against the held-out labels.
print(metrics.accuracy_score(y_test, predictions))
print(metrics.precision_score(y_test, predictions))
print(metrics.recall_score(y_test, predictions))

0.8514112432936786
0.8424024640657084
0.832572298325723
