In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
from collections import Counter
from itertools import chain
import os

%matplotlib inline

In [None]:
from dtu_denovo_sequencing.utils.dataset import load_all

# Dataset V1

### Loading dataset:

In [None]:
# Location of the V1 dataset, given as a path relative to the current working directory.
path = '../data/denovo_dataset_v1/'
# load_all is a project helper (dtu_denovo_sequencing.utils.dataset); the cells below
# use its result like a pandas DataFrame — presumably that is what it returns, TODO confirm.
df = load_all(path)
# Cell output: the shape of the loaded data.
df.shape

In [None]:
# Preview the first three rows to see which columns are available.
df.head(3)

## Dataset distributions
Sequence length and spectrum length (number of mass values per spectrum)

In [None]:
# Side-by-side histograms of the per-row lengths of two columns.
fig, ax = plt.subplots(ncols=2, figsize=(13, 4))

seq_len = df['Sequence'].map(len)
spec_len = df['Mass values'].map(len)

# (axis, lengths, exclusive upper limit for the bin edges, relative bar width, x label)
# With edges at k - 0.4 and these limits, sequence lengths >= 39 and spectrum
# lengths >= 799 fall outside the last bin and are not drawn.
panels = (
    (ax[0], seq_len, 40, 0.1, 'Sequence length'),
    (ax[1], spec_len, 800, 1, 'Spectrum length'),
)
for axis, lengths, upper, bar_width, x_label in panels:
    # Unit-wide bins shifted by -0.4 so each integer length lands in its own bin.
    edges = np.arange(lengths.min(), upper) - 0.4
    axis.hist(lengths, bins=edges, rwidth=bar_width)
    axis.set(xlabel=x_label, ylabel='Count')

plt.show()

Number of occurrences per sequence

In [None]:
# Number of rows in which each distinct sequence appears.
unique_counts = df['Sequence'].value_counts()

# One cut-off shared by the plot and the summary line below. Previously the bin
# edges stopped at 28.6 while the summary counted "> 30", so sequences occurring
# 29 or 30 times showed up in neither the histogram nor the summary.
max_shown = 30

# Edges at k - 0.4 for k = 1 .. max_shown + 1, i.e. one unit-wide bin for every
# occurrence count from 1 up to and including max_shown.
occurrence_edges = np.arange(1, max_shown + 2, 1) - 0.4
plt.hist(unique_counts, bins=occurrence_edges, rwidth=0.09)
plt.xlabel('Sequence occurrences')
plt.ylabel('Count')
plt.show()

# Cell output: how many sequences lie beyond the plotted range.
f"{(unique_counts > max_shown).sum()} out of {unique_counts.shape[0]} unique sequences have more than {max_shown} occurrences"

m/z and retention time distributions

In [None]:
def _bin_edges(values, step):
    """Return equally spaced bin edges of width ``step`` that cover every value.

    The first edge is ``floor(values.min())`` and the last edge is at or above
    ``values.max()``, so no observation falls outside the histogram range.
    At least one bin (two edges) is always returned.
    """
    lowest = np.floor(values.min())
    n_bins = max(1, int(np.ceil((values.max() - lowest) / step)))
    return lowest + step * np.arange(n_bins + 1)


fig, ax = plt.subplots(ncols=2, figsize=(13, 4))

# The previous edges, np.arange(min, max, step) - 0.4, always ended below the
# column maximum, so the largest values were silently left out of the plot.
# The -0.4 shift only makes sense for integer-valued data and is dropped here.
mz = df['m/z']
ax[0].hist(mz, bins=_bin_edges(mz, 5), rwidth=1)
ax[0].set_xlabel('m/z')
ax[0].set_ylabel('Count')

rt = df['Retention time']
ax[1].hist(rt, bins=_bin_edges(rt, 0.25), rwidth=1)
ax[1].set_xlabel('Retention time')
ax[1].set_ylabel('Count')

plt.show()

Amino acid frequency

In [None]:
# Tally every character across all entries of the 'Sequence' column.
unique_aa = Counter(chain.from_iterable(df['Sequence']))

aa_labels = list(unique_aa.keys())
aa_totals = list(unique_aa.values())

# One viridis colour per bar, sampled at evenly spaced positions in [0, 1).
palette = plt.colormaps.get('viridis')
bar_colors = palette(np.arange(len(unique_aa)) / len(unique_aa))

# Bars appear in the order in which each character was first encountered.
plt.bar(aa_labels, aa_totals, color=bar_colors)
plt.title('Amino acid frequency')
plt.ylabel('Count')
plt.show()

## Amino acid oxidation modifications

In [None]:
def _n_oxidations(modified_sequence):
    """Count the literal '(ox)' tags in one modified-sequence string."""
    return modified_sequence.count('(ox)')


# Per-row number of '(ox)' tags; the cell output is the total over all rows.
ox_count = df['Modified sequence'].map(_n_oxidations)
f"Total oxidation count: {ox_count.sum()}"

In [None]:
# Integer-centred bins [k - 0.5, k + 0.5] for k = 0 .. max. Covering the largest
# count needs an edge at max + 0.5, hence max + 2 edge values. With only max + 1
# edges the last edge was max - 0.5, so the rows with the highest oxidation count
# were outside the histogram range and dropped (and a maximum of 0 produced a
# single edge, which is not a valid bin specification).
ox_edges = np.arange(0, ox_count.max() + 2, 1) - 0.5
plt.hist(ox_count, bins=ox_edges, rwidth=0.5)
plt.title('Oxidation occurrences per sample')
plt.xlabel('Oxidations per sample')
plt.ylabel('Count')
plt.show()

In [None]:
# Match the literal tag "(ox)". Unescaped, the pattern '(ox)' is a capture group
# that matches any bare "ox" substring, which could disagree with the literal
# '(ox)' count used for ox_count.
ox_tag = re.compile(re.escape('(ox)'))

# Per row: array with the index of the 'o' of every "(ox)" tag. The + 1 steps
# over the opening parenthesis and keeps the positions identical to what the
# unescaped pattern returned for real tags; the loop that collects the preceding
# residues relies on this when it indexes with pos - 2.
ox_pos = df['Modified sequence'].map(
    lambda x: np.array([m.start() + 1 for m in ox_tag.finditer(x)])
)

In [None]:
# Collect, over all rows, the character that directly precedes each oxidation tag.
all_prepends = []
for tag_positions, modified_seq in zip(ox_pos, df['Modified sequence']):
    # Rows without any tag contribute nothing (their position array is empty).
    if len(tag_positions) > 0:
        residues = np.array(list(modified_seq))
        # Each position is the index of the 'o' in "(ox)", so two steps back is
        # the character just before the opening parenthesis.
        # NOTE(review): a tag within the first two characters would give a
        # negative index and wrap around to the end of the string.
        all_prepends.extend(residues[tag_positions - 2])

In [None]:
# Distinct preceding characters and how often each occurs, as a (values, counts) pair.
np.unique(all_prepends, return_counts=True)

M is the only residue that directly precedes an `(ox)` tag, i.e. every oxidation in this dataset is on a methionine.

# Dataset V2

### Loading dataset:

**TODO:** load and analyse dataset V2.