This notebook includes:
- Mapping of UniProt IDs to Entrez Gene IDs of EV proteins (of combined EV database dataset)
- Merging of EV proteins with feature dataset to create Exosome/Microvesicle annotation
- Removal of low count proteins (<3)
- Generation of FASTA files for STREME
- Addition of de novo discovered motifs as features

Output dataset:
- training_data_exo.csv
- training_data_mv.csv

# Import libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from os import listdir
from os.path import isfile, join

# Define paths

In [3]:
# Account name used to locate the local repository checkout (alternative: 'kathi')
user = 'DeaGogishvili'

# Root folder of the project's data files; later cells append sub-paths to this string
Data_path = f'/Users/{user}/Documents/GitHub/ExtracellularVesicles/Data'

# Import datasets

### Mapped exosome & microvesicle datasets

These mapped datasets were produced in the "01_DataCuration" Jupyter notebook.

In [4]:
# Mapped EV tables (Exocarta + Vesiclepedia), read from the Curated folder
exo_mapped = pd.read_csv(
    Data_path + '/Curated/VP_exosomes_mapped.csv',
    sep=',',
    engine='python',
)

mv_mapped = pd.read_csv(
    Data_path + '/Curated/VP_microvesicles_mapped.csv',
    sep=',',
    engine='python',
)

### Feature dataset

In [5]:
# Read the feature table that the annotation cells below build on
features_csv = Data_path + '/Curated/features_human_proteome.csv'
df_features = pd.read_csv(features_csv, engine='python', sep=',')

# Create feature data sets

### Exosome/Microvesicle annotation

In [6]:
# Values of the "Uniprot" column of each mapped EV table, as plain Python lists
exo_list = exo_mapped["Uniprot"].tolist()
mv_list = mv_mapped["Uniprot"].tolist()

In [7]:
# Work on independent copies so that df_features itself is not modified
features_exo = df_features.copy()
features_mv = df_features.copy()

# Flag (1/0) every protein whose id occurs in the exosome / microvesicle list
in_exo = features_exo['id'].isin(exo_list)
in_mv = features_mv['id'].isin(mv_list)
features_exo['Exosome'] = np.where(in_exo, 1, 0)
features_mv['Microvesicle'] = np.where(in_mv, 1, 0)

# Report the class balance of both annotated tables
class_balance = [
    ("Number of proteins:", len(df_features)),
    ("Number of exosome associated proteins:", (features_exo["Exosome"] == 1).sum()),
    ("Number of non-exosome associated proteins:", (features_exo["Exosome"] == 0).sum()),
    ("Number of microvesicle associated proteins:", (features_mv["Microvesicle"] == 1).sum()),
    ("Number of non-microvesicle associated proteins:", (features_mv["Microvesicle"] == 0).sum()),
]
for message, count in class_balance:
    print(message, count)

Number of proteins: 16790
Number of exosome associated proteins: 8775
Number of non-exosome associated proteins: 8015
Number of microvesicle associated proteins: 4738
Number of non-microvesicle associated proteins: 12052


### Removal of low count proteins from data sets

In [8]:
# Threshold for the low-count filter: proteins whose OCCURRENCES value is at most
# this number (inclusive) are selected for removal in the following cells
low_count = 2

In [9]:
# Rows of each mapped table whose OCCURRENCES value does not exceed low_count
exo_is_low = exo_mapped['OCCURRENCES'] <= low_count
mv_is_low = mv_mapped['OCCURRENCES'] <= low_count

# Independent copies, so the subsets are detached from the mapped tables
exo_low_count = exo_mapped.loc[exo_is_low].copy()
mv_low_count = mv_mapped.loc[mv_is_low].copy()

In [10]:
# Filter out low-count proteins.
# keep_* is True for every row whose id is NOT in the low-count subset of the
# respective EV table; the mask is negated with ~ (element-wise NOT) instead of
# being compared against False.
keep_exo = ~features_exo['id'].isin(exo_low_count["Uniprot"])
# .copy() gives an independent frame, as is done for the low-count subsets, so
# later column assignments do not act on a slice of features_exo
df_features_exo = features_exo[keep_exo].copy()

keep_mv = ~features_mv['id'].isin(mv_low_count["Uniprot"])
df_features_mv = features_mv[keep_mv].copy()

In [11]:
# Summarise what the low-count filter removed and what remains per EV type
exo_flags = df_features_exo["Exosome"]
mv_flags = df_features_mv["Microvesicle"]

print("Number of low count proteins in exosomes:", (~keep_exo).sum(), "(REMOVED)")
print("Number of exosome associated proteins:", (exo_flags == 1).sum())
print("Number of non-exosome associated proteins:", (exo_flags == 0).sum())
print("Number of total proteins:", len(df_features_exo))
print("---------------")
print("Number of low count proteins in microvesicles:", (~keep_mv).sum(), "(REMOVED)")
print("Number of microvesicle associated proteins:", (mv_flags == 1).sum())
print("Number of non-microvesicle associated proteins:", (mv_flags == 0).sum())
print("Number of total proteins:", len(df_features_mv))

Number of low count proteins in exosomes: 3994 (REMOVED)
Number of exosome associated proteins: 4781
Number of non-exosome associated proteins: 8015
Number of total proteins: 12796
---------------
Number of low count proteins in microvesicles: 1745 (REMOVED)
Number of microvesicle associated proteins: 2993
Number of non-microvesicle associated proteins: 12052
Number of total proteins: 15045


# Save final feature data sets

In [12]:
# The sequence column is not written to the training data.
# errors='ignore' keeps this cell re-runnable: both names are overwritten with
# the reduced frame, so without it a second execution raises KeyError because
# 'fasta_sequence' is already gone.
# NOTE(review): the commented-out STREME cells further down read
# df_features_mv['fasta_sequence'], which no longer exists after this cell —
# confirm the intended execution order.
df_features_exo = df_features_exo.drop(columns=['fasta_sequence'], errors='ignore')
df_features_mv = df_features_mv.drop(columns=['fasta_sequence'], errors='ignore')

# Write both training tables without the DataFrame index
df_features_exo.to_csv(Data_path + '/MachineLearning/training_data_exo.csv', index=False)
df_features_mv.to_csv(Data_path + '/MachineLearning/training_data_mv.csv', index=False)

# Microvesicles 
### Generate FASTA files for STREME

In [49]:
# NOTE(review): disabled cell (every line is commented out). It would build a
# two-column id/sequence table for all proteins in df_features_mv and write it as
# a FASTA file. It reads df_features_mv['fasta_sequence'], a column that the
# "Save final feature data sets" cell drops — confirm the execution order before
# re-enabling.
# # all
# all_data_mv = df_features_mv[['id', 'fasta_sequence']]
# all_data_mv['id'] = '>'+all_data_mv['id']

# # change U to C, otherwise STREME doesn't work
# sequences = list(all_data_mv['fasta_sequence'])
# sequences = [sequence.replace('U', 'C') for sequence in sequences]
# all_data_mv['fasta'] = sequences
# all_data_mv = all_data_mv[['id', 'fasta']]

# #all_data_mv.to_csv(Data_path + '/STREME/MVs/all_data_mv.FASTA', sep='\n', header=False, index=False)

In [50]:
# NOTE(review): disabled cell (every line is commented out). Same steps as the
# "all" FASTA cell, restricted to rows with Microvesicle == 1; it also needs the
# 'fasta_sequence' column to still be present in df_features_mv.
# # positive

# positive_mv = df_features_mv[df_features_mv['Microvesicle'] == 1]
# positive_mv = positive_mv[['id', 'fasta_sequence']]
# positive_mv['id'] = '>'+positive_mv['id']

# # change U to C
# sequences = list(positive_mv['fasta_sequence'])
# sequences = [sequence.replace('U', 'C') for sequence in sequences]
# positive_mv['fasta'] = sequences
# positive_mv = positive_mv[['id', 'fasta']]

# #positive_mv.to_csv(Data_path + '/STREME/MVs/positive_mv.FASTA', sep='\n', header=False, index=False)

In [51]:
# NOTE(review): disabled cell (every line is commented out). Same steps as the
# "all" FASTA cell, restricted to rows with Microvesicle == 0; it also needs the
# 'fasta_sequence' column to still be present in df_features_mv.
# # negative
# negative_mv = df_features_mv[df_features_mv['Microvesicle'] == 0]
# negative_mv = negative_mv[['id', 'fasta_sequence']]
# negative_mv['id'] = '>'+negative_mv['id']

# # change U to C
# sequences = list(negative_mv['fasta_sequence'])
# sequences = [sequence.replace('U', 'C') for sequence in sequences]
# negative_mv['fasta'] = sequences
# negative_mv = negative_mv[['id', 'fasta']]

# #negative_mv.to_csv(Data_path + '/STREME/MVs/negative_mv.FASTA', sep='\n', header=False, index=False)