## Import modules

In [1]:
from common_dirs_fns import *
from propy_functions import *
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib as mpl

  from pandas import Panel


In [2]:
# Specify matplotlib formatting
%matplotlib inline
mpl.rcParams['svg.fonttype']='none'
mpl.rcParams['font.sans-serif'] = 'Arial'
mpl.rcParams['axes.linewidth'] = 1.
mpl.rcParams['mathtext.default'] = 'regular'
plt.rcParams.update({'font.size': 10})

## Import sort-seq information and save peptides that passed quality control to a FASTA file

In [3]:
# Load the cleaned sort-seq statistics table, using its first column as the index.
# na_filter=False disables missing-value detection, so strings such as 'NA'
# and empty fields are not converted to NaN.
stats_table = pd.read_csv(
    analysis_path+'stats_table_clean.csv',
    header=0,
    index_col=0,
    na_filter=False,
)

# Write every entry except the one indexed by '*' to a FASTA file.
# The 'Translation' column is passed for both of to_fasta's column arguments.
to_fasta(
    stats_table.loc[stats_table.index != '*'].reset_index(),
    'Translation',
    'Translation',
    analysis_path+'stats_table.fasta',
)

## Create output files containing propy-generated features for each peptide
Peptides that are 10 amino acids in length or shorter are excluded, because the PAAC/APAAC (pseudo amino acid composition) descriptors computed with lambda=10 (the propy default) require a sequence longer than lambda. This criterion excludes 3 human AMP library peptides from downstream analysis.

In [4]:
# This code block takes about 2 hours to run for ~ 3800 peptides
# Path of the FASTA file written from stats_table (same path passed to to_fasta earlier)
input_fasta = analysis_path+'stats_table.fasta'

# Generate text files for each sequence in the input fasta file,
# which contains all peptide sequences in stats_table (the '*' entry was
# filtered out when the FASTA file was written). Output goes to propy_path.
# NOTE(review): overwrite=False presumably skips peptides whose output file
# already exists, making re-runs cheap -- confirm in propy_functions.
propy_output_from_fasta(input_fasta, propy_path, overwrite=False)

HBox(children=(IntProgress(value=0, max=3797), HTML(value='')))




In [5]:
# Names of all peptides with calculated descriptors: each file name found in
# propy_path, truncated at the first occurrence of '.txt'
all_propy_peptides = pd.Index([fname.split('.txt')[0] for fname in os.listdir(propy_path)])

# Keep only the names that are also present in stats_table's index
peptide_names = all_propy_peptides.intersection(stats_table.index)

In [6]:
# This takes a few minutes to run
# Save propy results to a new dataframe, built from the per-peptide output
# in propy_path for the peptides listed in peptide_names.
# NOTE(review): the result is presumably indexed by peptide name, since the
# next cell reindexes stats_table on peptide_features.index -- confirm in
# propy_functions.
peptide_features = generate_features_dataframe(peptide_names, propy_path)

HBox(children=(IntProgress(value=0, max=3495), HTML(value='')))




In [7]:
# Attach each peptide's fold change from stats_table, aligned on the
# peptide_features index
peptide_features['FoldChange'] = stats_table['Fold Change_Simple'].reindex(peptide_features.index)

# Discard every column that contains at least one NaN: such a feature could
# not be calculated for all peptides (this eliminates correlation features
# with large lag parameters).
# NOTE: the filter also covers the FoldChange column added above.
peptide_features = peptide_features.dropna(axis='columns')

In [8]:
# Save results to tab-separated value file (the index is written as the first column)
# Several peptide features have commas which makes them incompatible with csv file type,
# so a tab is used as the field separator instead.
peptide_features.to_csv(analysis_path + 'peptide_features.tsv', sep='\t')