In [1]:
import sys
import os
import glob
import numpy as np
import pandas as pd
import gzip
import matplotlib.pyplot as plt
import plotly
from scipy import stats
from scipy.optimize import curve_fit
import itertools
from collections import Counter
from Bio.Seq import Seq
import one_rep
import one_rep_no_syn
import function_bio_rep
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn import preprocessing

In [2]:
def mutation_matrix(dfs):
    '''
    Takes the mutations from the amino_acids attribute
    (given in a dataframe with respective errors) and turns
    it into a panda dataframe (which can be turned into a matrix).
    '''
    aa_amalg = pd.DataFrame()
    for ind, df in enumerate(dfs):
        aa = [x[1] for x in df.index]
        mean = list(df['mean'])
        aa_df = pd.DataFrame({'Amino Acid': aa, 'Mean'+str(ind): mean})
        aa_df = aa_df.set_index('Amino Acid')
        aa_amalg = pd.concat([aa_amalg, aa_df], axis = 1, join = 'outer')
    return(aa_amalg)

def mutation_matrix_rep(dfs):
    '''
    Takes the mutations from the amino_acids attribute
    (given in a dataframe with respective errors) and turns
    it into a panda dataframe (which can be turned into a matrix).
    '''
    aa_amalg = pd.DataFrame()
    for ind, df in enumerate(dfs):
        aa = [x for x in df.index]
        mean = list(df['ratio'])
        aa_df = pd.DataFrame({'Amino Acid': aa, 'Mean'+str(ind): mean})
        aa_df = aa_df.set_index('Amino Acid')
        aa_amalg = pd.concat([aa_amalg, aa_df], axis = 1, join = 'outer')
    return(aa_amalg)

In [3]:
# import csv detailing the sets and conditions 
samples = pd.read_csv('sample_spreadsheet_final.csv')
threshold = 1
wt_3CL = Seq(function_bio_rep.mutations.seq_3CL).translate()
amino_acid_list = ['*', 'A', 'C', 'D', 'E', 'F', 'G', 'H',
                   'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R',
                   'S', 'T', 'V', 'W', 'Y']
amino_acid_list.reverse()
sets = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18, \
       19,20,21,'R1', '8R', '13R1', '14R', '13R2', '16R',\
       '9R', '10R']# 

### Generate and save data for biological replicates

In [8]:
# For every set compute the single set foldchange between gal/glu
thresh1 = 30
thresh2 = 1
for s in sets:
    x = str(s)
    print(str(x))
    start = list(samples[samples['Set'] == x]['Start range'])[0]
    end = list(samples[samples['Set'] == x]['End range'])[0]
    first_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Glu'] + '_R1.fastq.gz')
    last_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Gal'] + '_R1.fastq.gz')
    sequence = list(samples[samples['Set'] == x]['Sequence'])[0]
    position = list(samples[samples['Set'] == x]['Position'])[0]
    sites = one_rep_no_syn.mutations(list(range(start, end)), list(range(start, end)))
    print(sites.sites, sites.all_muts, position, threshold, sequence)
    for z in [0, 1]:
        count_mat = sites.count_matrix(first_files[z], \
                                         sequence, position, thresh1)
#         y = mutation_matrix_rep(comp)
#         if 'X' in y.index:
#             y = y.drop(index = 'X')
#         if not os.path.exists('single_replicates_nosyn'):
#             os.makedirs('single_replicates_nosyn')
#         y.to_csv('single_replicates_nosyn/set'+ str(x) + '_replicate' + str(z) + '_gal_glu' + '.csv')
        if not os.path.exists('count_matrix_glu_gal_nosyn'):
            os.makedirs('count_matrix_glu_gal_nosyn')
        for ind, y in enumerate(count_mat):
            y.to_csv('count_matrix_glu_gal_nosyn/set'+ str(x)+'_rep_'+str(z)+'residue'+str(start+ind)+'.csv')


1
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] -2 1 TACAAAATG
2
[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] 9 1 CCATCT
3
[28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42] [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42] 24 1 ACAACT
4
[43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57] [42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57] 38 1 TGTCCAAGA
5
[58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] [57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] 54 1 TATGAAG
6
[71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85] [70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85] 68 1 GTACAG
7
[86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] [85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] 83 1 CAAAAT
8
[100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115

In [6]:
# For every set compute the single set foldchange between gc/gal

for s in sets:
    x = str(s)
    print(str(x))
    start = list(samples[samples['Set'] == x]['Start range'])[0]
    end = list(samples[samples['Set'] == x]['End range'])[0]
    first_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Gc'] + '_R1.fastq.gz')
    last_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Gal'] + '_R1.fastq.gz')
    sequence = list(samples[samples['Set'] == x]['Sequence'])[0]
    position = list(samples[samples['Set'] == x]['Position'])[0]
    sites = one_rep.mutations(list(range(start, end)), list(range(start, end)))
    print(sites.sites, sites.all_muts, position, threshold, sequence)
    for z in [0, 1]:
        comp = sites.amino_acids([first_files[z]], [last_files[z]], \
                                         sequence, position, threshold)
        y = mutation_matrix_rep(comp)
        if 'X' in y.index:
            y = y.drop(index = 'X')
        if not os.path.exists('single_replicates_gc_012020'):
            os.makedirs('single_replicates_gc_012020')
        y.to_csv('single_replicates_gc_012020/set'+ str(x) + '_replicate' + str(z) + '_gc' + '.csv')

1
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] -2 1 TACAAAATG


TypeError: amino_acids() missing 1 required positional argument: 'thresh2'

In [5]:
# For every set compute the single set foldchange between grl/gal

for s in sets:
    x = str(s)
    print(str(x))
    start = list(samples[samples['Set'] == x]['Start range'])[0]
    end = list(samples[samples['Set'] == x]['End range'])[0]
    first_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Grl'] + '_R1.fastq.gz')
    last_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Gal'] + '_R1.fastq.gz')
    sequence = list(samples[samples['Set'] == x]['Sequence'])[0]
    position = list(samples[samples['Set'] == x]['Position'])[0]
    sites = one_rep_no_syn.mutations(list(range(start, end)), list(range(start, end)))
    print(sites.sites, sites.all_muts, position, threshold, sequence)
    for z in [0, 1]:
        comp = sites.amino_acids([first_files[z]], [last_files[z]], \
                                         sequence, position, thresh1, thresh2)
        y = mutation_matrix_rep(comp)
        if 'X' in y.index:
            y = y.drop(index = 'X')
        if not os.path.exists('single_replicates_grl_nosyn'):
            os.makedirs('single_replicates_grl_nosyn')
        y.to_csv('single_replicates_grl_nosyn/set'+ str(x) + '_replicate' + str(z) + '_grl' + '.csv')

1
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] -2 1 TACAAAATG
2
[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] 9 1 CCATCT


KeyboardInterrupt: 