In [1]:
import sys
import os
import glob
import numpy as np
import pandas as pd
import gzip
import matplotlib.pyplot as plt
import plotly
from scipy import stats
from scipy.optimize import curve_fit
import itertools
from collections import Counter
from Bio.Seq import Seq
import one_rep
import function_bio_rep
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn import preprocessing

In [2]:
def mutation_matrix(dfs):
    '''
    Takes the mutations from the amino_acids attribute
    (given in a dataframe with respective errors) and turns
    it into a panda dataframe (which can be turned into a matrix).
    '''
    aa_amalg = pd.DataFrame()
    for ind, df in enumerate(dfs):
        aa = [x[1] for x in df.index]
        mean = list(df['mean'])
        aa_df = pd.DataFrame({'Amino Acid': aa, 'Mean'+str(ind): mean})
        aa_df = aa_df.set_index('Amino Acid')
        aa_amalg = pd.concat([aa_amalg, aa_df], axis = 1, join = 'outer')
    return(aa_amalg)

def mutation_matrix_rep(dfs):
    '''
    Takes the mutations from the amino_acids attribute
    (given in a dataframe with respective errors) and turns
    it into a panda dataframe (which can be turned into a matrix).
    '''
    aa_amalg = pd.DataFrame()
    for ind, df in enumerate(dfs):
        aa = [x[1] for x in df.index]
        mean = list(df['ratio'])
        aa_df = pd.DataFrame({'Amino Acid': aa, 'Mean'+str(ind): mean})
        aa_df = aa_df.set_index('Amino Acid')
        aa_amalg = pd.concat([aa_amalg, aa_df], axis = 1, join = 'outer')
    return(aa_amalg)

In [4]:
# import csv detailing the sets and conditions 
samples = pd.read_csv('sample_spreadsheet_RSA.csv')
threshold = 1
wt_3CL = Seq(function_bio_rep.mutations.seq_3CL).translate()
amino_acid_list = ['*', 'A', 'C', 'D', 'E', 'F', 'G', 'H',
                   'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R',
                   'S', 'T', 'V', 'W', 'Y']
amino_acid_list.reverse()
sets = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18, \
       19,20,21,'R1', '8R', '13R1', '14R', '13R2', '16R',\
       '9R', '10R']# 

### Generate and save CSV for each condition pair

In [4]:
# For every set compute the foldchange between glu and gal conditions
thresh1 = 30
thresh2 = 1
for s in sets:
    x = str(s)
    print(str(x))
    start = list(samples[samples['Set'] == x]['Start range'])[0]
    end = list(samples[samples['Set'] == x]['End range'])[0]
    first_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Glu'] + '_R1.fastq.gz')
    second_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Gal'] + '_R1.fastq.gz')
    sequence = list(samples[samples['Set'] == x]['Sequence'])[0]
    position = list(samples[samples['Set'] == x]['Position'])[0]
    sites = function_bio_rep.mutations(list(range(start, end)), list(range(start, end)))
    print(sites.sites, sites.all_muts, position, threshold, sequence)
    glu_gal_comp = sites.amino_acids(first_files, second_files, \
                                     sequence, position, thresh1, thresh2)
    y = mutation_matrix(glu_gal_comp)
    if 'X' in y.index:
        y = y.drop(index = 'X')
    if not os.path.exists('glu_gal_matrices'):
        os.makedirs('glu_gal_matrices')
    y.to_csv('glu_gal_matrices/set'+ str(x) + 'gal_glu' + '.csv')
    if not os.path.exists('amino_acid_glu_gal'):
        os.makedirs('amino_acid_glu_gal')
    for y, res in list(zip(glu_gal_comp, sites.sites)):
        y.to_csv('amino_acid_glu_gal/set'+str(x)+ '_residue'+str(res)+'.csv')

1
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] -2 1 TACAAAATG
2
[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] 9 1 CCATCT
3
[28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42] [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42] 24 1 ACAACT
4
[43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57] [42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57] 38 1 TGTCCAAGA
5
[58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] [57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] 54 1 TATGAAG
6
[71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85] [70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85] 68 1 GTACAG
7
[86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] [85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] 83 1 CAAAAT
8
[100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115

In [6]:
# For every set compute the foldchange between gc and glu conditions
thresh1 = 30
thresh2 = 1
for s in sets:
    x = str(s)
    print(str(x))
    start = list(samples[samples['Set'] == x]['Start range'])[0]
    end = list(samples[samples['Set'] == x]['End range'])[0]
    first_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Glu'] + '_R1.fastq.gz')
    last_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Gc'] + '_R1.fastq.gz')
    sequence = list(samples[samples['Set'] == x]['Sequence'])[0]
    position = list(samples[samples['Set'] == x]['Position'])[0]
    sites = function_bio_rep.mutations(list(range(start, end)), list(range(start, end)))
    print(sites.sites, sites.all_muts, position, threshold, sequence)
    glu_gal_comp = sites.amino_acids(first_files, last_files, \
                                     sequence, position, thresh1, thresh2)
    y = mutation_matrix(glu_gal_comp)
    if 'X' in y.index:
        y = y.drop(index = 'X')
    if not os.path.exists('glu_gc_matrices'):
        os.makedirs('glu_gc_matrices')
    y.to_csv('glu_gc_matrices/set'+ str(x) + 'glu_gc' + '.csv')
    for y, res in list(zip(glu_gal_comp, sites.sites)):
        y.to_csv('glu_gc_matrices/set'+str(x)+ '_residue'+str(res)+'.csv')
    if not os.path.exists('amino_acid_glu_gc'):
        os.makedirs('amino_acid_glu_gc')
    for y, res in list(zip(glu_gal_comp, sites.sites)):
        y.to_csv('amino_acid_glu_gc/set'+str(x)+ '_residue'+str(res)+'.csv')

1
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] -2 1 TACAAAATG
2
[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] 9 1 CCATCT
3
[28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42] [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42] 24 1 ACAACT
4
[43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57] [42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57] 38 1 TGTCCAAGA
5
[58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] [57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] 54 1 TATGAAG
6
[71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85] [70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85] 68 1 GTACAG
7
[86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] [85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] 83 1 CAAAAT
8
[100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115

In [7]:
# For every set compute the foldchange between gc and glu conditions
thresh1 = 30
thresh2 = 1
for s in sets:
    x = str(s)
    print(str(x))
    start = list(samples[samples['Set'] == x]['Start range'])[0]
    end = list(samples[samples['Set'] == x]['End range'])[0]
    first_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Glu'] + '_R1.fastq.gz')
    last_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Grl'] + '_R1.fastq.gz')
    sequence = list(samples[samples['Set'] == x]['Sequence'])[0]
    position = list(samples[samples['Set'] == x]['Position'])[0]
    sites = function_bio_rep.mutations(list(range(start, end)), list(range(start, end)))
    print(sites.sites, sites.all_muts, position, threshold, sequence)
    glu_gal_comp = sites.amino_acids(first_files, last_files, \
                                     sequence, position, thresh1, thresh2)
    y = mutation_matrix(glu_gal_comp)
    if 'X' in y.index:
        y = y.drop(index = 'X')
    if not os.path.exists('glu_grl_matrices'):
        os.makedirs('glu_grl_matrices')
    y.to_csv('glu_grl_matrices/set'+ str(x) + 'glu_grl' + '.csv')
    for y, res in list(zip(glu_gal_comp, sites.sites)):
        y.to_csv('glu_grl_matrices/set'+str(x)+ '_residue'+str(res)+'.csv')
    if not os.path.exists('amino_acid_glu_grl'):
        os.makedirs('amino_acid_glu_grl')
    for y, res in list(zip(glu_gal_comp, sites.sites)):
        y.to_csv('amino_acid_glu_grl/set'+str(x)+ '_residue'+str(res)+'.csv')

1
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] -2 1 TACAAAATG
2
[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] 9 1 CCATCT
3
[28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42] [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42] 24 1 ACAACT
4
[43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57] [42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57] 38 1 TGTCCAAGA
5
[58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] [57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] 54 1 TATGAAG
6
[71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85] [70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85] 68 1 GTACAG
7
[86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] [85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] 83 1 CAAAAT
8
[100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115

### Generate and save data for biological replicates

In [9]:
# For every set compute the single set foldchange between gal/glu
#try left combine so that even ones that don't appear in gal are counted

thresh1 = 30
thresh2 = 1
for s in sets:
    x = str(s)
    print(str(x))
    start = list(samples[samples['Set'] == x]['Start range'])[0]
    end = list(samples[samples['Set'] == x]['End range'])[0]
    first_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Glu'] + '_R1.fastq.gz')
    last_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Gal'] + '_R1.fastq.gz')
    sequence = list(samples[samples['Set'] == x]['Sequence'])[0]
    position = list(samples[samples['Set'] == x]['Position'])[0]
    sites = one_rep.mutations(list(range(start, end)), list(range(start, end)))
    print(sites.sites, sites.all_muts, position, thresh1, thresh2, sequence)
    for z in [0, 1]:
        comp = sites.amino_acids([first_files[z]], [last_files[z]], \
                                         sequence, position, thresh1, thresh2)
        y = mutation_matrix_rep(comp)
        if 'X' in y.index:
            y = y.drop(index = 'X')
        if not os.path.exists('single_replicates'):
            os.makedirs('single_replicates')
        y.to_csv('single_replicates/set'+ str(x) + '_replicate' + str(z) + '_gal_glu' + '.csv')

1
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] -2 30 1 TACAAAATG
2
[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] 9 30 1 CCATCT
3
[28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42] [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42] 24 30 1 ACAACT
4
[43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57] [42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57] 38 30 1 TGTCCAAGA
5
[58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] [57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] 54 30 1 TATGAAG
6
[71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85] [70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85] 68 30 1 GTACAG
7
[86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] [85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] 83 30 1 CAAAAT
8
[100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 11

In [10]:
# For every set compute the single set foldchange between gc/glu
thresh1 = 30
thresh2 = 1
for s in sets:
    x = str(s)
    print(str(x))
    start = list(samples[samples['Set'] == x]['Start range'])[0]
    end = list(samples[samples['Set'] == x]['End range'])[0]
    first_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Glu'] + '_R1.fastq.gz')
    last_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Gc'] + '_R1.fastq.gz')
    sequence = list(samples[samples['Set'] == x]['Sequence'])[0]
    position = list(samples[samples['Set'] == x]['Position'])[0]
    sites = one_rep.mutations(list(range(start, end)), list(range(start, end)))
    print(sites.sites, sites.all_muts, position, thresh1, thresh2, sequence)
    for z in [0, 1]:
        comp = sites.amino_acids([first_files[z]], [last_files[z]], \
                                         sequence, position, thresh1, thresh2)
        y = mutation_matrix_rep(comp)
        if 'X' in y.index:
            y = y.drop(index = 'X')
        if not os.path.exists('single_replicates_gc'):
            os.makedirs('single_replicates_gc')
        y.to_csv('single_replicates_gc/set'+ str(x) + '_replicate' + str(z) + '_gc' + '.csv')

1
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] -2 30 1 TACAAAATG
2
[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] 9 30 1 CCATCT
3
[28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42] [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42] 24 30 1 ACAACT
4
[43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57] [42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57] 38 30 1 TGTCCAAGA
5
[58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] [57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] 54 30 1 TATGAAG
6
[71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85] [70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85] 68 30 1 GTACAG
7
[86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] [85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] 83 30 1 CAAAAT
8
[100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 11

In [11]:
# For every set compute the single set foldchange between grl/gal
thresh1 = 30
thresh2 = 1
for s in sets:
    x = str(s)
    print(str(x))
    start = list(samples[samples['Set'] == x]['Start range'])[0]
    end = list(samples[samples['Set'] == x]['End range'])[0]
    first_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Glu'] + '_R1.fastq.gz')
    last_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Grl'] + '_R1.fastq.gz')
    sequence = list(samples[samples['Set'] == x]['Sequence'])[0]
    position = list(samples[samples['Set'] == x]['Position'])[0]
    sites = one_rep.mutations(list(range(start, end)), list(range(start, end)))
    print(sites.sites, sites.all_muts, position, thresh1, thresh2, sequence)
    for z in [0, 1]:
        comp = sites.amino_acids([first_files[z]], [last_files[z]], \
                                         sequence, position, thresh1, thresh2)
        y = mutation_matrix_rep(comp)
        if 'X' in y.index:
            y = y.drop(index = 'X')
        if not os.path.exists('single_replicates_grl'):
            os.makedirs('single_replicates_grl')
        y.to_csv('single_replicates_grl/set'+ str(x) + '_replicate' + str(z) + '_grl' + '.csv')

1
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] -2 30 1 TACAAAATG
2
[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] 9 30 1 CCATCT
3
[28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42] [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42] 24 30 1 ACAACT
4
[43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57] [42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57] 38 30 1 TGTCCAAGA
5
[58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] [57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] 54 30 1 TATGAAG
6
[71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85] [70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85] 68 30 1 GTACAG
7
[86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] [85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] 83 30 1 CAAAAT
8
[100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 11

### Glucose condition mutation composition

In [5]:
# For every set compute the foldchange between glu and gal conditions
threshold = 30
for s in sets:
    x = str(s)
    print(str(x))
    start = list(samples[samples['Set'] == x]['Start range'])[0]
    end = list(samples[samples['Set'] == x]['End range'])[0]
    glu_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Glu'] + '_R1.fastq.gz')
    sequence = list(samples[samples['Set'] == x]['Sequence'])[0]
    position = list(samples[samples['Set'] == x]['Position'])[0]
    sites = function_bio_rep.mutations(list(range(start, end)), list(range(start, end)))
    print(sites.sites, sites.all_muts, position, threshold, sequence)
    for rep in [0, 1]:
        count_mat = sites.count_matrix(glu_files[rep], \
                                         sequence, position, threshold)
        for ind, y in enumerate(count_mat):
            if not os.path.exists('glu_count_matrices'):
                os.makedirs('glu_count_matrices')
            y.to_csv('glu_count_matrices/set'+ str(x)+'_rep_'+str(rep)+'residue'+str(start+ind)+'.csv')
#         y = y.drop(index = 'X')
#     y.to_csv('gal_grl_matrices/set'+ str(x) + 'gal_grl' + '.csv')

1
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] -2 20 TACAAAATG
2
[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27] 9 20 CCATCT
3
[28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42] [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42] 24 20 ACAACT
4
[43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57] [42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57] 38 20 TGTCCAAGA
5
[58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] [57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] 54 20 TATGAAG
6
[71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85] [70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85] 68 20 GTACAG
7
[86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] [85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] 83 20 CAAAAT
8
[100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 1

### Count number of reads per set

In [12]:
# For every set compute the foldchange between glu and gal conditions
sets = ['9R', '10R']
reads = []
for s in sets:
    x = str(s)
    print(str(x))
    glu_files = list(samples[samples['Set'] == x]['Folder'] + \
        samples[samples['Set'] == x]['Glu'] + '_R1.fastq.gz')
    for rep in [0, 1]:
        count_mat = glu_files[rep]
        with gzip.open(count_mat, 'rb') as f:
            for i, l in enumerate(f):
                pass
        reads.append([x, rep,(i+1)/4])

9R
10R


In [13]:
pd.DataFrame(reads, columns = ['set', 'replicate', 'reads']).to_csv('total_reads.csv')