In [1]:
#Integrating data from the original run (run 1) and the CUX1 rescue from run 2
#File anon_allele_counts_resc_2 contains read counts from all plates,
#using 40bp to identify the amplicon and 100bp to call mutations (except for CUX1 and JP001 TET2b)

In [2]:
import numpy as np
import pandas as pd
import os

In [3]:
#Import all the data

sourcefile = '../Data/Amp_data/anon_allele_counts_resc_2.tsv'

#The file is tab separated, has a three-row column header and row labels in the first column
df = pd.read_csv(sourcefile, header = [0,1,2], index_col = 0, sep='\t')

#Move the first two header levels into the row index, leaving only the third header level as columns
df = df.stack([0,1])

#Copy the index levels into ordinary columns so they can be used for filtering
#(index level 0 is used as the well, level 1 as the plate, level 2 as the amplicon)
df['Plate'] = df.index.get_level_values(1)
df['Well'] = df.index.get_level_values(0)
df['Amplicon'] = df.index.get_level_values(2)

#Patient ID is the text before the first underscore of the amplicon name.
#Taking the first token directly works for any number of underscore-separated parts,
#whereas assigning an expanded split to exactly three columns fails unless the longest name has three parts.
df['Patient'] = df['Amplicon'].str.split('_').str[0]

#Build a lookup that maps the lettered plate names back to their original names
#Plates in new run were given extra letter to distinguish them (a, plus b, c, d for AS-206 only with rescue of additional amplicons)
subs = ['a', 'b', 'c', 'd']

#Flag every row whose plate name contains at least one of the letters
is_lettered = df['Plate'].str.contains(subs[0])
for letter in subs[1:]:
    is_lettered = is_lettered | df['Plate'].str.contains(letter)

#Unique lettered plate names, in order of first appearance
plate_key = df.loc[is_lettered, 'Plate'].drop_duplicates().to_list()

#Original name = text before the first occurrence of the letter.
#If a name contains more than one of the letters, the one listed last in subs is used.
plate_values = []
for plate in plate_key:
    found = [letter for letter in subs if letter in plate]
    plate_values.append(plate.split(found[-1])[0])

plate_rename = dict(zip(plate_key, plate_values))

In [4]:
#Select appropriate data source (plates given an extra letter in run2 so they can be distinguished)
#Expected number of rows (n) = (2 x PD7151 + 6 x PD7153 + 6 x JP001 Amplicons) x 9 plates x 384 wells
n = (2 + 6 + 6) * 9 * 384

#PD 7151 - TET2a and TET2b reads come from run 1 (plates without the 'a' letter)
#.copy() makes df1 an independent frame, so editing it does not raise SettingWithCopyWarning
df1 = df.loc[
    (df['Patient'].isin(['PD7151']))
    & (~df['Plate'].str.contains('a'))
    & (df['Amplicon'].isin(['PD7151_TET2a', 'PD7151_TET2b']))
].copy()
#rename plates - assign the result back instead of replacing in place on a column selection,
#which is not guaranteed to update df1
df1['Plate'] = df1['Plate'].replace(plate_rename)
print('PD7151' , df1.shape[0], df1['Amplicon'].drop_duplicates().to_list())


#JP001 - all reads come from run 1 (plates without the 'a' letter)

#.copy() makes df2 an independent frame, so editing it does not raise SettingWithCopyWarning
df2 = df.loc[(df['Patient'].isin(['JP001'])) & (~df['Plate'].str.contains('a'))].copy()
#rename plates - assign the result back instead of replacing in place on a column selection,
#which is not guaranteed to update df2
df2['Plate'] = df2['Plate'].replace(plate_rename)
print('JP001', df2.shape[0], df2['Amplicon'].drop_duplicates().to_list())

#PD7153 - CUX1 reads from run 1 replaced with CUX1 from rescue

#CUX1 from run2 (plates carrying the 'a' letter)
a = df.loc[(df['Patient'].isin(['PD7153'])) & (df['Plate'].str.contains('a')) & (df['Amplicon'].isin(['PD7153_CUX1']))]
#run1 except any rescue data: every non-CUX1 amplicon on plates carrying none of the letters a, b, c, d
b = df.loc[
    (df['Patient'].isin(['PD7153']))
    & (~df['Amplicon'].str.contains('PD7153_CUX1'))
    & (~df['Plate'].str.contains('[abcd]', regex = True))
]

#concat returns a new frame; assign the renamed column back instead of replacing in place
#on a column selection, which is not guaranteed to update df3
df3 = pd.concat([a, b])
df3['Plate'] = df3['Plate'].replace(plate_rename) #rename plates
print('PD7153', df3.shape[0], df3['Amplicon'].drop_duplicates().to_list())

#Combine the three per-patient selections and report the row count against the expectation
df_all = pd.concat([df1, df2, df3])
print('Expected number of rows = ', n, 'Actual number of rows = ', df_all.shape[0])

#Reshape into the long format used by existing code:
# - rename lettered plates in the index back to their original names
# - drop the helper columns, leaving only the count columns
# - stack the remaining columns into a fourth index level
# - swap the first two index levels so the plate comes before the well
df_all = (
    df_all
    .rename(index = plate_rename)
    .drop(columns = ['Plate', 'Well', 'Amplicon', 'Patient'])
    .stack()
    .reorder_levels([1, 0, 2, 3])
    .to_frame(name = 'Reads')
)

#Save as tab-separated text
df_all.to_csv('../Data/Amp_data/clean_anon_allele_counts_resc_2.tsv', sep = '\t')

#Positions of any repeated index entries (an empty array means none)
print('Any duplicate rows?', np.where(df_all.index.duplicated()))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


PD7151 6912 ['PD7151_TET2a', 'PD7151_TET2b']
JP001 20736 ['JP001_RUNX1_c', 'JP001_RUNX1_g', 'JP001_SRSF2', 'JP001_TET2a', 'JP001_TET2b_c', 'JP001_TET2b_g']
PD7153 20736 ['PD7153_CUX1', 'PD7153_SRSF2', 'PD7153_TET2a', 'PD7153_TET2b', 'PD7153_TGFB3_c', 'PD7153_TGFB3_g']
Expected number of rows =  48384 Actual number of rows =  48384
Any duplicate rows? (array([], dtype=int64),)
