In [3]:
import numpy as np
import pandas as pd
import os

In [4]:
# Assign raw data and plate diagram files from environment
RAW_SAMPLE_1 = "data/T1000_Int_Delayed_Ceph_0_50_cGy_mtDNA_Plate_2_02_24_23_JS_745 -  Quantification Cq Results.xlsx"
RAW_SAMPLE_1_DIAGRAM = "data/T1000_Int_Del_Ceph_qPCR_DIAGRAM_FORMATTED.xlsx"

# Read in the raw data and plate diagram as pandas dataframes
df_data = pd.read_excel(RAW_SAMPLE_1, sheet_name=0)
df_diagram = pd.read_excel(RAW_SAMPLE_1_DIAGRAM)

# Show first 5 rows
df_diagram.head()

Unnamed: 0.1,Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12
0,A,,D49,D49 dup**,D73,D73 dup,D89,D89 dup,D168,D168 dup,D185,D185 dup,
1,B,,D53,D53 dup,D76,D76 dup,D92,D92 dup,D169*,D169 dup,D192,D192 dup,
2,C,,D56,D56 dup,D77,D77 dup,D149,D149 dup,D172,D172 dup,D193,D193 dup,
3,D,,D57,D57 dup,D80,D80 dup,D152,D152 dup,D173,D173 dup,D200,D200 dup,
4,E,,D60,D60 dup,D81,D81 dup,D153,D153 dup,D176,D176 dup,D17,D17 dup,


In [5]:
# Set the first column as the index, remove whitespace and add a space to the "dup" values
df_diagram = df_diagram.set_index(df_diagram.columns[0])
df_diagram = df_diagram.replace('\s+', '', regex=True)
df_diagram = df_diagram.replace('dup', ' dup', regex=True)

# Use the rows and columns (besides the first one) of the plate diagram to create a dictionary of corresponding Sample and Well IDs
sample_map = {}

for row in df_diagram.index:
    for col in df_diagram.columns[1:]:
        well_id = f"{row}{int(col):02d}"
        sample_name = df_diagram.loc[row, col]
        sample_map[well_id] = sample_name

# Read in the raw qPCR data and map the well IDs to sample names using the dictionary
df_data["Sample"] = df_data["Well"].map(sample_map)

# Show first 5 rows
df_data.head()

Unnamed: 0.1,Unnamed: 0,Well,Fluor,Target,Content,Sample,Biological Set Name,Cq,Cq Mean,Cq Std. Dev,Starting Quantity (SQ),Log Starting Quantity,SQ Mean,SQ Std. Dev,Set Point,Well Note
0,,A01,SYBR,,Unkn,,,,0.0,0,,,0.0,0,72,
1,,A02,SYBR,,Unkn,D49,,19.214765,19.214765,0,,,,0,72,
2,,A03,SYBR,,Unkn,D49 dup**,,19.586924,19.586924,0,,,,0,72,
3,,A04,SYBR,,Unkn,D73,,19.634851,19.634851,0,,,,0,72,
4,,A05,SYBR,,Unkn,D73 dup,,19.645373,19.645373,0,,,,0,72,


In [6]:
# Select relevant columns, make mtDNA1 & mtDNA2 columns, and drop NaN rows
df = df_data[['Well', 'Cq', 'Sample']]

df['mtDNA1'] = "mtDNA1"
df['mtDNA2'] = "mtDNA2"

df = df.loc[:,["Well", "Sample", "mtDNA1", "mtDNA2", "Cq"]]

df = df.dropna()

# Show first 5 rows
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mtDNA1'] = "mtDNA1"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mtDNA2'] = "mtDNA2"


Unnamed: 0,Well,Sample,mtDNA1,mtDNA2,Cq
1,A02,D49,mtDNA1,mtDNA2,19.214765
2,A03,D49 dup**,mtDNA1,mtDNA2,19.586924
3,A04,D73,mtDNA1,mtDNA2,19.634851
4,A05,D73 dup,mtDNA1,mtDNA2,19.645373
5,A06,D89,mtDNA1,mtDNA2,19.720434


In [7]:
# set mtDNA1 and mtDNA2 values to Cq values by treating mtDNA1 as the Cq for the first sample and mtDNA2 as the Cq for the duplicate sample if it exists as "Sample dup"

# Note, exactly "Sample dup" is used to avoid matching "Sample dup **" or any additions to the name

for row, index in df.iterrows():
    df.loc[row, 'mtDNA1'] = df.loc[row, 'Cq']
    if df.loc[row, 'Sample'] + ' dup' in df['Sample'].values:
        df.loc[row, 'mtDNA2'] = df.loc[df['Sample'] == df.loc[row, 'Sample'] + ' dup', 'Cq'].values[0]
    else:
        df.loc[row, 'mtDNA2'] = np.NAN

df = df.drop(columns=['Cq'])
df = df.dropna()

# calculate standard deviation of each row
df['St.Dev'] = df[['mtDNA1', 'mtDNA2']].std(axis=1)

# Show first 5 rows
df.head()

Unnamed: 0,Well,Sample,mtDNA1,mtDNA2,St.Dev
1,A02,D49,19.214765,19.41713,0.143094
3,A04,D73,19.634851,19.645373,0.00744
5,A06,D89,19.720434,19.648259,0.051035
7,A08,D168,19.902073,19.741493,0.113547
9,A10,D185,19.362185,19.415767,0.037888


In [8]:
# Throw warnings for standard deviations greater than .22

for row, index in df.iterrows():
    if df.loc[row, 'St.Dev'] > .22:
        print(f"\n Warning: Standard deviation for {df.loc[row, 'Sample']} is {round(df.loc[row, 'St.Dev'],ndigits=3)} "
              f"(Sample 1: {round(df.loc[row, 'mtDNA1'],ndigits=3)} vs Sample 2: {round(df.loc[row, 'mtDNA2'], ndigits=2)}) \n")





In [9]:
#Drop index, sort by St.Dev, and download the file
df = df.sort_values(by=['St.Dev'], ascending=False)
df = df.reset_index(drop=True)

df.to_excel("50_gcr_random_name_test_output.xlsx",
          index=False)

In [11]:
"**" in "dej9dje9w** dewidwei dwe"

True