In [1]:
import os
import scipy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from openpyxl import load_workbook

from scipy import stats as stats
from copy import copy as copy


In [2]:
# read in paths from config file

configDict = {
    'rootDir': '',
    'initialDataPath' : '',
    'QCDataPath' : '',
    # 'labWorksheet01Path':'',
    'projectName':''
}

# with open(os.path.join(workingDir, 'config.txt'),'r') as f:
with open('config.txt','r') as f:
    lines = f.readlines()
    for line in lines:
        if not line.startswith('#'):
            line = line.strip()
            fields = line.split(':')

            if fields[0].strip()=='initialDataPath':
                configDict[fields[0].strip()] = fields[1].strip().strip('\'')
            else:
                configDict[fields[0].strip()] = fields[1].strip().strip('\'')
## ToDo: Add checks to ensure that minimal fields have been populated. Raise errors or warnings


In [3]:
configDict

{'rootDir': '/Users/upton6/Documents/Nanostring/Projects/NS_msWTA/DSP_Output/Annotated/',
 'initialDataPath': 'Initial Dataset Updated.xlsx',
 'QCDataPath': 'QC_Updated.xlsx',
 'projectName': 'Subramaniam_msWTA',
 'DCCPath': ''}

# Define functions

In [4]:
class master_data:
    def __init__(self, dataPath):
        ### import data from excel workbook
        
        self.wb = load_workbook(dataPath)
        self.segWs = self.wb['SegmentProperties']
        self.cntWs = self.wb['BioProbeCountMatrix']
        
        self.segValues = [[y.value for y in x] for x in self.segWs[self.segWs.calculate_dimension()]]
        self.cntValues = [[y.value for y in x] for x in self.cntWs[self.cntWs.calculate_dimension()]]

        self.dropData = False
        
        
    def get_data(self):
        ### Convert nested list to a pandas dataFrame and extract expression data with labels
        cntData = self.cntValues
        cntCols = self.cntValues[0]
        df = pd.DataFrame(self.cntValues)
        cntIndex = [x[0] for x in self.cntValues[1:]]
        cntDF = pd.DataFrame(self.cntValues[1:], index=cntIndex, columns=cntCols)
        self.counts = cntDF.iloc[:,12:]
        self.counts = self.counts.astype(np.float64)      # Convert datatype to float64
        self.priobeInfo = cntDF.iloc[:,:12]
        segCols = self.segValues[0]
        segIndex = [x[4].replace(' | ',('_')) for x in self.segValues[1:]]
        self.segData = pd.DataFrame(self.segValues[1:], index=segIndex, columns=segCols)

        self.dataOrig = self.counts.copy()
        self.dataLog1 = np.log2(self.counts+1)            # Log transform data for QC and analysis steps

        self.probeClass = False                           # Keep a copy of the original data before transformation or normalisation
        # self.probeClass = df.iloc[self.targIdx:,2]      ### Index needs updating here also
        # self.probeClass.rename(index=rowLabels, inplace=True)
        # self.probeClass.rename(index='ProbeClass', inplace=True)

        # ToDo: Need to update probeclass handling and labeling
        
        self.probeClassDict = {
            'Positive': 'A',
            'Negative': 'B',
            'Control': 'C',
            'Endogenous': 'E'}

        return(self.counts, self.priobeInfo, self.segData)

    def get_descriptors(self):
        pass

    def add_class_mean(self, df):
        pass

    def drop_AOIs(self, includes, writeOrig=False):
        pass
        
    def set_threshold(self, threshold):
        self.threshold = threshold
        # ToDo: Check that all values in master data are also included in threshold dataFrame
        # ToDo: Convert threshold data to 0/1 data if needed

    def drop_probes(self, labels):
        pass
        


In [5]:
def plot_SA_Hist(surfArea):
    fig, ax = plt.subplots(figsize=(10,5))
    ax.hist(surfArea, bins=50)
    ax.set_xlabel('AOI Surface Area µm', fontdict=labelFont)
    ax.set_ylabel('Count', fontdict=labelFont)
    ax.set_title('AOI Surface Area distribution', fontdict=titleFont)
    print('Min SA')
    print(min(surfArea))
    print('Max SA')
    print(max(surfArea))
    return(fig)


In [6]:
# Plot log2 transformed raw data before any normalisation

def draw_probe_plot(dataRaw, dataSortedRaw, namedColourList, title, exp=False, violin=False):
    
    fig, ax = plt.subplots(figsize=(15,8))
    
    if exp:
        ax.boxplot(np.exp2(dataRaw.drop(labels=['mean','probeClass'], axis=1).reindex(labels=dataSortedRaw.index).T) -1, sym='-', labels=dataSortedRaw.index)
    else:
        ax.boxplot(dataRaw.drop(labels=['mean','probeClass'], axis=1).reindex(labels=dataSortedRaw.index).T, sym='-', labels=dataSortedRaw.index)

    
    if violin:
        if exp:
            ax.violinplot(np.exp2(dataRaw.drop(labels=['mean','probeClass'], axis=1).reindex(labels=dataSortedRaw.index).T) -1)

    #         ax.boxplot(np.exp2(dataRaw.drop(labels=['mean','probeClass'], axis=1).reindex(labels=dataSortedRaw.index).T) -1, sym='-', labels=dataSortedRaw.index)
        else:
            ax.violinplot(dataRaw.drop(labels=['mean','probeClass'], axis=1).reindex(labels=dataSortedRaw.index).T)

    #         ax.boxplot(dataRaw.drop(labels=['mean','probeClass'], axis=1).reindex(labels=dataSortedRaw.index).T, sym='-', labels=dataSortedRaw.index)
        
        
    else:


        for i,j in enumerate(dataSortedRaw.index):
            y = dataRaw.drop(labels=['mean','probeClass'], axis=1).loc[j]
            colours = [namedColourList[2] if v.split('_')[-1] == 'Tumour' else namedColourList[5] if v.split('_')[-1] == 'TME' else namedColourList[1] for v in y.index]
        #     colours = [colourList[2] if v.split('_')[-1] == 'Tumour' else colourList[5] if v.split('_')[-1] == 'Immune' else colourList[1] for v in y.index]
            y = y
            if exp:
                y = np.exp2(y.values)-1
            else:
                y = y.values

            x = np.random.normal(i+1, 0.1, len(y))

            for i in range(len(x)): 
                ax.plot(x[i], y[i], c=colours[i], marker='.', alpha=0.25)

    ax.set_xticks(np.arange(1,len(dataSortedRaw.index)+1,1))
    ax.set_xlabel=list(dataSortedRaw.index)
    
    print(len(np.arange(0,len(dataSortedRaw.index),1)))
    print(len(list(dataSortedRaw.index)))
    
    ax.tick_params(axis='x', labelrotation = 90)
    

    if exp:
        ax.semilogy()
        ax.set_title(title + ' (untransformed)', size=36)
        ax.set_ylabel('Probe value', size=24)
    else:
        ax.set_title(title + ' (Log2 transformed)', size=36)
        ax.set_ylabel('Log2 probe value', size=24)
#     plt.show()
    
    return(fig)


In [7]:
def probe_GeoMean_Plots(plotData, title=''):
    rows=1
    cols=2
    colours = [namedColourList[2] if x.split('_')[-1] == 'Tumour' else namedColourList[5] if x.split('_')[-1] == 'TME' else namedColourList[1] for x in HKGeoMean.index]

    fig,ax = plt.subplots(rows,cols, sharey=True, gridspec_kw={'width_ratios': [4,1]}, figsize=(15,5))
    ax[0].bar(np.linspace(1,len(plotData),len(plotData)), plotData, color=colours)
    ax[1].hist(plotData, bins=int(len(plotData)/10),orientation='horizontal', color='k')
    ax[0].set_xlim(0,len(plotData))
    
    ax[0].text(2,max(plotData)*.95,'Tumour', size=20, c=namedColourList[2])
    ax[0].text(2,max(plotData)*.825,'TME', size=20, c=namedColourList[5])
    ax[0].text(2,max(plotData)*.7,'Other', size=20, c=namedColourList[1])

    fig.suptitle(title, size=36)
    ax[0].set_ylabel('Probe Value', size=18)
    ax[0].set_xlabel('Probes', size=18)
    ax[1].set_xlabel('Count', size=18)

    fig.tight_layout()

In [8]:
class threshold_probes:
    def __init__(self, data, bins):
        
        self.data = data.drop(labels=['mean','probeClass'], axis=1)
        self.bins = bins
        self.thisHist = plt.hist(self.data.values.flatten(), bins = self.bins)

    def zoom_plot(self, start, end):
        plt.hist(self.data.values.flatten(), bins = self.bins)
        plt.xlim(0,3)        
        
    def check_threshold(self, start, end):
        print(self.thisHist[0][start:end])
        print(self.thisHist[1][start:end])

    def set_threshold_idx(self, idx):
        print(self.thisHist[0][idx])
        print(self.thisHist[1][idx])
        
        self.threshold_idx = idx
        self.threshold = self.thisHist[1][idx]

    def get_filter(self):
        self.ETfilter = self.data >= self.threshold
        return(self.ETfilter)


In [9]:
def binding_density_plot(sampleInfoExternal, selectedInfo, subSelection):
    # print('selectedInfo')
    # print(selectedInfo)
    
    if not (subSelection == None):
        selectedInfo = selectedInfo.loc[subSelection]
    if (type(selectedInfo) == pd.core.series.Series):
        selectedInfo = pd.DataFrame(selectedInfo).T
        
    comboUniques = []
    comboColourDictRev = {}
    for c in selectedInfo.columns:
        thisCol = selectedInfo[c]
        combined = '_'.join(thisCol.values)
        comboUniques.append(combined)
        comboColourDictRev[c] = combined
    comboUniques = sorted(list(set(comboUniques)))
    print('\nNumber of unique combinations: {}'.format(len(comboUniques)))
    # print(comboColourDictRev)

    gradient = np.linspace(0, 1, len(comboUniques))
    gradDict = dict(zip(comboUniques,gradient))
    
    sampleInfoExternal.sort_values(by=['Plate', 'Col', 'Row'], axis=1, inplace=True)
    # Binding Density plot:
    plt.figure(figsize=(40,10))
    my_cmap = plt.get_cmap("nipy_spectral")
    
    colours = []
    for c in sampleInfoExternal.columns:
        colours.append(gradDict[comboColourDictRev[c]])
    
    print('selectedInfo.index')
    print(list(selectedInfo.index))

    fig, ax = plt.subplots(figsize=(20,5))

    
    bar = ax.bar(sampleInfoExternal.columns,
            sampleInfoExternal.loc['BindingDensity'].values.astype(np.float32), 
            color=my_cmap(colours)
           )#, bottom=0)
    ax.set_title('_'.join(selectedInfo.index))
    ax.set_xticklabels(sampleInfoExternal.columns, rotation='vertical')
    # ax.legend()
    plt.show()

# ToDo: Add legend

# Import data from Nanostring initial dataset file.

In [10]:
dataPath = os.path.join(configDict['rootDir'],configDict['initialDataPath'])
dataPath = os.path.join(configDict['rootDir'],configDict['QCDataPath'])

masterData = master_data(dataPath)


counts, probes, segs = masterData.get_data()


counts

# probes

# segs

Unnamed: 0,PFAC_Liver_CD63_CD45_Col1A1 | 001 | Full ROI,PFAC_Liver_CD63_CD45_Col1A1 | 002 | Full ROI,PFAC_Liver_CD63_CD45_Col1A1 | 003 | Full ROI,PFAC_Liver_CD63_CD45_Col1A1 | 004 | Full ROI,PFAC_Liver_CD63_CD45_Col1A1 | 005 | Full ROI,PFAC_Liver_CD63_CD45_Col1A1 | 006 | Full ROI,PFAC_Liver_CD63_CD45_Col1A1 | 007 | Full ROI,PFAC_Liver_CD63_CD45_Col1A1 | 008 | Full ROI,PFAC_Liver_CD63_CD45_Col1A1 | 009 | Full ROI,PFAC_Liver_CD63_CD45_Col1A1 | 010 | Full ROI,...,SFAC_Female_2_SMA_CD45_Col1A1 | 032 | Full ROI,SFAC_Female_2_SMA_CD45_Col1A1 | 033 | Full ROI,SFAC_Female_2_SMA_CD45_Col1A1 | 034 | Full ROI,SFAC_Female_2_SMA_CD45_Col1A1 | 035 | Full ROI,SFAC_Female_2_SMA_CD45_Col1A1 | 036 | Full ROI,SFAC_Female_2_SMA_CD45_Col1A1 | 037 | Full ROI,SFAC_Female_2_SMA_CD45_Col1A1 | 038 | Full ROI,SFAC_Female_2_SMA_CD45_Col1A1 | 039 | Full ROI,SFAC_Female_2_SMA_CD45_Col1A1 | 040 | Full ROI,SFAC_Female_2_SMA_CD45_Col1A1 | 041 | Full ROI
16995,60.0,74.0,61.0,51.0,66.0,59.0,53.0,90.0,83.0,54.0,...,89.0,76.0,89.0,69.0,65.0,83.0,55.0,85.0,66.0,69.0
21460,120.0,170.0,157.0,123.0,196.0,161.0,227.0,175.0,188.0,204.0,...,268.0,254.0,296.0,152.0,227.0,187.0,129.0,305.0,234.0,228.0
23249,85.0,106.0,71.0,82.0,98.0,106.0,76.0,107.0,138.0,102.0,...,176.0,165.0,167.0,91.0,123.0,110.0,76.0,171.0,119.0,97.0
23932,98.0,77.0,65.0,66.0,74.0,82.0,70.0,90.0,108.0,97.0,...,109.0,100.0,109.0,75.0,104.0,97.0,76.0,132.0,82.0,66.0
28331,70.0,80.0,71.0,65.0,87.0,74.0,72.0,106.0,135.0,89.0,...,108.0,116.0,129.0,70.0,103.0,113.0,82.0,126.0,116.0,67.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20178,61.0,58.0,44.0,42.0,55.0,55.0,47.0,62.0,90.0,54.0,...,65.0,70.0,70.0,53.0,73.0,66.0,54.0,65.0,67.0,57.0
16328,64.0,68.0,80.0,65.0,61.0,73.0,60.0,75.0,100.0,92.0,...,92.0,122.0,91.0,63.0,77.0,100.0,74.0,114.0,91.0,86.0
31320,91.0,72.0,55.0,63.0,92.0,103.0,75.0,98.0,114.0,88.0,...,106.0,124.0,138.0,70.0,78.0,88.0,77.0,124.0,93.0,89.0
20649,27.0,24.0,30.0,17.0,22.0,27.0,19.0,31.0,55.0,29.0,...,38.0,46.0,51.0,26.0,35.0,44.0,16.0,43.0,28.0,29.0


ToDo: Need to check that column names are unique in SegmentProperties!! KO has 2 entries in current version, need to trace down where this has been introduced.

In [11]:
dropCols = ['SlideName', 'ScanLabel', 'ROILabel', 'SegmentLabel',
            'SegmentDisplayName', 'Origin Instrument ID', 'AOISurfaceArea', 
            'AOINucleiCount', 'ROICoordinateX', 'ROICoordinateY', 
            'RawReads', 'AlignedReads', 'DeduplicatedReads', 'TrimmedReads', 
            'StitchedReads', 'SequencingSaturation', 'SequencingSetID', 
            'UMIQ30', 'RTSQ30', 'GeoMxNgsPipelineVersion', 'LOT_Mouse_NGS_Whole_Transcriptome_Atlas_RNA_1_0',
            'ROIID', 'SegmentID', 'ScanWidth', 'ScanHeight', 'ScanOffsetX', 'ScanOffsetY']

keepCols = [x for x in segs.columns if not x in dropCols]

for c in keepCols:
    print(c)
    print(len(set(segs[c])))
    print(set(segs[c]))
    print()

KO
2
{'False', 'True'}

HFHCD
2
{'False', 'True'}

Control
2
{'False', 'True'}

WT
2
{'False', 'True'}

TAA
2
{'False', 'True'}

QCFlags
4
{None, 'Low Percent Aligned Reads,Low Percent Stitched Reads', 'Low Sequencing Saturation', 'Low Percent Aligned Reads'}

CD63
4
{'CD63H', 'CD63L', 'CD63N', 'nil'}

Inclusion
4
{'INCN', 'INCH', 'INCL', 'nil'}

Genotype
1
{'Genotype'}

Lipid_Vacoule
4
{'nil', 'LVN', 'LVL', 'LVS'}

Sex
2
{'Female', 'Male'}

a_Sma
4
{'a_SmaL', 'a_SmaN', 'a_SmaH', 'nil'}

Mouse_ID
75
{'', '10 7f', '13 1e', '13 3b', '10 6d', '146', '131', '130', '12 10b', '12 2b', '160', '159', '17 2d', '128', '10 6a', '140', '12 2f', '10 7b', '154', '152', '147', '144', '127', '13 3a', '148', '10 7e', '14 3h', '141', '15 4b', '142', '10 6f', '158', '10 6b', '156', '15 4c', '14 1g', '12 9j', '157', '14 4c', '12 4 f', '161', '10 7a', '12 2h', '13 3c', '139', '12 4f', '11 5d', '155', '12 9c', '14 2e', '10 6e', '129', '143', '138', '162', '135', '150', '12 5f', '15 3g', '14 5b', '145', '12 

In [12]:
segs

Unnamed: 0,SlideName,ScanLabel,ROILabel,SegmentLabel,SegmentDisplayName,Origin Instrument ID,KO,HFHCD,Control,WT,...,LOT_Mouse_NGS_Whole_Transcriptome_Atlas_RNA_1_0,CD45,Location,Genotype,ROIID,SegmentID,ScanWidth,ScanHeight,ScanOffsetX,ScanOffsetY
PFAC_Liver_CD63_CD45_Col1A1_001_Full ROI,PFAC_Liver_CD63_CD45_Col1A1,PFAC_Liver_CD63_CD45_Col1A1,001,Full ROI,PFAC_Liver_CD63_CD45_Col1A1 | 001 | Full ROI,2001G0086,True,True,False,False,...,MWTA12001,CD45L,,KO,4cc89552-e9d2-4107-964c-1e58ac3c6900,71184346-0220-429f-bc3a-bd68a2cb0c02,33657.613281,59290.984375,7747,3122
PFAC_Liver_CD63_CD45_Col1A1_002_Full ROI,PFAC_Liver_CD63_CD45_Col1A1,PFAC_Liver_CD63_CD45_Col1A1,002,Full ROI,PFAC_Liver_CD63_CD45_Col1A1 | 002 | Full ROI,2001G0086,True,True,False,False,...,MWTA12001,CD45H,,KO,62b0e64c-edc2-481a-92ca-612beb8c19d5,4cbbed26-11d9-4260-b828-e27ab8376f53,33657.613281,59290.984375,7747,3122
PFAC_Liver_CD63_CD45_Col1A1_003_Full ROI,PFAC_Liver_CD63_CD45_Col1A1,PFAC_Liver_CD63_CD45_Col1A1,003,Full ROI,PFAC_Liver_CD63_CD45_Col1A1 | 003 | Full ROI,2001G0086,True,True,False,False,...,MWTA12001,CD45L,,KO,54167cad-475b-4124-950d-5b3fbccec90d,322d8081-7f6e-4ea5-9e00-0ea5eb7e4dee,33657.613281,59290.984375,7747,3122
PFAC_Liver_CD63_CD45_Col1A1_004_Full ROI,PFAC_Liver_CD63_CD45_Col1A1,PFAC_Liver_CD63_CD45_Col1A1,004,Full ROI,PFAC_Liver_CD63_CD45_Col1A1 | 004 | Full ROI,2001G0086,True,True,False,False,...,MWTA12001,CD45L,,KO,dbfa4371-a846-4675-9a55-d0791fc3b242,a40307c4-c106-46d1-b722-cc8a148ea65a,33657.613281,59290.984375,7747,3122
PFAC_Liver_CD63_CD45_Col1A1_005_Full ROI,PFAC_Liver_CD63_CD45_Col1A1,PFAC_Liver_CD63_CD45_Col1A1,005,Full ROI,PFAC_Liver_CD63_CD45_Col1A1 | 005 | Full ROI,2001G0086,True,True,False,False,...,MWTA12001,CD45L,,KO,691b4e46-7930-4c5c-a9ac-b1e0008c4b2d,62d00afc-a042-4a76-80d7-022567aee9cf,33657.613281,59290.984375,7747,3122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SFAC_Female_2_SMA_CD45_Col1A1_037_Full ROI,SFAC_Female_2_SMA_CD45_Col1A1,SFAC_Female_2_SMA_CD45_Col1A1,037,Full ROI,SFAC_Female_2_SMA_CD45_Col1A1 | 037 | Full ROI,2001G0086,False,False,True,True,...,MWTA12001,CD45L,,WT,edfe6a85-bdfa-4640-96d3-ea41a5c44ae5,a9cb78d8-47bb-40da-8710-1bb13e8cff3e,35715.324219,57497.351562,6718,4019
SFAC_Female_2_SMA_CD45_Col1A1_038_Full ROI,SFAC_Female_2_SMA_CD45_Col1A1,SFAC_Female_2_SMA_CD45_Col1A1,038,Full ROI,SFAC_Female_2_SMA_CD45_Col1A1 | 038 | Full ROI,2001G0086,False,False,True,True,...,MWTA12001,CD45L,,WT,dbae3d6f-234c-4049-a809-6c260b7e0c98,979a4e6a-16ec-401e-a4ca-4f8418d8fd91,35715.324219,57497.351562,6718,4019
SFAC_Female_2_SMA_CD45_Col1A1_039_Full ROI,SFAC_Female_2_SMA_CD45_Col1A1,SFAC_Female_2_SMA_CD45_Col1A1,039,Full ROI,SFAC_Female_2_SMA_CD45_Col1A1 | 039 | Full ROI,2001G0086,False,False,True,True,...,MWTA12001,CD45L,,WT,85e4d689-f8a7-45c5-b7cd-cbc4c2a9bd97,871d4bde-5c46-40e5-b786-594dfb614bda,35715.324219,57497.351562,6718,4019
SFAC_Female_2_SMA_CD45_Col1A1_040_Full ROI,SFAC_Female_2_SMA_CD45_Col1A1,SFAC_Female_2_SMA_CD45_Col1A1,040,Full ROI,SFAC_Female_2_SMA_CD45_Col1A1 | 040 | Full ROI,2001G0086,False,False,True,True,...,MWTA12001,CD45L,,WT,9c5316e3-e622-4049-9438-20892a288cad,246a72ea-472f-46a4-8a7e-33bce0ea5c1c,35715.324219,57497.351562,6718,4019


In [13]:
set(segs['SlideName'])

{'PFAC_Liver_CD63_CD45_Col1A1',
 'SFAC_Female_2_SMA_CD45_Col1A1',
 'SKC_Liver_Female_2_SMA_CD45_Col1A1',
 'SKC_Liver_Male_SMA_CD45_Col1A1',
 'SKC_Male_Liver_2_SMA_CD45_Col1A1'}

In [14]:
expSlideDict = {}
expSlideDict['exp1'] = {'SlideName': ['PFAC_Liver_CD63_CD45_Col1A1']}
expSlideDict['exp2'] = {'SlideName': ['SFAC_Female_2_SMA_CD45_Col1A1']}
expSlideDict['exp3'] = {'SlideName': ['SKC_Liver_Female_2_SMA_CD45_Col1A1',
                         'SKC_Liver_Male_SMA_CD45_Col1A1',
                         'SKC_Male_Liver_2_SMA_CD45_Col1A1']}

exps = ['exp1', 'exp2', 'exp3']

In [15]:
expIndices = {}


for exp in exps:
    print(exp)

    thisExp = expSlideDict[exp]
    slideNames = thisExp['SlideName']

    masterIndex = []
    
    for s in slideNames:
        # print('s')
        # print(s)
        thisIndex = segs[segs['SlideName'] == s].index
        # print(list(thisIndex))
        masterIndex.extend(list(thisIndex))
    
    expIndices[exp] = masterIndex


exp1
exp2
exp3


In [16]:
expIndices

{'exp1': ['PFAC_Liver_CD63_CD45_Col1A1_001_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_002_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_003_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_004_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_005_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_006_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_007_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_008_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_009_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_010_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_011_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_012_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_013_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_014_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_015_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_016_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_017_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_018_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_019_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_020_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_021_Full ROI',
  'PFAC_Liver_CD63_CD45_Co

In [17]:
expIndices

{'exp1': ['PFAC_Liver_CD63_CD45_Col1A1_001_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_002_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_003_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_004_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_005_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_006_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_007_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_008_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_009_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_010_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_011_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_012_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_013_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_014_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_015_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_016_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_017_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_018_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_019_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_020_Full ROI',
  'PFAC_Liver_CD63_CD45_Col1A1_021_Full ROI',
  'PFAC_Liver_CD63_CD45_Co

In [18]:
for k,exp in expIndices.items():
    # print(exp)
    print(k)
    print(len(exp))

exp1
69
exp2
41
exp3
78


ToDo: Export experimental sets for use in StandR or RUV


 - StandR => countFile, featureAnnoFile, sampleAnnoFile
 - RUV.   => ???

ToDo: Choose column(s) to use for setting up primary experiment sets (eg. slide name) (save as nested dictionary (experiment name : column : value)
ToDo: Choose column variables to use for setting up secondary experiment sets  (save as nested dictionary (experiment name : columns : [values])



In [19]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [20]:
sampleInfoExternal = masterData.segData.copy()

In [21]:
sampleInfoExternal.columns

Index(['SlideName', 'ScanLabel', 'ROILabel', 'SegmentLabel',
       'SegmentDisplayName', 'Origin Instrument ID', 'KO', 'HFHCD', 'Control',
       'WT', 'TAA', 'QCFlags', 'AOISurfaceArea', 'AOINucleiCount',
       'ROICoordinateX', 'ROICoordinateY', 'RawReads', 'AlignedReads',
       'DeduplicatedReads', 'TrimmedReads', 'StitchedReads',
       'SequencingSaturation', 'SequencingSetID', 'UMIQ30', 'RTSQ30',
       'GeoMxNgsPipelineVersion', 'CD63', 'Inclusion', 'Genotype',
       'Lipid_Vacoule', 'Sex', 'a_Sma', 'Mouse_ID', 'Strain', 'Diet', 'ColA1',
       'Morphology', 'LOT_Mouse_NGS_Whole_Transcriptome_Atlas_RNA_1_0', 'CD45',
       'Location', 'Genotype', 'ROIID', 'SegmentID', 'ScanWidth', 'ScanHeight',
       'ScanOffsetX', 'ScanOffsetY'],
      dtype='object')

In [22]:
sampleInfoExternal['TrimmedReads']

PFAC_Liver_CD63_CD45_Col1A1_001_Full ROI       8166994
PFAC_Liver_CD63_CD45_Col1A1_002_Full ROI       9758881
PFAC_Liver_CD63_CD45_Col1A1_003_Full ROI       7537700
PFAC_Liver_CD63_CD45_Col1A1_004_Full ROI       7934115
PFAC_Liver_CD63_CD45_Col1A1_005_Full ROI       7558340
                                                ...   
SFAC_Female_2_SMA_CD45_Col1A1_037_Full ROI     9716622
SFAC_Female_2_SMA_CD45_Col1A1_038_Full ROI     7502563
SFAC_Female_2_SMA_CD45_Col1A1_039_Full ROI    14295781
SFAC_Female_2_SMA_CD45_Col1A1_040_Full ROI    10681395
SFAC_Female_2_SMA_CD45_Col1A1_041_Full ROI    10320545
Name: TrimmedReads, Length: 188, dtype: int64

In [23]:
print(masterData.probeClass)
print(masterData.probeClassDict)
# dataLog1External, sampleInfoExternal = masterData.drop_AOIs('#16_9', writeOrig=True)

nuclei = sampleInfoExternal['AOINucleiCount']
# print(nuclei)
surfArea = sampleInfoExternal['AOISurfaceArea']
# print(surfArea)

# dataLog1External, sampleInfoExternal = masterData.add_class_mean(masterData.dataLog1)

False
{'Positive': 'A', 'Negative': 'B', 'Control': 'C', 'Endogenous': 'E'}


In [24]:
slides = set(segs['SlideName'])

colLookup = dict(zip(slides,range(len(slides))))

In [25]:
colLookup

{'SKC_Liver_Female_2_SMA_CD45_Col1A1': 0,
 'PFAC_Liver_CD63_CD45_Col1A1': 1,
 'SFAC_Female_2_SMA_CD45_Col1A1': 2,
 'SKC_Liver_Male_SMA_CD45_Col1A1': 3,
 'SKC_Male_Liver_2_SMA_CD45_Col1A1': 4}

In [26]:
colMap = [colLookup[x]/len(slides) for x in segs['SlideName']]

# colMap = [colLookup[x] for x in segs['SlideName']]


In [27]:
colMap 

[0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.6,
 0.8,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.4,
 0.4,
 0.4,
 0.4,
 0.4,
 0.4,
 0.4,
 0.4,
 0.4,
 0.4,
 0.4,
 0.4,
 0.4,
 0.4,
 0.4,
 0.4,
 0.4,
 0.4,
 0.4,
 0.4