In [None]:
## To Do:

# Add a logging function to track use of files for input and output with timestamps

<div class="alert alert-block alert-success">
A green text box indicates a code cell that must be run, without alteration, to complete the workflow.
</div>

<div class="alert alert-block alert-warning">
An orange text box indicates an optional code cell that doesn't have to be run to complete the workflow, but can be run to complete optional tasks.
</div>

<div class="alert alert-block alert-info">
A blue text box indicates a code cell that requires user input - this cell also must be run to complete the workflow, but the user needs to modify the command in the cell.
</div>

<div class="alert alert-block alert-danger">
In addition, some text boxes contain particularly important information. These will be coloured red.
</div>

# <span style="color:green"> Import python functions </span>
<div class="alert alert-block alert-success">
    These packages should all be installed and available in your default environment. eResearch can help with installing modules and setting up environments. 
</div>


In [None]:
import itertools as itertools
import os
import subprocess
from copy import copy as copy

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as scipy
import scipy.stats  # loaded explicitly: the plotting cells call scipy.stats.linregress
from openpyxl import load_workbook




# <span style="color:green"> Define/Import custom functions </span>
<div class="alert alert-block alert-success">
    Custom functions for this workflow are imported from the functions folder. 
</div>

In [None]:
from functions.masterdata import (
        master_data, 
        read_Surf_Areas, 
        readConfig, 
        make_locate_list, 
        enter_locations, 
        read_plate_info,
        getUniqueCombos,
        check_plate_info,
        infer_plate_info)

from functions.plotting import (
        plot_SA_Hist, 
        draw_probe_plot, 
        probe_GeoMean_Plots, 
        threshold_probes, 
        get_colour_mapping, 
        binding_density_plot
    )


# <span style="color:orange"> Configure output options for this run </span>

<div class="alert alert-block alert-warning">
    The writeOutput variable below enables high level control for whether output files are written. This can be turned off to prevent overwriting existing files. <br>
    AutoRunProject allows the selection of a folder location from a projects.txt config file
</div>

In [None]:
# Run-level switches (toggle by swapping which line of each pair is commented out).

# writeOutput: checked before the fail-AOI / fail-probe / pre-normalisation CSV files are written.
writeOutput = True
# writeOutput = False

# autoRunProject: when True the working folder is read from projects.txt,
# otherwise the user is prompted for it.
autoRunProject = True
# autoRunProject = False


## <span style="color:blue"> Read in config file </span>
<div class="alert alert-block alert-info">
    If running in autoRunProject mode, a plain text file named "projects.txt" can be created in the DSP_EDA_Protein folder. This file is used to hold all current projects, with one project per line. Inactive projects or comment lines start with a #. The project to run must be uncommented. The project names must be the same as the folder name that contains the config files on the same level as the DSP_EDA_Protein folder.<br>
A config file (config.txt) or project file (project.txt) must be present in the folder given in the projects file or can be entered in the text below.
</div>

In [None]:
# Resolve the working folder for this run and make it the current directory.
#
# In autoRunProject mode the folder name is read from projects.txt: one project per
# line, blank lines and lines starting with '#' are ignored. If the file is missing
# or holds no active project, the user is prompted instead, so `subfolder` is always
# defined before os.chdir is reached.
folderPrompt = "Enter the name of the working folder (Must be same level as code folder)"
subfolder = None

if autoRunProject:
    try:
        with open('projects.txt', 'r') as f:
            activeProjects = [entry.strip() for entry in f
                              if entry.strip() and not entry.strip().startswith('#')]
    except FileNotFoundError:
        activeProjects = None

    if activeProjects is None:
        print('projects.txt was not found - please enter the working folder manually.')
    elif len(activeProjects) == 0:
        print('projects.txt contains no active (uncommented) project - please enter the working folder manually.')
    else:
        if len(activeProjects) > 1:
            # Previous behaviour is kept (the last active line wins) but is now reported.
            print(f'Warning: {len(activeProjects)} active projects found in projects.txt; using the last one.')
        subfolder = activeProjects[-1]

if subfolder is None:
    subfolder = input(folderPrompt)

print(subfolder)
# The working folder sits on the same level as the code folder.
os.chdir("../" + subfolder)


In [None]:
configDict = readConfig()

# <span style="color:green"> Import data from Nanostring initial_dataset file </span>
<div class="alert alert-block alert-success">
    Initial dataset.xlsx file must be located in the folder specified as rootDir above.
</div>

In [None]:
# Build the path to the initial dataset from the config entries and load it.
dataPath = os.path.join(configDict['rootDir'],configDict['initialDataPath'])
print(f'dataPath : {dataPath}\n')

masterData = master_data(dataPath)
# Read in master data and clean sample names.
dataExternal, dataLog1External, sampleInfo = masterData.get_data(fix_zeros=True, clean_names=True)
# Replaces dataLog1External / sampleInfo with the versions returned by add_class_mean;
# later cells rely on the 'mean' and 'probeClass' columns being present in dataLog1External.
dataLog1External, sampleInfo = masterData.add_class_mean(masterData.dataLog1)
# Per-AOI annotation rows used by the QC plots further down.
nuclei = sampleInfo.loc['AOI nuclei count']
surfArea = sampleInfo.loc['AOI surface area']


# <span style="color:blue"> Infer the sample locations on the processing plates </span>
<div class="alert alert-block alert-info">
    The locations of each individual sample on the processing plates can be inferred from the labworksheet files. Place this file in the rootDir location. If all AOIs have a unique surface area the script should identify the location of each sample automatically. If samples have identical surface areas, sample plate and well (col and row coordinates) will need to be input manually when prompted below.

    NOTE: if samples are re-named or new features are added, the sampleInfoWithWells files should be deleted or manually updated.
</div>

In [None]:
# ToDo: Update master sample info to include plate and well info    !!!!!!!!!!

In [None]:
# Attach plate / well information to sampleInfo, either from a previously saved
# file or by inferring it from the lab worksheets.
infoPath = os.path.join(configDict['rootDir'], configDict['sampleInfoFile'])

# Every config key starting with 'labWorksheet' holds a comma-separated list of
# worksheet names; flatten them into one list and strip the whitespace that a
# ', ' separator would otherwise leave on each name.
worksheets = [configDict[x].split(',') for x in configDict.keys() if x.startswith('labWorksheet')]
worksheets = [name.strip() for name in itertools.chain(*worksheets)]

if os.path.isfile(infoPath):
    print(f'reading plate info from file : {infoPath}')
    sampleInfo = read_plate_info(masterData, infoPath)
    masterData.sampleInfo = sampleInfo
    writePlateInfoOutput = False
else:
    print('inferring plate info')
    sampleInfo = infer_plate_info(sampleInfo ,configDict['rootDir'], worksheets)
    masterData.sampleInfo = sampleInfo
    writePlateInfoOutput = True

plateInfoComplete = check_plate_info(sampleInfo)

# Repeat the inference until check_plate_info reports the plate info as complete.
# NOTE(review): this loops forever if infer_plate_info can never complete the info - confirm.
while not(plateInfoComplete):
    print('completing plate info')
    sampleInfo = infer_plate_info(sampleInfo ,configDict['rootDir'], worksheets)
    masterData.sampleInfo = sampleInfo
    writePlateInfoOutput = True
    plateInfoComplete = check_plate_info(sampleInfo)

## Write sample info with well data included to file:
if writePlateInfoOutput:
    # os.path.dirname works with any path separator; slicing at rfind('/') silently
    # chopped the last character off the path when no '/' was present.
    sampleInfo.to_csv(os.path.join(os.path.dirname(dataPath), 'sampleInfo_with_wells.csv'))


# ToDo: Check whether sampleInfo has been updated in masterData

In [None]:
# Check if any values for well, row, col or plate are blank
# (evaluates to True when at least one entry in those four rows is null)
sampleInfo.loc[['Well','Row','Col','Plate']].isnull().any().any()

In [None]:
# Sanity check
# View sampleInfo entries for well, row, col or plate
sampleInfo.loc[['Well','Row','Col','Plate']]

# <span style="color:green"> Basic QC and data overview plots </span>
<div class="alert alert-block alert-success">
    The cells below output basic QC plots to start getting a look at and feel for the data. 
</div>


## <span style="color:orange"> Choose factors of interest for data visualisation </span>

<div class="alert alert-block alert-warning">
Select factors to use for data visualisation. These factors will be used to generate colour mappings for plotting data. <br>
    Re-running the checkbox generation cell below will clear selections and generate empty checkboxes.
</div>

In [None]:
# ToDo: Check if selected data is written to config file
# If written already, use this by default, otherwise ask for selection

<i>INFO: The input cell does not advance to the next cell. Run the first cell below to generate the checkboxes, then select checkboxes and run the second cell below to read the selected checkboxes. Re-running the first cell below will clear selections and generate empty checkboxes.</i>

In [None]:
# input("Press Enter to continue and choose factors of interest via the checkboxes generated in the next cell.")

In [None]:
# checkbox generation cell
# Builds one unticked checkbox per sampleInfo row label and displays them stacked
# vertically. Re-running this cell recreates the widgets, clearing any selection.

# if (len(configDict['selectedData']) == 0):

data = list(sampleInfo.index)
checkboxes = [widgets.Checkbox(value=False, description=label) for label in data]
output = widgets.VBox(children=checkboxes)
display(output)

In [None]:
# Add every ticked checkbox to configDict['selectedData'].
# Factors already present are skipped, so re-running this cell does not create
# duplicates. The previous test wrapped the description in a list, which never
# matched the string entries and therefore always appended.
# if (len(configDict['selectedData']) == 0):
for box in checkboxes:
    if box.value and box.description not in configDict['selectedData']:
        configDict['selectedData'] = configDict['selectedData'] + [box.description]
print('configDict[selectedData] :\t{}'.format(configDict['selectedData']))
print('length : {}'.format(len(configDict['selectedData'])))

<div class="alert alert-block alert-warning">
To write selected factors to the config file, set newConfig to true in the cell below. Make sure the config file contains a single selectedData line after running this.
</div>

In [None]:
# ToDo: update to determine whether a new config has been added and should be written to config.txt. Perhaps write a final version of the config file once whole script has been run.
# Set newConfig to True to append a 'selectedData : a, b, c' line, built from the
# currently ticked checkboxes, to config.txt in the current working directory.
# The file is opened in append mode, so each run with newConfig = True adds another line.
# newConfig = True
newConfig = False
if newConfig:
    with open('config.txt', 'a') as config:
        config.write('selectedData : ')
        config.write(', '.join([x.description for x in checkboxes if x.value == True]))
        config.write('\n')



### <span style="color:green"> View and clean annotations (Unique factor variable name check)</span>
<div class="alert alert-block alert-success">
    Quick check that variable names are entered correctly. 
</div>

<div class="alert alert-block alert-danger">
Using clean and consistent data is essential for correctly selecting sample groups for analysis. This section is designed to identify the unique variables that have been entered for each factor of interest at the sample annotation stage. Please review the factor variables carefully to make sure that all expected variables are present, and there are no unexpected variations of variable names. Variables are case sensitive and any differences will generate a unique variable. 
</div>

In [None]:
# Build the list of factors to inspect: the configured selection plus 'Row' and 'Col'.
# dict.fromkeys removes duplicates while keeping first-seen order, so the row order
# of selectedInfo is the same on every run (a set gave an arbitrary order), and the
# config list is copied rather than extended in place.
print(configDict['selectedData'])
selectedFactors = list(dict.fromkeys(list(configDict['selectedData']) + ['Row', 'Col']))
print(selectedFactors)
configDict['selectedData'] = selectedFactors.copy()

# Annotation rows for the selected factors; blanks are made explicit so that they
# show up as their own variable in the uniqueness checks below.
selectedInfo = sampleInfo.loc[selectedFactors]
selectedInfo = selectedInfo.fillna(value='AutoFilledBlank')
# selectedInfo.drop_duplicates(keep='first', inplace=True)
selectedInfo

In [None]:
# Unique factor variables are printed below

## ToDo: Make sure all names are printed and add count of variable names

# stack() turns selectedInfo into a (factor, sample) indexed Series; grouping on
# level 0 (the factor) then collects the sorted unique values of each factor into a list.
infoValues = selectedInfo.stack().groupby(level=0).apply(lambda x: sorted(x.unique()))
infoValues

#### factors unique combinations table

In [None]:
# infoValues = pd.DataFrame(infoValues).T

# # ToDo: Break lists down into separate rows

# infoValues.style.set_table_styles(
#                         [{
# 'selector': 'th',
#    'props': [
# ('background-color', 'black'),
#        ('color', 'white'),
#        ('border-color', 'black'),
#        ('border-style ', 'solid'),
#        ('border-width','1px')]  
# },
# {
#    'selector': 'td',
#    'props': [
#        ('border-color', 'black'),
#        ('border-style ', 'solid'),
#        ('border-width','1px')]
# },
# {'selector': '.row_heading',
#       'props': [('display', 'none')]},
# {'selector': '.blank.level0',
#       'props': [('display', 'none')]}])

In [None]:
getUniqueCombos(selectedInfo)

In [None]:
# ToDo: Add in functionality to view number of samples in different subgroups

### <span style="color:green"> Write Factor lookup tsv file </span>
<div class="alert alert-block alert-success">
    Writes the unique variables found for each selected factor to factor_lookup.tsv in the current working directory. 
</div>


In [None]:
# Check the current working directory. Should be folder for config files etc.

os.getcwd()

In [None]:
# Write one line per factor to factor_lookup.tsv in the current working directory.
# Values are converted with str() first because str.join raises TypeError on
# non-string entries (for example numeric Row / Col values).
with open('factor_lookup.tsv', 'w') as f:
    for idx in infoValues.index:
        f.write(str(idx))
        f.write(': ')
        f.write('\t'.join(str(value) for value in infoValues[idx]))
        f.write('\n')

# Line format -> factor: variable1<TAB>variable2

## <span style="color:orange"> Choose factors to use for colour selection in the plots below </span>

<div class="alert alert-block alert-warning">
Use the cell below to select the factors to use for colour map generation in some of the figures below.<br><br>
Set re-select to True to select new factors, or False to continue with current selection.
</div>

In [None]:
# selected_data = []
# Toggle by swapping which of the two lines below is commented out (same
# convention as the other switches in this notebook).
# reselect = True
reselect = False

if reselect:
    # Offer the currently selected factors again as a fresh set of unticked checkboxes.
    print('reselecting')
    data = list(selectedInfo.index)
    checkboxes = [widgets.Checkbox(value=False, description=label) for label in data]
    output = widgets.VBox(children=checkboxes)
    display(output)

In [None]:
if reselect:
    print('reselecting')
    # Add the ticked factors to the selection, skipping any already present so that
    # re-running the cell cannot create duplicate entries.
    for box in checkboxes:
        if box.value and box.description not in configDict['selectedData']:
            configDict['selectedData'] = configDict['selectedData'] + [box.description]
    print(configDict['selectedData'])


In [None]:
selectedInfo = selectedInfo.loc[configDict['selectedData']]
selectedInfo

In [None]:
configDict['selectedData']

In [None]:
configDict['selectedData']

ToDo: Add colour lookup dictionary

## <span style="color:green"> Plot distribution of AOI surface areas </span>
<div class="alert alert-block alert-success">
    Ideally, the AOI size can be kept the same for all AOIs; however, this is likely impossible or impractical for most studies. Here we plot the distribution of AOI sizes to get a clear view of the different AOI sizes present in our data set. This may become important in later steps when deciding on normalisation approaches, data thresholding and exclusion of outlier AOIs. 
</div>


In [None]:
# ToDo: Add colouring for different AOI types
SAHist = plot_SA_Hist(surfArea)


## <span style="color:green"> Plot Binding Density histograms (in plate order) </span>
<div class="alert alert-block alert-success">
Binding density is calculated per lane on the nCounter cartridge. This is equivalent to the columns on the hybridisation plate. The binding density should be similar, but does not need to be exactly the same.
</div>

In [None]:
# generate binding density plots

# First call passes the whole list of selected factors in one go.
# NOTE(review): the loop below passes a single factor name instead - confirm that
# binding_density_plot accepts both a list and a single string.
binding_density_plot(sampleInfo, selectedInfo, configDict['selectedData'])
# binding_density_plot(sampleInfo, selectedInfo, 'Col')

# One plot per selected factor.
for s in selectedFactors:
    binding_density_plot(sampleInfo, selectedInfo, s)
    

## <span style="color:green"> Visualise raw probe values before any normalisation </span>
<div class="alert alert-block alert-success">
Checking the raw probe values can help identify any systemic issues that may be present in the data set.
</div>

<div class="alert alert-block alert-danger">
The plots below are arranged with the individual probes on the x axis and probe count on the y axis. Each sample is represented by a dot for the probe value for each probe.
</div>

In [None]:
# Show the selected data rows that are currently selected
print('configDict[\'selectedData\'] :\t' + str(configDict['selectedData']))

In [None]:
# Show some of the log transformed data to be plotted
dataLog1External

In [None]:
# Sort probes by class, then by mean, so related probes sit together on the x axis.
dataSortedRaw = dataLog1External.sort_values(by = ['probeClass', 'mean'], ascending=[True,True])

# for s in configDict['selectedData']:
for s in selectedFactors:
    # One output file per factor: with a single fixed file name every iteration
    # overwrote the previous factor's figure. Characters that are not letters or
    # digits are replaced so the factor name is safe to use in a file name.
    safeName = ''.join(ch if ch.isalnum() else '_' for ch in str(s))
    draw_probe_plot(dataSortedRaw, sampleInfo, selectedInfo, s,
                    'Raw Probe Values \"' + s + '\"',
                    savefig=os.path.join(os.getcwd(), 'probe_Plot_' + safeName + '.svg'))
    


In [None]:
dataLog1External

In [None]:
sampleInfo

In [None]:
# ToDo: Add cmap for a single factor

# Scatter of HYB-NEG (back-transformed with 2**x - 1) against HYB-POS (left log2 transformed).
# The 'mean' and 'probeClass' summary columns are removed so only per-sample values remain.
perSampleValues = dataLog1External.drop(labels=['mean','probeClass'], axis=1)
hybNegCounts = np.exp2(perSampleValues.loc['HYB-NEG'])-1
plt.scatter(hybNegCounts, perSampleValues.loc['HYB-POS'])
plt.title('Negative vs Positive probe values')
plt.xlabel('HYB-NEG')
plt.ylabel('HYB-POS (Log2 transformed)')

In [None]:
# ToDo: Add cmap for a single factor

# Scatter of HYB-NEG (back-transformed with 2**x - 1) against AOI surface area.
# surfArea is reindexed to the sample columns so both axes use the same sample order.
perSampleValues = dataLog1External.drop(labels=['mean','probeClass'], axis=1)
hybNegCounts = np.exp2(perSampleValues.loc['HYB-NEG'])-1
plt.scatter(hybNegCounts, surfArea.reindex(perSampleValues.columns))
plt.title('Negative probe values vs Surface Area ')
plt.xlabel('HYB-NEG')
plt.ylabel('AOI Surface Area')

## <span style="color:red"> Raw probe QC questions / checks: </span>

<div class="alert alert-block alert-danger">
Info on important questions to consider from QC steps
</div>

# <span style="color:orange"> Determine thresholding value for limit of detection for assay </span>

<div class="alert alert-block alert-warning">
In the following section a threshold can be set for primary interrogation of the assay limit of detection and probes and samples that may be usable for further analysis.<br><br>

After running through the following cells and deciding a threshold to use for this study, this threshold can be added to the config file to be automatically imported for future runs.<br><br>

The threshold should be set as the first local minimum closest to the start of the 'real' values.


Flesh this documentation out more !!!

</div>

In [None]:
# ToDo: Add plots where hyb-neg and negative controls are shown with separate cmaps

In [None]:
# Try to read in probeThreshold from config file.
# probeThresholdSet records whether a stored value was found; later cells use it to
# decide whether to fall back to a default and whether to append the value to config.txt.
# NOTE(review): the stored value is used with '%d' formatting later on - confirm that
# readConfig returns it as a number rather than a string.

probeThresholdSet = 'probeThresholdIdx' in configDict
if probeThresholdSet:
    probeThresholdIdx = configDict['probeThresholdIdx']
    print('Previous probe threshold was found')
else:
    print('No probe threshold was found')


In [None]:
thresholding = threshold_probes(dataLog1External, 300)

In [None]:
#Set index value to use for minimum expression threshold from thresholding.

# Fallback used only when no probeThresholdIdx entry was found in the config file.
if not(probeThresholdSet):
    probeThresholdIdx = 38 # Threshold can be set here if it was not read into the configDict

thresholding.set_threshold_idx(probeThresholdIdx)

In [None]:
thresholding.zoom_plot(0.5,3)

In [None]:
thresholding.check_threshold(0,50)
thresholding.check_threshold(25,40)

In [None]:
thresholding.check_threshold(32,33)


In [None]:
probeThresholdIdx

In [None]:
# Report the chosen threshold, then fetch the filter object used by the pass/fail counts below.
print('threshold Index :\t%d' % probeThresholdIdx)
print('threshold point :\t%6.3f' % thresholding.threshold)
ETFiltered = thresholding.get_filter()

In [None]:
# Persist the threshold index only when it did not come from the config file,
# by appending a 'probeThresholdIdx : <value>' line to config.txt.
if not probeThresholdSet:
    with open('config.txt', 'a') as configFile:
        configFile.write('probeThresholdIdx : ' + str(probeThresholdIdx) + '\n')

## <span style="color:green"> Identify outlier AOIs and probes </span>
<div class="alert alert-block alert-success">
    text here 
</div>


## <span style="color:green"> Plot counts of AOIs and probes passing thresholds </span>
<div class="alert alert-block alert-success">
text here<br><br>

describe fail AOI and fail probe files
</div>

In [None]:
# Folder that holds the initial dataset; the fail-AOI / fail-probe CSV files are written here.
# os.path.dirname handles any path separator (slicing at rfind('/') dropped the last
# character of the path when no '/' was present).
exportPath = os.path.dirname(dataPath)

In [None]:
# get counts of how many probes are above the expression threshold for each AOI
# (column-wise sum of the threshold filter: one count per AOI)
passAOI = ETFiltered.sum()
thisMin = min(passAOI)
thisMax = max(passAOI)
print(f'thisMin :\t{thisMin}')
print(f'thisMax :\t{thisMax}')

# Histogram of the per-AOI counts; x limits are padded by 2 either side of the observed range.
plt.title('Probes passing threshold per AOI')
plt.xlabel('probes passing threshold')
plt.ylabel('AOI count')
plt.hist(passAOI.values, bins=20)
plt.xlim(thisMin-2,thisMax+2)
# plt.semilogy()

In [None]:
passAOI.sort_values().index[:5]

In [None]:
# AOIs with fewer than 35 probes above the expression threshold are flagged as failing.
# NOTE(review): 35 is a hard-coded, study-specific cut-off - review it against the histogram above.
failAOIs = passAOI[passAOI < 35]
failAOIs.index

In [None]:
sampleInfo[failAOIs.index]

In [None]:
# ToDo: Move this to config file
# writeOutput = True
if writeOutput:
    failAOIs.to_csv(os.path.join(exportPath,'failAOIs.csv'))
# writeOutput= False


In [None]:
# plot how many AOIs are above the expression threshold for each probe
# (row-wise sum of the threshold filter: one count per probe)

passProbe = ETFiltered.sum(axis=1)
thisMin = min(passProbe)
thisMax = max(passProbe)

# Histogram of the per-probe counts; x limits are padded by 5 either side of the observed range.
plt.title('AOIs where probe passes threshold (AOI count per probe)')
plt.xlabel('AOIs passing threshold')
plt.ylabel('probe count')
plt.hist(passProbe.values, bins=30)
plt.xlim(thisMin-5,thisMax+5)
# plt.semilogy()

In [None]:
# ToDo: Ask for a value to use for fail probes cutoff
# Use the group by factor function (need to finish and move to a method)

In [None]:
# ToDo: Sort this output
failProbes = passProbe[passProbe < 48]
failProbes

In [None]:
# Probes passing the threshold in fewer than 100 AOIs are flagged as failing.
# NOTE(review): this reassigns failProbes, replacing the result of the '< 48' cut-off
# in the previous cell, so this is the version that gets written to FailProbes.csv.
# Both cut-offs are hard-coded and study specific.
failProbes = passProbe[passProbe < 100]
failProbes

In [None]:
# ToDo: Move failProbes to config file ?

# writeOutput = True
if writeOutput:
    failProbes.to_csv(os.path.join(exportPath,'FailProbes.csv'))
# writeOutput= False


In [None]:
masterData.set_threshold(ETFiltered)

In [None]:
# masterData.threshold


# <span style="color:green"> ERCC correct data </span>
<div class="alert alert-block alert-success">
    ERCC correction is the first essential step in data normalisation. This step accounts for differences in hybridisation efficiency and pipetting inaccuracies.
</div>

In [None]:
erccData = masterData.ERCC_norm()
erccData, _ = masterData.add_class_mean(erccData)


In [None]:
erccData

# <span style="color:green"> Drop outlier AOIs and probes </span>
<div class="alert alert-block alert-success">
Some samples or probes may need to be dropped permanently. This can be done in the following steps.<br><br>


</div>

In [None]:
# ToDo: Add in options for interactively dropping samples.
# ToDo: read in AOIs and probes to be droped from csv files?


In [None]:
# dropSamples: set to True to drop the AOIs listed under 'dropSamples' in the config file.
dropSamples = False
# dropSamples = True
# dropSamplesTemp: AOI names to append to config.txt as a new 'dropSamples' line
# (written by a later cell when the list is non-empty).
dropSamplesTemp = []


In [None]:
# ToDo: Test this code with multiple projects

# Drop the AOIs listed in the config file, then redo the ERCC correction.
# `dropSamples is True` (rather than a plain truth test) means that re-running the
# cell after dropSamples has been replaced by a pd.Index skips the drop instead of
# raising "truth value of an Index is ambiguous".
if dropSamples is True:
    if 'dropSamples' in configDict:
        # Config entry is a comma-separated list; names may be wrapped in single quotes.
        dropSamples = pd.Index([x.strip().strip('\'') for x in configDict['dropSamples'].split(',')])

        print('dropping samples')
        erccData, sampleInfo = masterData.drop_AOIs(list(dropSamples), writeOrig=True)
        print(f'erccData.shape {erccData.shape}')
        # ERCC correction needs to be repeated after dropping AOIs
        erccData = masterData.ERCC_norm()
        erccData, sampleInfo = masterData.add_class_mean(masterData.ERCCData)
    else:
        # Previously this path fell through to list(True) and raised a TypeError.
        print('dropSamples is enabled but the config file has no dropSamples entry - no samples were dropped.')

erccDataSorted = erccData.sort_values(by = ['probeClass', 'mean'], ascending=[True,True])



In [None]:
# # # ToDo: Tidy up handling of datasets within master data. There are too many copies at the moment!
# # # ERCC correction needs to be repeated after dropping AOIs

# if (type(dropSamples) == pd.core.indexes.base.Index):
#     print('dropping samples')
#     erccData, sampleInfo = masterData.drop_AOIs(list(dropSamples), writeOrig=True)
#     print(f'erccData.shape {erccData.shape}')
#     erccData = masterData.ERCC_norm()
#     erccData, sampleInfo = masterData.add_class_mean(masterData.ERCCData)

# erccDataSorted = erccData.sort_values(by = ['probeClass', 'mean'], ascending=[True,True])


In [None]:
# Append the temporary drop list to config.txt, but only when no drop list was
# loaded from the config (dropSamples is then not a pandas Index) and there is
# something to write.
dropListFromConfig = type(dropSamples) == pd.core.indexes.base.Index
if (not dropListFromConfig) and (len(dropSamplesTemp) != 0):
    print('writing drop samples to config file.')
    with open('config.txt', 'a') as config:
        config.write('dropSamples : ' + ', '.join(dropSamplesTemp) + '\n')



# <span style="color:green"> Plot negative controls and housekeeping controls from raw data </span>
<div class="alert alert-block alert-success">
It's probably not necessary to view the following, but let's just have a quick look at how the control values for the raw data are distributed to make sure there are no extreme outliers before ERCC normalisation.

</div>

In [None]:
# Calculate Ig control Geometric mean and Housekeeping gene Geometric mean
# Calculating the mean of log transformed data is analogous to calculating the geometric mean of the non-transformed data

# Per-sample values only: the 'mean' and 'probeClass' summary columns are removed.
perSampleValues = dataLog1External.drop(labels=['mean','probeClass'], axis=1)

# Housekeeping mean per sample, sorted ascending; this order is reused for the other series.
HKGeoMean = perSampleValues.loc[masterData.HK].mean().sort_values()
orderedValues = perSampleValues.reindex(labels=HKGeoMean.index, axis=1)

IgGeoMean = orderedValues.loc[masterData.IgCTLs].mean()

# Combined list of housekeeping + Ig control probes, built on a copy so that
# masterData.HK itself is left untouched.
HKIgCtls = masterData.HK.copy()
HKIgCtls.extend(masterData.IgCTLs)

HKIgCtlGeoMeans = orderedValues.loc[HKIgCtls].mean()

print(f'masterData.HK :\t{masterData.HK}')
print(f'HKIgCtls :\t{HKIgCtls}')
print(f'HKGeoMean :\t{HKGeoMean}')


In [None]:
# ToDo: Fix up cmap for these plots. Choose a factor to colour by.

probe_GeoMean_Plots(HKGeoMean, 'House-Keeping probe geometric mean distribution')

probe_GeoMean_Plots(IgGeoMean, 'Ig probe geometric mean distribution')

In [None]:
probe_GeoMean_Plots(HKIgCtlGeoMeans, 'HK and Ig probe geometric mean distribution')

In [None]:
# Combined list of housekeeping and Ig control probe names.
# list.extend works in place and returns None, so the previous form set HKIgCtls to
# None and permanently appended the Ig controls to masterData.HK (again on every
# re-run). Concatenating copies leaves masterData.HK untouched.
HKIgCtls = list(masterData.HK) + list(masterData.IgCTLs)
print(f'masterData.HK :\t{masterData.HK}')


In [None]:
###
# Matrix of HK expression plots
# 3 x 3 grid: probe names on the diagonal, pairwise scatter plots (annotated with the
# linear-regression r value and slope) everywhere else. All panels share axis limits.

my_cmap, colours = get_colour_mapping(sampleInfo, selectedInfo.loc['Col'])
rows = 3
cols = 3

fig, axes = plt.subplots(rows,cols, figsize=(15,15))

# Per-sample values only ('mean' / 'probeClass' summary columns removed)
perSampleValues = dataLog1External.drop(labels=['mean','probeClass'], axis=1)
S6 = list(perSampleValues.loc['S6'].values)
H3 = list(perSampleValues.loc['Histone H3'].values)
GAPDH = list(perSampleValues.loc['GAPDH'].values)

labels = ['S6', 'H3', 'GAPDH']
dataList = [S6, H3, GAPDH]

# Common integer axis range covering every series
axMin = int(min(min(series) for series in dataList))
axMax = int(max(max(series) for series in dataList))+1

for r, c in itertools.product(range(rows), range(cols)):
    panel = axes[r][c]
    if r==c:
        # This is a diagonal, insert a label
        panel.text(0.5,0.5, labels[r],ha='center', va='center', transform=panel.transAxes,fontsize=36)
        continue
    panel.scatter(dataList[c], dataList[r], color=my_cmap(colours)) #ToDo: Add a cmap, c=colours)
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(dataList[c], dataList[r])
    panel.text(0.9,0.9, "r_value = " + str(r_value)[:5],ha='right', va='center', transform=panel.transAxes)
    panel.text(0.9,0.8, "slope = " + str(slope)[:5],ha='right', va='center', transform=panel.transAxes)
    panel.set_xlim(axMin,axMax)
    panel.set_ylim(axMin,axMax)



In [None]:

# Matrix of IgG expression plots
# 3 x 3 grid: control names on the diagonal, pairwise scatter plots (annotated with the
# linear-regression r value and slope) everywhere else. All panels share axis limits.
# Uses my_cmap / colours created in the HK matrix cell.
rows = 3
cols = 3

fig, axes = plt.subplots(rows,cols, figsize=(15,15))

# Per-sample values only ('mean' / 'probeClass' summary columns removed)
perSampleValues = dataLog1External.drop(labels=['mean','probeClass'], axis=1)
mIG1 = list(perSampleValues.loc['Ms IgG1'].values)
mIG2 = list(perSampleValues.loc['Ms IgG2a'].values)
rIG = list(perSampleValues.loc['Rb IgG'].values)

# Diagonal labels now match the probes actually plotted (the second panel shows
# 'Ms IgG2a' but was labelled 'Ms_IgG2b').
labels = ['Ms_IgG1', 'Ms_IgG2a', 'Rb_IgG']
# labels = ['Rt_IgG2a', 'Rt_IgG2b', 'Rb_IgG']
dataList = [mIG1,mIG2,rIG]

# Common integer axis range covering every series
axMin = int(min([min(x) for x in dataList]))
axMax = int(max([max(x) for x in dataList]))+1


for r in range(rows):
    for c in range(cols):
        if r==c: # This is a diagonal, insert a label
            axes[r][c].text(0.5,0.5, labels[r],ha='center', va='center', transform=axes[r][c].transAxes,fontsize=36)
        else:
            axes[r][c].scatter(dataList[c], dataList[r], color=my_cmap(colours)) # ToDo: Add cmap , c=colours)
            slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(dataList[c], dataList[r])
            axes[r][c].text(0.9,0.9, "r_value = " + str(r_value)[:5],ha='right', va='center', transform=axes[r][c].transAxes)
            axes[r][c].text(0.9,0.8, "slope = " + str(slope)[:5],ha='right', va='center', transform=axes[r][c].transAxes)
            axes[r][c].set_xlim(axMin,axMax)
            axes[r][c].set_ylim(axMin,axMax)

In [None]:
# Matrix of control geometric means vs AOI surface area and nuclei count
# 4 x 4 grid: series names on the diagonal, pairwise scatter plots (annotated with the
# linear-regression r value and slope) everywhere else. All panels share axis limits.
# Uses my_cmap / colours created in the HK matrix cell.
rows = 4
cols = 4

fig, axes = plt.subplots(rows,cols, figsize=(15,15))

labels = ['HKGMean', 'IgGMean', 'Area', 'Nuclei']

# Put all four series in the same sample order. Area and nuclei were previously
# used in their own order, which only lines up with the geometric means if the two
# tables happen to list samples identically. Selecting the sample columns by name
# also avoids assuming 'mean' and 'probeClass' are the last two columns.
sampleCols = dataLog1External.drop(labels=['mean','probeClass'], axis=1).columns
dataList = [HKGeoMean.loc[sampleCols],
            IgGeoMean.loc[sampleCols],
            np.log2(surfArea.reindex(sampleCols).astype(float)),
            np.log2(nuclei.reindex(sampleCols).astype(float))]

# Common integer axis range covering every series
axMin = int(min([min(x) for x in dataList]))
axMax = int(max([max(x) for x in dataList]))+1


for r in range(rows):
    for c in range(cols):
        if r==c: # This is a diagonal, insert a label
            axes[r][c].text(0.5,0.5, labels[r],ha='center', va='center', transform=axes[r][c].transAxes,fontsize=36)
        else:
            axes[r][c].scatter(dataList[c], dataList[r], color=my_cmap(colours)) # ToDo: Add cmap , c=colours)
            slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(dataList[c], dataList[r])
            axes[r][c].text(0.9,0.9, "r_value = " + str(r_value)[:5],ha='right', va='center', transform=axes[r][c].transAxes)
            axes[r][c].text(0.9,0.8, "slope = " + str(slope)[:5],ha='right', va='center', transform=axes[r][c].transAxes)
            axes[r][c].set_xlim(axMin,axMax)
            axes[r][c].set_ylim(axMin,axMax)
                
            

# <span style="color:green"> Load-in default QC data from DSP analysis suite for further data QC </span>
<div class="alert alert-block alert-success">
load in QC data from DSP analysis suite.
</div>

In [None]:
# Load the QC dataset exported from the DSP analysis suite (path taken from the config file).
QCmasterData = master_data(os.path.join(configDict['rootDir'],configDict['QCDataPath']))
QCdataExternal, QCdataLog1External, QCsampleInfo = QCmasterData.get_data()

print(QCmasterData.probeClass)
print(QCmasterData.probeClassDict)

# dropSamples is a pandas Index only when a drop list was loaded from the config file;
# in that case the same AOIs are dropped from the QC dataset.
if (type(dropSamples) == pd.core.indexes.base.Index):
    QCdataLog1External, QCsampleInfo = QCmasterData.drop_AOIs(list(dropSamples), writeOrig=True)
    print(QCdataLog1External.shape)
    print(QCsampleInfo.shape)

# Replaces QCdataLog1External / QCsampleInfo with the versions returned by add_class_mean;
# the next cells sort on the 'probeClass' and 'mean' columns.
QCdataLog1External, QCsampleInfo = QCmasterData.add_class_mean(QCmasterData.dataLog1)


In [None]:
# View log transformed QC data

QCdataLog1External

## <span style="color:green"> Visualise QC OR ERCC data before normalisation </span>
<div class="alert alert-block alert-success">
Check for obvious outliers etc.
</div>

In [None]:
# Sort data by probeclass then mean for nicer visualisation on plots
QCdataSortedERCC = QCdataLog1External.sort_values(by = ['probeClass', 'mean'], ascending=[True,True])


In [None]:
for s in configDict['selectedData']:
    draw_probe_plot(erccDataSorted, sampleInfo, selectedInfo, s, 'ERCC Probe Values \"' + s + '\"')
    

In [None]:
# One probe plot per selected factor for the DSP QC dataset.
# NOTE(review): the title and the sampleInfo argument are the same as in the manual
# ERCC plots of the previous cell - confirm whether a distinct title and QCsampleInfo
# were intended here.
for s in configDict['selectedData']:
    draw_probe_plot(QCdataSortedERCC, sampleInfo, selectedInfo, s, 'ERCC Probe Values \"' + s + '\"')
    

In [None]:
# Order the probe names by the value that probeClassDict assigns to each probe's class.
# NOTE(review): this uses masterData rather than QCmasterData - confirm that is intended.
QCSortedIndex = sorted(masterData.probeClass.index, key=lambda x: masterData.probeClassDict[masterData.probeClass[x]])
# QCSortedIndex
# Probe classes in that order; named "Code.Class" so it becomes the column of that
# name when concatenated onto the export table.
QCSortedProbes = masterData.probeClass[QCSortedIndex]
QCSortedProbes.name = "Code.Class"
QCSortedProbes

# list(QCSortedProbes.index)


In [None]:
erccData

# <span style="color:green"> Export data for further analysis </span>
<div class="alert alert-block alert-success">
Text description here
</div>

# <span style="color:orange"> Export ERCC corrected data </span>

<div class="alert alert-block alert-warning">
Choose whether to output manually ERCC corrected data or ERCC corrected data from DSP QC output
</div>

In [None]:
# Choose which ERCC corrected dataset is exported:
#   'manual' - the ERCC correction calculated in this notebook (erccData)
#   'DSP_QC' - the ERCC corrected data from the DSP QC output (QCmasterData)
ERCCType = 'manual'
# ERCCType = 'DSP_QC'


if (ERCCType == 'manual'):
    # Export manually ERCC corrected data for further analysis
    print('manual ERCC data export')
    QCExport = erccData.sort_values(by = ['probeClass', 'mean'], ascending=[True,False])
    # Remember the sorted probe order; it is re-applied after the class column is added.
    probeOrder = QCExport.index
    QCExport = QCExport.drop(labels=['mean','probeClass'], axis=1)
elif (ERCCType == 'DSP_QC'):
    # Export Nanostring QC'd ERCC corrected data for further analysis
    print('DSP QC ERCC data export')
    QCExport = QCmasterData.dataOrig.copy()
    # HYB-NEG / HYB-POS rows are taken from the initial dataset and appended.
    posNeg = masterData.dataOrig.loc[['HYB-NEG','HYB-POS'],]
    QCExport = pd.concat([QCExport,posNeg])
    QCExport = QCExport.loc[QCSortedIndex,]
    QCExport = pd.concat([QCSortedProbes, QCExport], axis = 1)
    QCExport.index.name = 'Name'
else:
    # Stop here: printing a message and carrying on would export stale or undefined data.
    raise ValueError(f"ERCC correction type not valid: {ERCCType!r} (expected 'manual' or 'DSP_QC')")


In [None]:
# Finalise the export table: reverse the log2 transformation, add the probe class
# column and restore the sorted probe order.
#
# This only applies while the table has no class column yet. The DSP QC export has
# already been given that column, and so has a table this cell has already processed;
# raising 2 to the power of that text column would fail, so those cases are left as-is.
# NOTE(review): other cells back-transform with exp2(x) - 1; if erccData is on the same
# log2(x + 1) scale, pow(2, x) leaves a +1 offset in the export - confirm.
if QCSortedProbes.name not in QCExport.columns:
    # Reverse log2 transformation before normalisation
    QCExport = pow(2, QCExport)

    QCExport = pd.concat([QCSortedProbes, QCExport], axis = 1)
    QCExport = QCExport.loc[list(probeOrder)]
    QCExport.index.name = 'Name'
QCExport


In [None]:
# check if normalisation directory exists and create it if not
# makedirs with exist_ok=True also creates missing parent folders and has no gap
# between the existence check and the creation.

normDir = os.path.join(configDict['rootDir'], 'Normalisation')
os.makedirs(normDir, exist_ok=True)

print(normDir)


In [None]:
# Write the export table to the Normalisation folder with a _preNorm suffix

project = configDict['projectName']
qcCSV = 'QC_' + project + '_preNorm.csv'

# Honour the notebook-level writeOutput switch. It was previously forced to True
# here, which overwrote existing files even when output had been turned off.
if writeOutput:
    QCExport.to_csv(os.path.join(normDir, qcCSV))
else:
    print(f'writeOutput is False - {qcCSV} was not written.')
# writeOutput= False


# <span style="color:green"> Run NanoString Normalisation script in R </span>
<div class="alert alert-block alert-success">
Text description here
</div>

In [None]:
# Run the NanoString normalisation R script on the exported pre-normalisation CSV.
# The command is passed as an argument list (no shell), so folder or project names
# containing spaces or shell characters are handed to Rscript unchanged.
cmd = ['Rscript', '../DSP_EDA_Protein/NSNorm.R', '-d', normDir, '-f', qcCSV]
print(' '.join(cmd))

# Capture the script's output (stderr merged into stdout) so it is shown in the notebook.
rResult = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
print(rResult.stdout)
if rResult.returncode != 0:
    print(f'NSNorm.R failed with exit status {rResult.returncode}')

# Exit status of the R script (0 means success)
rResult.returncode


### ToDo: Collate figures into a pdf