In [1]:
import numpy as np
from PIL import Image
import plotly.express as px
import plotly.colors as pc
from pdb import set_trace
import re
import pandas as pd
import os
import plotly.express as px

In [2]:
# Open the proteomics data and only keep rows where gene has value for all samples
PFP = 'kr_pro_z.csv' # proteomics file path
# Index the rows by the 'Gene' column; it is a label, not a measurement.
PD = pd.read_csv(PFP).set_index('Gene')

# Flag rows that contain at least one missing value in one vectorised pass.
# A per-gene `.loc` lookup is slow and fails when a gene label is duplicated,
# because `.loc` then returns a DataFrame instead of a single row.
DropRowIndices = PD.isna().any(axis=1).tolist()
KeepRowIndices = [not x for x in DropRowIndices]
PD = PD.loc[KeepRowIndices,:]

In [3]:
# Open the mRNA data and only keep rows where gene has value for all samples
MFP = 'kr_rna_z.csv' # mRNA file path
# Index the rows by the 'Gene' column; it is a label, not a measurement.
MD = pd.read_csv(MFP).set_index('Gene')

# Flag rows that contain at least one missing value in one vectorised pass.
# A per-gene `.loc` lookup is slow and fails when a gene label is duplicated,
# because `.loc` then returns a DataFrame instead of a single row.
DropRowIndices = MD.isna().any(axis=1).tolist()
KeepRowIndices = [not x for x in DropRowIndices]
MD = MD.loc[KeepRowIndices,:]

In [4]:
# Read the tab-separated localization table and label its rows by 'Protein'
LFP = 'SubCellBarcode.MCF7.0622.txt'
LD = pd.read_csv(LFP, sep='\t').set_index('Protein')

In [5]:
# Drop every row whose 'Localization' label is 'Unclassified'
NotUnclassInd = LD['Localization'].ne('Unclassified')
LD = LD[NotUnclassInd]

In [6]:
# Sanity check: show the distinct 'Localization' labels that are left
np.unique(LD['Localization'].to_numpy())

array(['Cytosol', 'Mitochondria', 'Nuclear', 'Secretory'], dtype=object)

In [7]:
# Restrict all three tables to the genes they have in common.
# The shared genes are listed in the row order of the proteomics table.
IntersectingGenes = [gene for gene in PD.index if gene in MD.index and gene in LD.index]
PD, MD, LD = (frame.loc[IntersectingGenes,:] for frame in (PD, MD, LD))
# One row count per line: proteomics, mRNA, localization
print(len(PD.index), len(MD.index), len(LD.index), sep='\n')

6173
6173
6173


In [8]:
# Create a square, 2D array, e.g. shape=(18,18), from a 1D array, e.g. shape=(324,)
def SquareArray(array):
    """Lay a flat sequence of numbers out as a square 2D float array, row by row.

    Parameters
    ----------
    array : 1D array-like of numbers
        Values to arrange. The side length is the integer part of the square
        root of ``len(array)``; when the length is not a perfect square, only
        the first ``side**2`` values are used and the rest are ignored.

    Returns
    -------
    numpy.ndarray
        A new float array of shape ``(side, side)`` in which input element
        ``i`` sits at ``[i // side, i % side]``.
    """
    dimension = int(len(array)**0.5)
    # Keep only as many values as fit in the square.
    flat = np.asarray(array, dtype=float)[:dimension**2]
    # reshape fills in row-major order, which is the row/column arithmetic
    # described above; copy() so the result never aliases the caller's data.
    return flat.reshape(dimension, dimension).copy()

In [9]:
# Convert a square 2D array of values into an RGB array.
#     Each entry is replaced by the color GetColor returns for it; the three
#     channel values run along the 3rd axis, e.g. (18,18) -> (18,18,3)
def CreateImage(array):
    """Build the RGB array for a square 2D array of values.

    Parameters
    ----------
    array : 2D numpy.ndarray
        Values to color. The side length is taken from the length of the
        first row and used for both axes, so the array is expected to be square.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(side, side, 3)`` where ``[r, c, :]`` holds the
        integer-cast result of ``GetColor(array[r, c])``.
    """
    side = len(array[0, :])
    pixels = np.empty((side, side, 3))
    # np.ndindex walks the grid in row-major order (rows outer, columns inner).
    for row, col in np.ndindex(side, side):
        pixels[row, col, :] = np.asarray(GetColor(array[row, col]), dtype=int)
    return pixels

In [10]:
# Create Gene Images
def GeneImages(PD, MD, gene, n_sample_spaces=162, out_dir='./Krug_18by18Grid_20230903/'):
    """Render one gene's protein and mRNA values as a color-grid PNG.

    The values are interleaved sample by sample,

        sample1  sample1  sample2  sample2  ...
          prot     mRNA     prot     mRNA

    arranged row by row into a square grid with SquareArray, colored with
    CreateImage and written to ``<out_dir><gene>.png``.

    Parameters
    ----------
    PD : pandas.DataFrame
        Protein values; rows labelled by gene, one column per sample.
    MD : pandas.DataFrame
        mRNA values with the same layout. Columns are paired with those of
        ``PD`` by position; this is assumed, not checked. ``MD`` must have at
        least as many columns as are drawn from ``PD``.
    gene : str
        Row label to look up in both tables; also used as the file name.
    n_sample_spaces : int, optional
        Number of sample slots in the image. The default of 162 fills an
        18x18 grid (two cells per sample). ``2 * n_sample_spaces`` should be a
        perfect square, otherwise SquareArray ignores the trailing cells.
        Samples beyond this count are not drawn; unused slots hold NaN.
    out_dir : str, optional
        Directory the PNG is written to. It is created if it does not exist.

    Returns
    -------
    None
    """
    # Pull each row out once as a plain array so that samples are addressed by
    # position. Integer `[]` on a label-indexed Series is deprecated in pandas.
    protein = PD.loc[gene,:].to_numpy(dtype=float)
    mrna = MD.loc[gene,:].to_numpy(dtype=float)

    # Number of samples that actually get drawn
    n_used = min(len(PD.columns), n_sample_spaces)

    # Start from an all-NaN strip so that unused sample slots stay blank, then
    # interleave: even cells take protein values, odd cells take mRNA values.
    GeneArray = np.full(2*n_sample_spaces, np.nan)
    GeneArray[0:2*n_used:2] = protein[:n_used]
    GeneArray[1:2*n_used:2] = mrna[:n_used]

    Square_GeneArray = SquareArray(GeneArray)
    # Get the RGB array
    RGB = CreateImage(Square_GeneArray)
    RGB = np.asarray(RGB,dtype=np.uint8)

    # Make an image from the RGB array
    image = Image.fromarray(RGB)

    # Save the image, creating the output directory on first use
    os.makedirs(out_dir, exist_ok=True)
    image.save(os.path.join(out_dir, gene+'.png'))
    

In [11]:
# Function to select a color from a pre-defined continuous palette.
def GetColor(z, max_magnitude=3):
    """Map a value to an RGB triple sampled from plotly's 'rdbu' colorscale.

    Parameters
    ----------
    z : float
        Value to color. NaN means "no data".
    max_magnitude : float, optional
        Values are clamped to ``[-max_magnitude, +max_magnitude]`` before
        being colored. Must be positive. Defaults to 3.

    Returns
    -------
    numpy.ndarray
        Integer array ``[r, g, b]``. NaN gives gray ``[128, 128, 128]``.
        ``+max_magnitude`` samples the colorscale at 0.0 and
        ``-max_magnitude`` samples it at 1.0.

    Raises
    ------
    ValueError
        If ``max_magnitude`` is not positive.

    Notes
    -----
    Every non-NaN value is clamped and colored, including very large ones and
    infinities. An earlier version skipped values >= 100 and then failed on
    an unassigned variable.
    NOTE(review): if 100 was meant as a "missing" sentinel, map it to NaN
    before calling this function - confirm with the data owner.
    """
    if not max_magnitude > 0:
        raise ValueError("max_magnitude must be positive")

    # If there is no data, make the color gray
    if np.isnan(z):
        return np.array([128, 128, 128], dtype=int)

    # Clamp outliers so that they take the color at the end of the scale
    z = min(max(z, -max_magnitude), max_magnitude)
    # Min-max scale to [0, 1], then flip the direction of the scale
    coordinate = 1 - (z + max_magnitude) / (2 * max_magnitude)

    color = pc.sample_colorscale('rdbu', samplepoints=float(coordinate), low=0.0, high=1.0, colortype='rgb')

    # The result is a list holding one 'rgb(r, g, b)' string; keep the numbers
    label = color[0]
    channels = label[label.find('(')+1:label.find(')')].split(",")
    return np.array([int(channel.strip()) for channel in channels], dtype=int)

In [12]:
# Write one image per gene, printing the running count before every 100th gene
# as a progress indicator.
i = 0
for gene in PD.index:
    GeneImages(PD, MD, gene)
    if not i % 100:
        print(i)
    i += 1

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
