# Spatial Preprocessing

In [1]:
import pandas as pd
import numpy as np
import xlsxwriter

dir = 'C:/Users/jesse/OneDrive/Documents/Multiplex Lab/Data/Pseudo-cells_preprocess.xlsx'

df_old = pd.read_excel(dir)

In [2]:
nrow = len(df_old)
ncol = len(df_old.columns)-1
df = df_old.iloc[:, 1:(ncol+1)]

In [3]:
# removing background
for x in range(ncol):
    sort = sorted(df.iloc[:, x])
    bottom_1percent = round(len(df)*.01)
    min_bg = np.mean(sort[0:bottom_1percent])
    std = np.std(df.iloc[:, x])
    threshold = min_bg + 3*std
    for y in range(nrow):
        old_num = df.iloc[y, x]
        if old_num < threshold:
            df.iloc[y, x] = 0
        else:
            pass
df_bg = df

In [4]:
# dropping cells that did not pass threshold
drop = []
for x in range(nrow):
    sort_row = sorted(df_bg.iloc[x, :])
    if sort_row[23] == 0:
        drop.append(x)
df_drop = df_bg.drop(drop)
print(len(drop))

43584


In [5]:
# log normalization
from sklearn.preprocessing import normalize

colnames = df_drop.columns

df_cellnorm = pd.DataFrame(normalize(df_drop, axis=1), columns=colnames)
df_celllognorm = np.log2(df_cellnorm+1)

df_celllognorm

Unnamed: 0,Protein 1,Protein 2,Protein 3,Protein 4,Protein 5,Protein 6,Protein 7,Protein 8,Protein 9,Protein 10,...,Protein 22,Protein 23,Protein 24,Protein 25,Protein 26,Protein 27,Protein 28,Protein 29,Protein 30,Protein 31
0,0.520282,0.000000,0.000000,0.000000,0.379572,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.481658,0.0,0.000000,0.406144,0.000000,0.000000,0.403216,0.000000,0.533466
1,0.590961,0.390072,0.000000,0.000000,0.532074,0.000000,0.000000,0.000000,0.000000,0.514206,...,0.000000,0.000000,0.0,0.000000,0.450224,0.445518,0.000000,0.000000,0.000000,0.000000
2,0.520245,0.519057,0.000000,0.000000,0.475617,0.000000,0.000000,0.000000,0.000000,0.398242,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.542928,0.487512
3,0.584419,0.000000,0.000000,0.391454,0.473329,0.000000,0.546782,0.000000,0.000000,0.000000,...,0.535578,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.534775,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.369309,0.000000,0.000000,0.477346,0.448622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4811,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.309439,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.397389,0.376355,0.289251,0.369275,0.000000
4812,0.610406,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.347289,0.000000,...,0.486374,0.000000,0.0,0.000000,0.000000,0.000000,0.401505,0.000000,0.000000,0.508725
4813,0.580654,0.000000,0.281124,0.000000,0.000000,0.320831,0.000000,0.000000,0.000000,0.000000,...,0.445974,0.520070,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.422252,0.000000
4814,0.000000,0.499692,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.453639,...,0.000000,0.000000,0.0,0.539885,0.000000,0.000000,0.000000,0.000000,0.000000,0.548403


In [6]:
# write data to xlsx and add a default header
target = 'C:/Users/jesse/OneDrive/Documents/Multiplex Lab/Data/Intensities_minBG.xlsx'
col_names = df_celllognorm.columns

writer = pd.ExcelWriter(target, engine='xlsxwriter')
df_celllognorm.to_excel(writer, startrow=1, index=False, header=False)

workbook  = writer.book
worksheet = writer.sheets['Sheet1']

for idx, val in enumerate(col_names):
    worksheet.write(0, idx, val)

writer.save()

# Organizing Cluster Data

In [7]:
import pandas as pd
import numpy as np
import xlsxwriter

coords_dir = 'C:/Users/jesse/OneDrive/Documents/Multiplex Lab/Data/cellcoordinates.xlsx'
coords_data = pd.read_excel(coords_dir)

seurat_dir = 'C:/Users/jesse/OneDrive/Documents/Multiplex Lab/Data/UMAP_clusters.xlsx'
seurat_data = pd.read_excel(seurat_dir)

In [8]:
coords_nrow = len(coords_data)

data_dict = {'cell' : list(range(1,coords_nrow+1)),
            'x position' : coords_data['Xposition'],
            'y position' : coords_data['Yposition'],
            'cluster_num' : [None] * coords_nrow,
            'Cluster 0' : [0] * coords_nrow,
            'Cluster 1' : [0] * coords_nrow,
            'Cluster 2' : [0] * coords_nrow,
            'Cluster 3' : [0] * coords_nrow,
            'Cluster 4' : [0] * coords_nrow,
            'Cluster 5' : [0] * coords_nrow,
            'Cluster 6' : [0] * coords_nrow,
            'Cluster 7' : [0] * coords_nrow,
            'Cluster 8' : [0] * coords_nrow}

cell_clusters = pd.DataFrame(data_dict)

cell_num = np.transpose(df_drop).columns.tolist()
clusters = seurat_data['seurat_clusters']

print(cell_clusters.iloc[1745,:])
cell_clusters

cell           1746
x position     5967
y position      411
cluster_num    None
Cluster 0         0
Cluster 1         0
Cluster 2         0
Cluster 3         0
Cluster 4         0
Cluster 5         0
Cluster 6         0
Cluster 7         0
Cluster 8         0
Name: 1745, dtype: object


Unnamed: 0,cell,x position,y position,cluster_num,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4,Cluster 5,Cluster 6,Cluster 7,Cluster 8
0,1,101,101,,0,0,0,0,0,0,0,0,0
1,2,101,163,,0,0,0,0,0,0,0,0,0
2,3,101,225,,0,0,0,0,0,0,0,0,0
3,4,101,287,,0,0,0,0,0,0,0,0,0
4,5,101,349,,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48395,48396,14859,14611,,0,0,0,0,0,0,0,0,0
48396,48397,14859,14673,,0,0,0,0,0,0,0,0,0
48397,48398,14859,14735,,0,0,0,0,0,0,0,0,0
48398,48399,14859,14797,,0,0,0,0,0,0,0,0,0


In [9]:
y = 0  # index for cell_num
for x in range(coords_nrow):
    if cell_clusters['cell'][x] in cell_num:
        cell_clusters['cluster_num'][x] = clusters[y]
        col_num = clusters[y] + 4
        cell_clusters.iloc[x, col_num] = 1
        y += 1

print(cell_clusters.iloc[1745,:])
cell_clusters

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


cell           1746
x position     5967
y position      411
cluster_num       4
Cluster 0         0
Cluster 1         0
Cluster 2         0
Cluster 3         0
Cluster 4         1
Cluster 5         0
Cluster 6         0
Cluster 7         0
Cluster 8         0
Name: 1745, dtype: object


Unnamed: 0,cell,x position,y position,cluster_num,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4,Cluster 5,Cluster 6,Cluster 7,Cluster 8
0,1,101,101,,0,0,0,0,0,0,0,0,0
1,2,101,163,,0,0,0,0,0,0,0,0,0
2,3,101,225,,0,0,0,0,0,0,0,0,0
3,4,101,287,,0,0,0,0,0,0,0,0,0
4,5,101,349,,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48395,48396,14859,14611,,0,0,0,0,0,0,0,0,0
48396,48397,14859,14673,,0,0,0,0,0,0,0,0,0
48397,48398,14859,14735,,0,0,0,0,0,0,0,0,0
48398,48399,14859,14797,,0,0,0,0,0,0,0,0,0


In [10]:
# write data to xlsx and add a default header
target = 'C:/Users/jesse/OneDrive/Documents/Multiplex Lab/Data/cell_spatial.xlsx'
col_names = cell_clusters.columns

writer = pd.ExcelWriter(target, engine='xlsxwriter')
cell_clusters.to_excel(writer, startrow=1, index=False, header=False)

workbook  = writer.book
worksheet = writer.sheets['Sheet1']

for idx, val in enumerate(col_names):
    worksheet.write(0, idx, val)

writer.save()