In [2]:
# Import packages
import re
import os
import copy
from pathlib import Path
import numpy as np
from numpy.random import MT19937
from numpy.random import RandomState, SeedSequence
import pandas as pd
import random
import datetime
import time
from pdb import set_trace
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.mixture import BayesianGaussianMixture
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt
import gc

In [3]:
# File paths
# Directory holding the input label files (raw_data_path) and the directory the
# merged label files are written to (output_path).
# Each one can be overridden through an environment variable so the notebook is
# not tied to a single machine; when the variable is unset or empty the original
# location is used.
# os.path.join(..., '') guarantees a trailing path separator, which matters
# because other cells build file names by plain string concatenation
# (e.g. raw_data_path + LFP).
raw_data_path = os.path.join(
    os.environ.get('LABELS_RAW_DATA_PATH') or '/home/ec2-user/Feb2025/labels/', '')
output_path = os.path.join(
    os.environ.get('LABELS_OUTPUT_PATH') or '/home/ec2-user/Feb2025/labels/', '')

#### Marker proteins

In [4]:
# Read the marker-protein localization table (tab-separated text file)
LFP = 'markers.txt'
LD_marker = pd.read_csv(raw_data_path + LFP, sep='\t')

# Promote the 'Protein' column to the row index (it is dropped from the columns)
LD_marker = LD_marker.set_index('Protein')

# Keep only the rows whose 'Localization' is not 'Unclassified'
NotUnclassInd = LD_marker['Localization'] != 'Unclassified'
LD_marker = LD_marker.loc[NotUnclassInd]
print(LD_marker.shape)

(3365, 1)


#### MCF7 breast carcinoma

In [4]:
# Read the MCF7 localization table (tab-separated text file)
LFP = 'SubCellBarcode.MCF7.txt'
LD_mcf7 = pd.read_csv(raw_data_path + LFP, sep='\t')

# Promote the 'Protein' column to the row index (it is dropped from the columns)
LD_mcf7 = LD_mcf7.set_index('Protein')

# Keep only the rows whose 'Localization' is not 'Unclassified'
NotUnclassInd = LD_mcf7['Localization'] != 'Unclassified'
LD_mcf7 = LD_mcf7.loc[NotUnclassInd]
print("LD MCF7 total labels")
print(LD_mcf7.shape)

# Drop every protein that is already indexed in LD_marker, so the two tables
# share no index entries
LD_mcf7 = LD_mcf7.loc[~LD_mcf7.index.isin(LD_marker.index)]
print("LD MCF7 labels after removing existing labels in markers.txt")
print(LD_mcf7.shape)

LD MCF7 total labels
(9445, 1)
LD MCF7 labels after removing existing labels in markers.txt
(6130, 1)


#### H322 lung carcinoma

In [5]:
# Read the H322 localization table (tab-separated text file)
LFP = 'SubCellBarcode.H322.txt'
LD_h322 = pd.read_csv(raw_data_path + LFP, sep='\t')

# Promote the 'Protein' column to the row index (it is dropped from the columns)
LD_h322 = LD_h322.set_index('Protein')

# Keep only the rows whose 'Localization' is not 'Unclassified'
NotUnclassInd = LD_h322['Localization'] != 'Unclassified'
LD_h322 = LD_h322.loc[NotUnclassInd]
print("LD H322 total labels")
print(LD_h322.shape)

# Drop every protein that is already indexed in LD_marker, so the two tables
# share no index entries
LD_h322 = LD_h322.loc[~LD_h322.index.isin(LD_marker.index)]
print("LD H322 labels after removing existing labels in markers.txt")
print(LD_h322.shape)

LD H322 total labels
(9569, 1)
LD H322 labels after removing existing labels in markers.txt
(6241, 1)


#### U251 brain glioma

In [6]:
# Read the U251 localization table (tab-separated text file)
LFP = 'SubCellBarcode.U251.txt'
LD_u251 = pd.read_csv(raw_data_path + LFP, sep='\t')

# Promote the 'Protein' column to the row index (it is dropped from the columns)
LD_u251 = LD_u251.set_index('Protein')

# Keep only the rows whose 'Localization' is not 'Unclassified'
NotUnclassInd = LD_u251['Localization'] != 'Unclassified'
LD_u251 = LD_u251.loc[NotUnclassInd]
print("LD U251 total labels")
print(LD_u251.shape)

# Drop every protein that is already indexed in LD_marker, so the two tables
# share no index entries
LD_u251 = LD_u251.loc[~LD_u251.index.isin(LD_marker.index)]
print("LD U251 labels after removing existing labels in markers.txt")
print(LD_u251.shape)

LD U251 total labels
(9087, 1)
LD U251 labels after removing existing labels in markers.txt
(5774, 1)


#### Merge for MCF7

In [None]:
# Stack the marker labels on top of the MCF7 labels (row-wise concatenation)
# and keep the combined table in LD_mcf7_all; nothing is written to disk here.
LD_mcf7_all = pd.concat(objs=[LD_marker, LD_mcf7], axis='index')
print(LD_mcf7_all.shape)

(9495, 1)


In [9]:
# Normalise the label spelling in LD_mcf7_all: every 'Nuclear' entry in the
# 'Localization' column becomes 'Nucleus' (the frame is modified in place).
LD_mcf7_all.loc[LD_mcf7_all['Localization'].eq('Nuclear'), 'Localization'] = 'Nucleus'

# Number of proteins per localization class after the relabelling
LD_mcf7_all_count = LD_mcf7_all['Localization'].value_counts()
print(LD_mcf7_all_count)

Localization
Cytosol         3692
Nucleus         2671
Secretory       2487
Mitochondria     645
Name: count, dtype: int64


In [16]:
# Write the merged MCF7 label table to output_path as a tab-separated file
LD_mcf7_all.to_csv(path_or_buf=output_path + 'markers_mcf7_all.txt', sep='\t')

#### Merge for H322

In [None]:
# Stack the marker labels on top of the H322 labels (row-wise concatenation)
# and keep the combined table in LD_h322_all; nothing is written to disk here.
LD_h322_all = pd.concat(objs=[LD_marker, LD_h322], axis='index')
print(LD_h322_all.shape)

(9606, 1)


In [14]:
# Normalise the label spelling in LD_h322_all: every 'Nuclear' entry in the
# 'Localization' column becomes 'Nucleus' (the frame is modified in place).
LD_h322_all.loc[LD_h322_all['Localization'].eq('Nuclear'), 'Localization'] = 'Nucleus'

# Number of proteins per localization class after the relabelling
LD_h322_all_count = LD_h322_all['Localization'].value_counts()
print(LD_h322_all_count)

Localization
Cytosol         3925
Nucleus         2524
Secretory       2488
Mitochondria     669
Name: count, dtype: int64


In [17]:
# Write the merged H322 label table to output_path as a tab-separated file
LD_h322_all.to_csv(path_or_buf=output_path + 'markers_h322_all.txt', sep='\t')

#### Merge for U251

In [7]:
# Stack the marker labels on top of the U251 labels (row-wise concatenation)
# and keep the combined table in LD_u251_all; nothing is written to disk here.
LD_u251_all = pd.concat(objs=[LD_marker, LD_u251], axis='index')
print(LD_u251_all.shape)

(9139, 1)


In [8]:
# Normalise the label spelling in LD_u251_all: every 'Nuclear' entry in the
# 'Localization' column becomes 'Nucleus' (the frame is modified in place).
LD_u251_all.loc[LD_u251_all['Localization'].eq('Nuclear'), 'Localization'] = 'Nucleus'

# Number of proteins per localization class after the relabelling
LD_u251_all_count = LD_u251_all['Localization'].value_counts()
print(LD_u251_all_count)

Localization
Cytosol         3511
Nucleus         2701
Secretory       2343
Mitochondria     584
Name: count, dtype: int64


In [9]:
# Write the merged U251 label table to output_path as a tab-separated file
LD_u251_all.to_csv(path_or_buf=output_path + 'markers_u251_all.txt', sep='\t')