In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import mudata as md
import seaborn as sns
import matplotlib.pyplot as plt

import gc
import sys
sys.path.append('../')

from tqdm import tqdm
from latentcor import latentcor
from scipy.sparse import csr_matrix
from utils.gglasso_pipeline import gg_lasso_network_analysis
from utils.utils import calc_sparsity, fetch_protein_names

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [2]:
path_raw_data = "../data/raw/"
experiments = ["MGH66_Abx3_P1_align2._CDS.tsv",
               "MGH66_Abx3_P2_align2._CDS.tsv",
               "MGH66_Abx3_P3_align2._CDS.tsv",
               "MGH66_Abx3_P4_align2._CDS.tsv"]

# P1

In [3]:
adata = sc.read_csv(filename=path_raw_data + experiments[0], delimiter='\t')

In [10]:
obs_tmp = adata.var.copy()
var_tmp = adata.obs.copy()
X_tmp = csr_matrix(adata.X.T)

adata = sc.AnnData(X=X_tmp, obs=obs_tmp, var=var_tmp)
adata

In [19]:
print("Sparity:\t\t", calc_sparsity(adata.X.A))
print("Mean counts per gene:\t", np.mean(np.sum(adata.X.A, axis=1)))
print("Median counts per gene:\t", np.median(np.sum(adata.X.A, axis=1)))
print("Mean genes per cell: \t", np.mean(np.sum(adata.X.A, axis=0)))
print("Median genes per cell:\t", np.median(np.sum(adata.X.A, axis=0)))

Sparity:		 0.9980492107440664
Mean counts per gene:	 8.321548
Median counts per gene:	 6.0
Mean genes per cell: 	 839.40607
Median genes per cell:	 97.5


In [None]:
adata.var["gene"] = [gene[4:] for gene in adata.var.index]
adata.var

In [158]:
#accession_numbers = [gene[4:] for gene in adata.var.index[-10:]]
#accession_numbers = [gene[4:] for gene in adata.var[~adata.var.index.str.startswith("cds-WP")].index.values]
accession_numbers = [gene[4:] for gene in adata.var[adata.var.index.str.startswith("cds-WP")].index.values]
#accession_numbers = adata.var.index.values

In [159]:
len(accession_numbers)

3881

In [None]:
batch_size = 1
gene_to_protein = dict()

for i in tqdm(range(0, len(accession_numbers), batch_size)):
    try:
        batch = accession_numbers[i:i + batch_size]
        protein_names = fetch_protein_names(batch)
        gene_to_protein.update(dict(zip(batch, protein_names)))
        print(len(gene_to_protein))
    except Exception as e:
        print(f"Error fetching batch {batch}: {e}")
        continue

In [129]:
adata.var['long_protein_name'] = adata.var['gene'].map(gene_to_protein)
adata.var['long_protein_name'].fillna("None", inplace=True)
adata.var

Unnamed: 0,gene,long_protein_name
cds-AF52_RS00180,AF52_RS00180,
cds-AF52_RS00275,AF52_RS00275,
cds-AF52_RS00620,AF52_RS00620,
cds-AF52_RS01000,AF52_RS01000,
cds-AF52_RS01015,AF52_RS01015,
...,...,...
cds-WP_187079192.1,WP_187079192.1,WP_187079192.1 IS5-like element IS903B family ...
cds-WP_187079193.1,WP_187079193.1,WP_187079193.1 aspartate aminotransferase fami...
cds-WP_187079194.1,WP_187079194.1,WP_187079194.1 ribokinase [Klebsiella pneumoniae]
cds-WP_187079196.1,WP_187079196.1,WP_187079196.1 6-carboxytetrahydropterin synth...


In [149]:
adata.var['species'] = adata.var['long_protein_name'].str.extract(r'\[([^]]+)\]$')
adata.var.species.value_counts()

Klebsiella                     2399
Klebsiella pneumoniae           769
Enterobacteriaceae              324
Gammaproteobacteria             126
Enterobacterales                 70
Bacteria                         54
Klebsiella/Raoultella group      22
Pseudomonadota                    7
Klebsiella quasipneumoniae        2
Klebsiella grimontii              1
Name: species, dtype: int64

In [150]:
adata.var['protein_name'] = adata.var['long_protein_name'].str.split(' ', 1).str[1].str.rsplit('[', 1).str[0].str.strip()
adata.var

Unnamed: 0,gene,long_protein_name,species,protein_name
cds-AF52_RS00180,AF52_RS00180,,,
cds-AF52_RS00275,AF52_RS00275,,,
cds-AF52_RS00620,AF52_RS00620,,,
cds-AF52_RS01000,AF52_RS01000,,,
cds-AF52_RS01015,AF52_RS01015,,,
...,...,...,...,...
cds-WP_187079192.1,WP_187079192.1,WP_187079192.1 IS5-like element IS903B family ...,Klebsiella pneumoniae,IS5-like element IS903B family transposase
cds-WP_187079193.1,WP_187079193.1,WP_187079193.1 aspartate aminotransferase fami...,Klebsiella pneumoniae,aspartate aminotransferase family protein
cds-WP_187079194.1,WP_187079194.1,WP_187079194.1 ribokinase [Klebsiella pneumoniae],Klebsiella pneumoniae,ribokinase
cds-WP_187079196.1,WP_187079196.1,WP_187079196.1 6-carboxytetrahydropterin synth...,Klebsiella pneumoniae,6-carboxytetrahydropterin synthase QueD


In [152]:
adata.var['rRNA'] = adata.var['long_protein_name'].str.contains(r'30S|50S')
adata.var["rRNA"].value_counts()

False    4005
True       61
Name: rRNA, dtype: int64

In [168]:
adata

AnnData object with n_obs × n_vars = 410143 × 4066
    var: 'gene', 'long_protein_name', 'species', 'protein_name', 'rRNA'

In [167]:
sc.write(adata=adata, filename="../data/preprocessed/MGH66_P1_untreated.h5ad")

# P2

In [3]:
adata = sc.read_csv(filename=path_raw_data + experiments[1], delimiter='\t')

In [4]:
adata

AnnData object with n_obs × n_vars = 3987 × 392941

In [5]:
obs_tmp = adata.var.copy()
var_tmp = adata.obs.copy()
X_tmp = csr_matrix(adata.X.T)

adata = sc.AnnData(X=X_tmp, obs=obs_tmp, var=var_tmp)
adata

AnnData object with n_obs × n_vars = 392941 × 3987

In [6]:
adata.var

cds-AF52_RS00165
cds-AF52_RS00180
cds-AF52_RS00620
cds-AF52_RS01000
cds-AF52_RS01010
...
cds-WP_187079193.1
cds-WP_187079194.1
cds-WP_187079195.1
cds-WP_187079196.1
cds-WP_187079197.1


In [7]:
adata.var["gene"] = [gene[4:] for gene in adata.var.index]
adata.var

Unnamed: 0,gene
cds-AF52_RS00165,AF52_RS00165
cds-AF52_RS00180,AF52_RS00180
cds-AF52_RS00620,AF52_RS00620
cds-AF52_RS01000,AF52_RS01000
cds-AF52_RS01010,AF52_RS01010
...,...
cds-WP_187079193.1,WP_187079193.1
cds-WP_187079194.1,WP_187079194.1
cds-WP_187079195.1,WP_187079195.1
cds-WP_187079196.1,WP_187079196.1


In [8]:
accession_numbers = [gene[4:] for gene in adata.var[adata.var.index.str.startswith("cds-WP")].index.values]
len(accession_numbers)

3800

In [9]:
batch_size = 200
gene_to_protein = dict()

for i in tqdm(range(0, len(accession_numbers), batch_size)):
    try:
        batch = accession_numbers[i:i + batch_size]
        protein_names = fetch_protein_names(batch)
        gene_to_protein.update(dict(zip(batch, protein_names)))
        print(len(gene_to_protein))
    except Exception as e:
        print(f"Error fetching batch {batch}: {e}")
        continue

100%|██████████| 1247/1247 [00:00<00:00, 3135669.72it/s]
  5%|▌         | 1/19 [00:03<01:05,  3.64s/it]

200


100%|██████████| 1304/1304 [00:00<00:00, 3308755.24it/s]
 11%|█         | 2/19 [00:08<01:15,  4.43s/it]

400


100%|██████████| 1237/1237 [00:00<00:00, 3210615.13it/s]
 16%|█▌        | 3/19 [00:14<01:17,  4.86s/it]

600


100%|██████████| 1233/1233 [00:00<00:00, 2487530.94it/s]
 21%|██        | 4/19 [00:17<01:06,  4.46s/it]

800


100%|██████████| 1310/1310 [00:00<00:00, 3093771.53it/s]
 26%|██▋       | 5/19 [00:20<00:52,  3.73s/it]

1000


100%|██████████| 1251/1251 [00:00<00:00, 2241381.59it/s]
 32%|███▏      | 6/19 [00:23<00:46,  3.56s/it]

1200


100%|██████████| 1261/1261 [00:00<00:00, 3260799.84it/s]
 37%|███▋      | 7/19 [00:26<00:40,  3.34s/it]

1400


100%|██████████| 1269/1269 [00:00<00:00, 2274603.32it/s]
 42%|████▏     | 8/19 [00:29<00:35,  3.18s/it]

1599


100%|██████████| 1563/1563 [00:00<00:00, 3138198.73it/s]
 47%|████▋     | 9/19 [00:31<00:30,  3.03s/it]

1799


100%|██████████| 1507/1507 [00:00<00:00, 2860097.80it/s]
 53%|█████▎    | 10/19 [00:34<00:27,  3.01s/it]

1999


100%|██████████| 1436/1436 [00:00<00:00, 3001006.75it/s]
 58%|█████▊    | 11/19 [00:37<00:24,  3.03s/it]

2198


100%|██████████| 1447/1447 [00:00<00:00, 2321789.55it/s]
 63%|██████▎   | 12/19 [00:40<00:21,  3.02s/it]

2398


100%|██████████| 1405/1405 [00:00<00:00, 2965776.10it/s]
 68%|██████▊   | 13/19 [00:43<00:17,  2.96s/it]

2598


100%|██████████| 324/324 [00:00<00:00, 2564065.09it/s]
 74%|███████▎  | 14/19 [00:45<00:12,  2.45s/it]

2639


100%|██████████| 1507/1507 [00:00<00:00, 3198793.59it/s]
 79%|███████▉  | 15/19 [00:48<00:10,  2.62s/it]

2834


100%|██████████| 1620/1620 [00:00<00:00, 3142817.98it/s]
 84%|████████▍ | 16/19 [00:50<00:07,  2.49s/it]

3034


100%|██████████| 1545/1545 [00:00<00:00, 2011859.57it/s]
 89%|████████▉ | 17/19 [00:54<00:05,  2.92s/it]

3233


100%|██████████| 1651/1651 [00:00<00:00, 3018655.58it/s]
 95%|█████████▍| 18/19 [00:57<00:02,  2.93s/it]

3433


100%|██████████| 1545/1545 [00:00<00:00, 2003152.91it/s]
100%|██████████| 19/19 [01:05<00:00,  3.44s/it]

3633





In [11]:
adata.var['long_protein_name'] = adata.var['gene'].map(gene_to_protein)
adata.var['long_protein_name'].fillna("None", inplace=True)
adata.var

Unnamed: 0,gene,long_protein_name
cds-AF52_RS00165,AF52_RS00165,
cds-AF52_RS00180,AF52_RS00180,
cds-AF52_RS00620,AF52_RS00620,
cds-AF52_RS01000,AF52_RS01000,
cds-AF52_RS01010,AF52_RS01010,
...,...,...
cds-WP_187079193.1,WP_187079193.1,WP_187079193.1 aspartate aminotransferase fami...
cds-WP_187079194.1,WP_187079194.1,WP_187079194.1 ribokinase [Klebsiella pneumoniae]
cds-WP_187079195.1,WP_187079195.1,WP_187079195.1 HlyD family efflux transporter ...
cds-WP_187079196.1,WP_187079196.1,WP_187079196.1 6-carboxytetrahydropterin synth...


In [12]:
adata.var['species'] = adata.var['long_protein_name'].str.extract(r'\[([^]]+)\]$')
adata.var.species.value_counts()

Klebsiella                     2291
Klebsiella pneumoniae           741
Enterobacteriaceae              332
Gammaproteobacteria             121
Enterobacterales                 67
Bacteria                         54
Klebsiella/Raoultella group      15
Pseudomonadota                    9
Klebsiella quasipneumoniae        2
Klebsiella grimontii              1
Name: species, dtype: int64

In [13]:
adata.var['protein_name'] = adata.var['long_protein_name'].str.split(' ', 1).str[1].str.rsplit('[', 1).str[0].str.strip()
adata.var

Unnamed: 0,gene,long_protein_name,species,protein_name
cds-AF52_RS00165,AF52_RS00165,,,
cds-AF52_RS00180,AF52_RS00180,,,
cds-AF52_RS00620,AF52_RS00620,,,
cds-AF52_RS01000,AF52_RS01000,,,
cds-AF52_RS01010,AF52_RS01010,,,
...,...,...,...,...
cds-WP_187079193.1,WP_187079193.1,WP_187079193.1 aspartate aminotransferase fami...,Klebsiella pneumoniae,aspartate aminotransferase family protein
cds-WP_187079194.1,WP_187079194.1,WP_187079194.1 ribokinase [Klebsiella pneumoniae],Klebsiella pneumoniae,ribokinase
cds-WP_187079195.1,WP_187079195.1,WP_187079195.1 HlyD family efflux transporter ...,Klebsiella pneumoniae,HlyD family efflux transporter periplasmic ada...
cds-WP_187079196.1,WP_187079196.1,WP_187079196.1 6-carboxytetrahydropterin synth...,Klebsiella pneumoniae,6-carboxytetrahydropterin synthase QueD


In [17]:
adata.var['rRNA'] = adata.var['long_protein_name'].str.contains(r'30S|50S')
adata.var["rRNA"].value_counts()

False    3926
True       61
Name: rRNA, dtype: int64

In [18]:
sc.write(adata=adata, filename="../data/preprocessed/MGH66_P2_meropenem.h5ad")

# P3

In [3]:
adata = sc.read_csv(filename=path_raw_data + experiments[2], delimiter='\t')

In [4]:
adata

AnnData object with n_obs × n_vars = 3391 × 390322

In [5]:
obs_tmp = adata.var.copy()
var_tmp = adata.obs.copy()
X_tmp = csr_matrix(adata.X.T)

adata = sc.AnnData(X=X_tmp, obs=obs_tmp, var=var_tmp)
adata

AnnData object with n_obs × n_vars = 390322 × 3391

In [6]:
adata.var["gene"] = [gene[4:] for gene in adata.var.index]
adata.var

Unnamed: 0,gene
cds-AF52_RS00165,AF52_RS00165
cds-AF52_RS00275,AF52_RS00275
cds-AF52_RS00620,AF52_RS00620
cds-AF52_RS01000,AF52_RS01000
cds-AF52_RS01010,AF52_RS01010
...,...
cds-WP_187079193.1,WP_187079193.1
cds-WP_187079194.1,WP_187079194.1
cds-WP_187079195.1,WP_187079195.1
cds-WP_187079196.1,WP_187079196.1


In [7]:
accession_numbers = [gene[4:] for gene in adata.var[adata.var.index.str.startswith("cds-WP")].index.values]
len(accession_numbers)

3246

In [8]:
batch_size = 200
gene_to_protein = dict()

for i in tqdm(range(0, len(accession_numbers), batch_size)):
    try:
        batch = accession_numbers[i:i + batch_size]
        protein_names = fetch_protein_names(batch)
        gene_to_protein.update(dict(zip(batch, protein_names)))
        print(len(gene_to_protein))
    except Exception as e:
        print(f"Error fetching batch {batch}: {e}")
        continue

100%|██████████| 1251/1251 [00:00<00:00, 2405811.24it/s]
  6%|▌         | 1/17 [00:03<00:57,  3.56s/it]

200


100%|██████████| 1320/1320 [00:00<00:00, 2724646.30it/s]
 12%|█▏        | 2/17 [00:07<01:00,  4.05s/it]

400


100%|██████████| 1279/1279 [00:00<00:00, 2749623.18it/s]
 18%|█▊        | 3/17 [00:11<00:52,  3.75s/it]

600


100%|██████████| 1272/1272 [00:00<00:00, 3181368.33it/s]
 24%|██▎       | 4/17 [00:14<00:47,  3.65s/it]

800


100%|██████████| 1293/1293 [00:00<00:00, 3207117.13it/s]
 29%|██▉       | 5/17 [00:17<00:39,  3.32s/it]

1000


100%|██████████| 1241/1241 [00:00<00:00, 3081782.87it/s]
 35%|███▌      | 6/17 [00:20<00:34,  3.12s/it]

1200


100%|██████████| 1309/1309 [00:00<00:00, 2479830.14it/s]
 41%|████      | 7/17 [00:23<00:30,  3.03s/it]

1399


100%|██████████| 1577/1577 [00:00<00:00, 3228119.77it/s]
 47%|████▋     | 8/17 [00:26<00:28,  3.13s/it]

1599


100%|██████████| 1451/1451 [00:00<00:00, 1754377.37it/s]
 53%|█████▎    | 9/17 [00:30<00:26,  3.34s/it]

1797


100%|██████████| 1469/1469 [00:00<00:00, 2615209.07it/s]
 59%|█████▉    | 10/17 [00:35<00:27,  3.88s/it]

1997


100%|██████████| 1435/1435 [00:00<00:00, 3340081.15it/s]
 65%|██████▍   | 11/17 [00:40<00:24,  4.13s/it]

2197


100%|██████████| 943/943 [00:00<00:00, 2985078.24it/s]
 71%|███████   | 12/17 [00:43<00:20,  4.00s/it]

2315


100%|██████████| 1480/1480 [00:00<00:00, 3105337.63it/s]
 76%|███████▋  | 13/17 [00:48<00:16,  4.22s/it]

2510


100%|██████████| 1615/1615 [00:00<00:00, 2967061.31it/s]
 82%|████████▏ | 14/17 [00:54<00:14,  4.84s/it]

2709


100%|██████████| 1615/1615 [00:00<00:00, 2235577.87it/s]
 88%|████████▊ | 15/17 [00:59<00:09,  4.70s/it]

2909


100%|██████████| 1671/1671 [00:00<00:00, 1697839.63it/s]
 94%|█████████▍| 16/17 [01:03<00:04,  4.50s/it]

3109


100%|██████████| 280/280 [00:00<00:00, 1727066.35it/s]
100%|██████████| 17/17 [01:07<00:00,  3.99s/it]

3155





In [9]:
adata.var['long_protein_name'] = adata.var['gene'].map(gene_to_protein)
adata.var['long_protein_name'].fillna("None", inplace=True)

adata.var['species'] = adata.var['long_protein_name'].str.extract(r'\[([^]]+)\]$')

adata.var['protein_name'] = adata.var['long_protein_name'].str.split(' ', 1).str[1].str.rsplit('[', 1).str[0].str.strip()

adata.var['rRNA'] = adata.var['long_protein_name'].str.contains(r'30S|50S')

adata.var

Unnamed: 0,gene,long_protein_name,species,protein_name,rRNA
cds-AF52_RS00165,AF52_RS00165,,,,False
cds-AF52_RS00275,AF52_RS00275,,,,False
cds-AF52_RS00620,AF52_RS00620,,,,False
cds-AF52_RS01000,AF52_RS01000,,,,False
cds-AF52_RS01010,AF52_RS01010,,,,False
...,...,...,...,...,...
cds-WP_187079193.1,WP_187079193.1,WP_187079193.1 aspartate aminotransferase fami...,Klebsiella pneumoniae,aspartate aminotransferase family protein,False
cds-WP_187079194.1,WP_187079194.1,WP_187079194.1 ribokinase [Klebsiella pneumoniae],Klebsiella pneumoniae,ribokinase,False
cds-WP_187079195.1,WP_187079195.1,WP_187079195.1 HlyD family efflux transporter ...,Klebsiella pneumoniae,HlyD family efflux transporter periplasmic ada...,False
cds-WP_187079196.1,WP_187079196.1,WP_187079196.1 6-carboxytetrahydropterin synth...,Klebsiella pneumoniae,6-carboxytetrahydropterin synthase QueD,False


In [10]:
sc.write(adata=adata, filename="../data/preprocessed/MGH66_P3_ciprofloxacin.h5ad")

# P4

In [3]:
adata = sc.read_csv(filename=path_raw_data + experiments[0], delimiter='\t')

In [4]:
obs_tmp = adata.var.copy()
var_tmp = adata.obs.copy()
X_tmp = csr_matrix(adata.X.T)

adata = sc.AnnData(X=X_tmp, obs=obs_tmp, var=var_tmp)
adata

AnnData object with n_obs × n_vars = 410143 × 4066

In [5]:
adata.var["gene"] = [gene[4:] for gene in adata.var.index]
adata.var

Unnamed: 0,gene
cds-AF52_RS00180,AF52_RS00180
cds-AF52_RS00275,AF52_RS00275
cds-AF52_RS00620,AF52_RS00620
cds-AF52_RS01000,AF52_RS01000
cds-AF52_RS01015,AF52_RS01015
...,...
cds-WP_187079192.1,WP_187079192.1
cds-WP_187079193.1,WP_187079193.1
cds-WP_187079194.1,WP_187079194.1
cds-WP_187079196.1,WP_187079196.1


In [6]:
accession_numbers = [gene[4:] for gene in adata.var[adata.var.index.str.startswith("cds-WP")].index.values]
len(accession_numbers)

3881

In [7]:
batch_size = 200
gene_to_protein = dict()

for i in tqdm(range(0, len(accession_numbers), batch_size)):
    try:
        batch = accession_numbers[i:i + batch_size]
        protein_names = fetch_protein_names(batch)
        gene_to_protein.update(dict(zip(batch, protein_names)))
        print(len(gene_to_protein))
    except Exception as e:
        print(f"Error fetching batch {batch}: {e}")
        continue

100%|██████████| 1237/1237 [00:00<00:00, 927485.53it/s]
  5%|▌         | 1/20 [00:03<01:09,  3.64s/it]

200


100%|██████████| 1291/1291 [00:00<00:00, 3092430.88it/s]
 10%|█         | 2/20 [00:08<01:14,  4.15s/it]

400


100%|██████████| 1243/1243 [00:00<00:00, 3043502.55it/s]
 15%|█▌        | 3/20 [00:12<01:12,  4.27s/it]

600


100%|██████████| 1241/1241 [00:00<00:00, 2663833.81it/s]
 20%|██        | 4/20 [00:21<01:39,  6.22s/it]

800


100%|██████████| 1315/1315 [00:00<00:00, 1250115.54it/s]
 25%|██▌       | 5/20 [00:33<02:02,  8.15s/it]

1000


100%|██████████| 1264/1264 [00:00<00:00, 1414137.17it/s]
 30%|███       | 6/20 [00:36<01:29,  6.37s/it]

1200


100%|██████████| 1239/1239 [00:00<00:00, 1492887.86it/s]
 35%|███▌      | 7/20 [00:40<01:12,  5.60s/it]

1400


100%|██████████| 1302/1302 [00:00<00:00, 3028831.84it/s]
 40%|████      | 8/20 [00:46<01:09,  5.78s/it]

1600


100%|██████████| 1497/1497 [00:00<00:00, 3083925.88it/s]
 45%|████▌     | 9/20 [00:51<01:00,  5.54s/it]

1800


100%|██████████| 1523/1523 [00:00<00:00, 1761700.22it/s]
 50%|█████     | 10/20 [00:57<00:56,  5.69s/it]

2000


100%|██████████| 1464/1464 [00:00<00:00, 2707434.33it/s]
 55%|█████▌    | 11/20 [01:04<00:54,  6.06s/it]

2199


100%|██████████| 1403/1403 [00:00<00:00, 3108615.17it/s]
 60%|██████    | 12/20 [01:10<00:47,  5.93s/it]

2399


100%|██████████| 1391/1391 [00:00<00:00, 3101688.92it/s]
 65%|██████▌   | 13/20 [01:14<00:38,  5.45s/it]

2599


100%|██████████| 840/840 [00:00<00:00, 2013265.92it/s]
 70%|███████   | 14/20 [01:17<00:27,  4.62s/it]

2699


100%|██████████| 1498/1498 [00:00<00:00, 1527983.32it/s]
 75%|███████▌  | 15/20 [01:21<00:23,  4.70s/it]

2894


100%|██████████| 1569/1569 [00:00<00:00, 2687163.32it/s]
 80%|████████  | 16/20 [01:37<00:31,  7.94s/it]

3094


100%|██████████| 1591/1591 [00:00<00:00, 3390821.98it/s]
 85%|████████▌ | 17/20 [01:51<00:29,  9.78s/it]

3293


100%|██████████| 1594/1594 [00:00<00:00, 3214288.74it/s]
 90%|█████████ | 18/20 [02:01<00:19,  9.72s/it]

3493


100%|██████████| 1666/1666 [00:00<00:00, 2886290.98it/s]
 95%|█████████▌| 19/20 [02:07<00:08,  8.65s/it]

3693


100%|██████████| 552/552 [00:00<00:00, 2704738.09it/s]
100%|██████████| 20/20 [02:12<00:00,  6.62s/it]

3774





In [8]:
adata.var['long_protein_name'] = adata.var['gene'].map(gene_to_protein)
adata.var['long_protein_name'].fillna("None", inplace=True)

adata.var['species'] = adata.var['long_protein_name'].str.extract(r'\[([^]]+)\]$')

adata.var['protein_name'] = adata.var['long_protein_name'].str.split(' ', 1).str[1].str.rsplit('[', 1).str[0].str.strip()

adata.var['rRNA'] = adata.var['long_protein_name'].str.contains(r'30S|50S')

adata.var

Unnamed: 0,gene,long_protein_name,species,protein_name,rRNA
cds-AF52_RS00180,AF52_RS00180,,,,False
cds-AF52_RS00275,AF52_RS00275,,,,False
cds-AF52_RS00620,AF52_RS00620,,,,False
cds-AF52_RS01000,AF52_RS01000,,,,False
cds-AF52_RS01015,AF52_RS01015,,,,False
...,...,...,...,...,...
cds-WP_187079192.1,WP_187079192.1,WP_187079192.1 IS5-like element IS903B family ...,Klebsiella pneumoniae,IS5-like element IS903B family transposase,False
cds-WP_187079193.1,WP_187079193.1,WP_187079193.1 aspartate aminotransferase fami...,Klebsiella pneumoniae,aspartate aminotransferase family protein,False
cds-WP_187079194.1,WP_187079194.1,WP_187079194.1 ribokinase [Klebsiella pneumoniae],Klebsiella pneumoniae,ribokinase,False
cds-WP_187079196.1,WP_187079196.1,WP_187079196.1 6-carboxytetrahydropterin synth...,Klebsiella pneumoniae,6-carboxytetrahydropterin synthase QueD,False


In [9]:
sc.write(adata=adata, filename="../data/preprocessed/MGH66_P4_gentamicin.h5ad")