In [None]:
## INSTALL REQUIRED PACKAGES
! pip install anndata hdf5plugin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hdf5plugin
  Downloading hdf5plugin-3.3.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.7 MB)
[K     |████████████████████████████████| 9.7 MB 3.6 MB/s 
Installing collected packages: hdf5plugin
Successfully installed hdf5plugin-3.3.1


In [None]:
## IMPORTS
import pandas as pd
import numpy as np
import anndata as ad

import h5py
import hdf5plugin
import os

from google.colab import drive

In [None]:
## MOUNT GOOGLE DRIVE
# Mount Google Drive at '/content/drive'; DATA_DIR (defined in the constants cell)
# points below this mount point, so every later file access depends on this call.
drive.mount('/content/drive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
## CONSTANTS
# Directory on the mounted Drive that holds the input files read by this notebook
# and under which the output sub-directory is created.
DATA_DIR = '/content/drive/My Drive/Thesis/dance/dance/data'
# Name of the output sub-directory and prefix of the written .h5ad file names.
SUBTASK_NAME = 'openproblems_competition_2022_cite'
# Seed handed to np.random.seed before the train/test permutation is drawn.
RANDOM_SEED = 123

In [None]:
# Open the RNA input file read-only and grab the h5py object holding its values.
# Nothing is copied into a numpy array here; data are read when `rna_vals` is indexed.
rna_path = os.path.join(DATA_DIR, 'train_cite_inputs.h5')
rna_file = h5py.File(rna_path, mode='r')
rna_vals = rna_file['train_cite_inputs']['block0_values']

In [None]:
# Create shuffled row positions and hold out ~10% of them as the test partition.
# Seed whenever a seed is configured; `is not None` keeps a seed of 0 effective
# (a plain truthiness check would silently skip seeding for 0).
if RANDOM_SEED is not None:
    np.random.seed(RANDOM_SEED)
n_rows = rna_vals.shape[0]
ten_percent_part = round(n_rows * 0.1)
idx = np.random.permutation(n_rows)
# Split at an explicit position instead of slicing with a negative offset:
# when the 10% part rounds to 0, `idx[:-0]` is empty and `idx[-0:]` is everything,
# which would put every row in the test set.
n_train = n_rows - ten_percent_part
# Sorted so that the positions are in increasing order when used for indexing.
train_idx = sorted(idx[:n_train])
test_idx = sorted(idx[n_train:])

In [None]:
# Extract the axis labels of the RNA matrix from the file into memory, so that
# nothing below keeps depending on the HDF5 file staying open.
def _labels_as_str(labels):
    """Return `labels` as a numpy array of str, decoding bytes entries as UTF-8."""
    # NOTE(review): h5py is expected to return these string labels as bytes - confirm.
    # Labels that are already str pass through unchanged.
    return np.array([lab.decode('utf-8') if isinstance(lab, bytes) else str(lab) for lab in labels])

rna_group = rna_file['train_cite_inputs']
# Read each axis once with `[:]` and select rows in memory instead of issuing a
# separate fancy-indexed HDF5 read per split.
rna_obs_names = _labels_as_str(rna_group['axis1'][:])
rna_var_names = _labels_as_str(rna_group['axis0'][:])

train_obs_names = rna_obs_names[train_idx]
train_var_names = rna_var_names

test_obs_names = rna_obs_names[test_idx]
test_var_names = rna_var_names

In [None]:
# Build the metadata table for the train cells: keep the 'citeseq' rows of the
# selected days and donors, drop the technology column and index the rows by cell id.
metadata = pd.read_csv(os.path.join(DATA_DIR, 'metadata.csv'))
train_days = [2, 3, 4, 5]
train_donors = [13176, 31800, 32606]
cite_metadata = (
    metadata[metadata['technology'] == 'citeseq']
    .loc[lambda df: df['day'].isin(train_days) & df['donor'].isin(train_donors)]
    .drop(columns='technology')
    .set_index('cell_id')
)

In [None]:
# Wrap the train rows and the test rows of the RNA values in separate AnnData objects.
# Indexing `rna_vals` with the sorted positions is what reads the rows from the file.
cite_train_mod1 = ad.AnnData(X=rna_vals[train_idx])
cite_test_mod1 = ad.AnnData(X=rna_vals[test_idx])

In [None]:
# Add axis names
# Row (obs) labels are the per-split labels taken from 'axis1' of the RNA file;
# column (var) labels come from 'axis0' and are the same for both splits.
cite_train_mod1.obs_names = train_obs_names
cite_train_mod1.var_names = train_var_names

cite_test_mod1.obs_names = test_obs_names
cite_test_mod1.var_names = test_var_names

In [None]:
# Add metadata DataFrame to AnnData obs
# Align the metadata to each AnnData by cell id (label-based) rather than by row
# position: positional `.iloc` is only correct if the filtered metadata rows are in
# exactly the same order as the rows of the HDF5 matrix, which nothing here checks.
# `.loc` raises a KeyError if a cell id has no metadata row, so a mismatch is loud.
def _cell_ids(adata):
    """Return the obs names of `adata` as a list of str (bytes entries are decoded as UTF-8)."""
    return [name.decode('utf-8') if isinstance(name, bytes) else str(name) for name in adata.obs_names]

# `.copy()` gives each AnnData its own frame instead of a slice of `cite_metadata`.
cite_train_mod1.obs = cite_metadata.loc[_cell_ids(cite_train_mod1)].copy()
cite_test_mod1.obs = cite_metadata.loc[_cell_ids(cite_test_mod1)].copy()

In [None]:
# Write the mod1 train and test AnnData objects into the subtask directory,
# creating that directory first if it does not exist yet.
output_dir = os.path.join(DATA_DIR, SUBTASK_NAME)
os.makedirs(output_dir, exist_ok=True)
cite_train_mod1.write(os.path.join(output_dir, f'{SUBTASK_NAME}.censor_dataset.output_train_mod1.h5ad'))
cite_test_mod1.write(os.path.join(output_dir, f'{SUBTASK_NAME}.censor_dataset.output_test_mod1.h5ad'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[key] = c


In [None]:
# Release the RNA file and the mod1 AnnData objects before the next file is opened.
# Close the HDF5 file explicitly: `del` only removes this name, and other h5py
# objects obtained from the file (e.g. `rna_vals`) may otherwise keep it open.
rna_file.close()
del rna_file
del cite_train_mod1
del cite_test_mod1

In [None]:
# Open the target file read-only and grab the h5py object holding its values.
# Data are read from disk only when `prot_vals` is indexed.
prot_path = os.path.join(DATA_DIR, 'train_cite_targets.h5')
prot_file = h5py.File(prot_path, mode='r')
prot_vals = prot_file['train_cite_targets']['block0_values']

In [None]:
# Wrap the train rows and the test rows of the target values in separate AnnData
# objects, reusing the row positions drawn for the mod1 split.
cite_train_mod2 = ad.AnnData(X=prot_vals[train_idx])
cite_test_mod2 = ad.AnnData(X=prot_vals[test_idx])

In [None]:
# Add axis names
# Read each label axis of the target file once, convert the labels to str, and
# select the rows of each split in memory.
def _labels_as_str(labels):
    """Return `labels` as a numpy array of str, decoding bytes entries as UTF-8."""
    # NOTE(review): h5py is expected to return these string labels as bytes - confirm.
    # Labels that are already str pass through unchanged.
    return np.array([lab.decode('utf-8') if isinstance(lab, bytes) else str(lab) for lab in labels])

prot_group = prot_file['train_cite_targets']
prot_obs_names = _labels_as_str(prot_group['axis1'][:])
prot_var_names = _labels_as_str(prot_group['axis0'][:])

cite_train_mod2.obs_names = prot_obs_names[train_idx]
cite_train_mod2.var_names = prot_var_names

cite_test_mod2.obs_names = prot_obs_names[test_idx]
cite_test_mod2.var_names = prot_var_names

In [None]:
# Add metadata DataFrame to AnnData obs
# Align the metadata to each AnnData by cell id (label-based) rather than by row
# position: positional `.iloc` is only correct if the filtered metadata rows are in
# exactly the same order as the rows of the HDF5 matrix, which nothing here checks.
# `.loc` raises a KeyError if a cell id has no metadata row, so a mismatch is loud.
def _cell_ids(adata):
    """Return the obs names of `adata` as a list of str (bytes entries are decoded as UTF-8)."""
    return [name.decode('utf-8') if isinstance(name, bytes) else str(name) for name in adata.obs_names]

# `.copy()` gives each AnnData its own frame instead of a slice of `cite_metadata`.
cite_train_mod2.obs = cite_metadata.loc[_cell_ids(cite_train_mod2)].copy()
cite_test_mod2.obs = cite_metadata.loc[_cell_ids(cite_test_mod2)].copy()

In [None]:
# Write the mod2 train and test AnnData objects into the subtask directory.
output_dir = os.path.join(DATA_DIR, SUBTASK_NAME)
cite_train_mod2.write(os.path.join(output_dir, f'{SUBTASK_NAME}.censor_dataset.output_train_mod2.h5ad'))
cite_test_mod2.write(os.path.join(output_dir, f'{SUBTASK_NAME}.censor_dataset.output_test_mod2.h5ad'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[key] = c


In [None]:
# Release the target file and the mod2 AnnData objects.
# Close the HDF5 file explicitly: `del` only removes this name, and other h5py
# objects obtained from the file (e.g. `prot_vals`) may otherwise keep it open.
prot_file.close()
del prot_file
del cite_train_mod2
del cite_test_mod2