In [2]:
!pip install pytorch_lightning
import pytorch_lightning as pl
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import torch
import torch.nn as nn
from torchmetrics import PearsonCorrCoef, MeanSquaredError
!pip install colorama
import os, gc, pickle, datetime, scipy.sparse
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from colorama import Fore, Back, Style

from sklearn.model_selection import GroupKFold, train_test_split
from sklearn.preprocessing import StandardScaler, scale, MinMaxScaler
from sklearn.decomposition import TruncatedSVD

!pip install tensorflow
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping
from tensorflow.keras.layers import Dense, Input, Concatenate
from tensorflow.keras.utils import plot_model

%cd /home/jovyan/kaggle/working

%cd robust-pca
import r_pca
%cd ..

def delete_columns_csr(mat, indices):
    """
    Remove the rows denoted by ``indices`` form the CSR sparse matrix ``mat``.
    """
    if not isinstance(mat, scipy.sparse.csr_matrix):
        raise ValueError("works only for CSR format -- use .tocsr() first")
    mask = np.ones(mat.shape[1], dtype=bool)
    mask[indices] = False
    return mat[:,mask]

def correlation_score(y_true, y_pred):
    """Scores the predictions according to the competition rules. 
    
    It is assumed that the predictions are not constant.
    
    Returns the average of each sample's Pearson correlation coefficient"""
    if type(y_true) == pd.DataFrame: y_true = y_true.values
    if type(y_pred) == pd.DataFrame: y_pred = y_pred.values
    corrsum = 0
    for i in range(len(y_true)):
        corrsum += np.corrcoef(y_true[i], y_pred[i])[1, 0]
    return corrsum / len(y_true)

def negative_correlation_loss(y_true, y_pred):
    """Negative correlation loss function for Keras
    
    Precondition:
    y_true.mean(axis=1) == 0
    y_true.std(axis=1) == 1
    
    Returns:
    -1 = perfect positive correlation
    1 = totally negative correlation
    """
    my = K.mean(tf.convert_to_tensor(y_pred), axis=1)
    my = tf.tile(tf.expand_dims(my, axis=1), (1, y_true.shape[1]))
    ym = y_pred - my
    r_num = K.sum(tf.multiply(y_true, ym), axis=1)
    r_den = tf.sqrt(K.sum(K.square(ym), axis=1) * float(y_true.shape[-1]))
    r = tf.reduce_mean(r_num / r_den)
    return - r

def negative_pearson_loss(y_true, y_pred):
    """Negative correlation loss function for Keras
    
    Precondition:
    y_true.mean(axis=1) == 0
    y_true.std(axis=1) == 1
    
    Returns:
    -1 = perfect positive correlation
    1 = totally negative correlation
    """

    vx = y_pred - torch.mean(y_pred,1,keepdim=True)
    vy = y_true - torch.mean(y_true,1,keepdim=True)

    cost = torch.mean(torch.sum(vx * vy,1) / (torch.sqrt(torch.sum(vx ** 2,1)) * torch.sqrt(torch.sum(vy ** 2,1))))

    return - cost

def my_model(n_inputs,n_outputs, reg1, reg2):
    """Sequential neural network
    
    Returns a compiled instance of tensorflow.keras.models.Model.
    """
    activation = 'swish'

    
    inputs = Input(shape=(n_inputs, ))
    x0 = Dense(256, kernel_regularizer=tf.keras.regularizers.l2(reg1),
              activation=activation,
             )(inputs)
    x1 = Dense(256, kernel_regularizer=tf.keras.regularizers.l2(reg1),
              activation=activation,
             )(x0)
    x2 = Dense(256, kernel_regularizer=tf.keras.regularizers.l2(reg1),
              activation=activation,
             )(x1)
    x3 = Dense(128, kernel_regularizer=tf.keras.regularizers.l2(reg1),
              activation=activation,
             )(x2)
    x = Concatenate()([x0, x1, x2, x3])
    x = Dense(n_outputs, kernel_regularizer=tf.keras.regularizers.l2(reg2),
              #activation=activation,
             )(x)
    regressor = Model(inputs, x)
    regressor.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LR_START),
                      metrics=[negative_correlation_loss],
                      loss=negative_correlation_loss
                     )
    
    return regressor

# Cross-validation
VERBOSE = 2 # set to 2 for more output, set to 0 for less output
EPOCHS = 1000
N_SPLITS = 3

def fit(X_train, y_train):
    np.random.seed(1)
    tf.random.set_seed(1)

    kf = GroupKFold(n_splits=N_SPLITS)
    score_list = []
    for fold, (idx_tr, idx_va) in enumerate(kf.split(X_train, groups=meta.donor)):
        start_time = datetime.datetime.now()
        model = None
        gc.collect()
        X_tr = X_train[idx_tr]
        y_tr = y_train[idx_tr]
        X_va = X_train[idx_va]
        y_va = y_train[idx_va]

        lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, 
                              patience=4, verbose=VERBOSE)
        es = EarlyStopping(monitor="val_loss",
                          patience=12, 
                          verbose=0,
                          mode="min", 
                          restore_best_weights=True)
        callbacks = [lr, es, tf.keras.callbacks.TerminateOnNaN()]

        # Construct and compile the model
        model = my_model(n_inputs=X_train.shape[1],n_outputs=y_train.shape[1],reg1=reg1,reg2=reg2)

        # Train the model
        history = model.fit(X_tr, y_tr, 
                            validation_data=(X_va, y_va), 
                            epochs=EPOCHS,
                            verbose=VERBOSE,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            callbacks=callbacks)
        del X_tr, y_tr

        # We validate the model
        y_pred = model.predict(X_va, batch_size=X_va.shape[0])
        corrscore = correlation_score(y_va, y_pred)

        print(f"Fold {fold}: {es.stopped_epoch:3} epochs, corr =  {corrscore:.5f}")
        del es, X_va#, y_va, y_va_pred
        score_list.append(corrscore)

        return model

    # Show overall score
    print(f"{Fore.GREEN}{Style.BRIGHT}Average  corr = {np.array(score_list).mean():.5f}{Style.RESET_ALL}")

/home/jovyan/kaggle/working
/home/jovyan/kaggle/working/robust-pca
/home/jovyan/kaggle/working


In [2]:
%%time

%cd /home/jovyan/kaggle/working

X_train = scipy.sparse.load_npz('./sparse/train_cite_inputs_values.sparse.npz')

to_drop = np.genfromtxt("./sparse/drop_ids.csv", delimiter=',', dtype = int)
X_train = delete_columns_csr(X_train,to_drop)

print(f"X shape: {str(X_train.shape):14} {X_train.size*4/1024/1024/1024:2.3f} GByte")

with np.load('./sparse/train_cite_inputs_idxcol.npz',allow_pickle=True) as data:
    cell_index = data['index']

metadata_df = pd.read_csv('../input/open-problems-multimodal/metadata.csv', index_col='cell_id')
metadata_df = metadata_df[metadata_df.technology=="citeseq"]
meta = metadata_df.reindex(cell_index)

X_test = scipy.sparse.load_npz('./sparse/test_cite_inputs_values.sparse.npz')

X_test = delete_columns_csr(X_test,to_drop)

with np.load('./sparse/test_cite_inputs_idxcol.npz',allow_pickle=True) as data:
    cell_index_t = data['index']

meta_t = metadata_df.reindex(cell_index_t)

# Apply the singular value decomposition
both = scipy.sparse.vstack([X_train, X_test])
assert both.shape[0] == 119651
print(f"Shape of both before SVD: {both.shape}")

rpca = r_pca.R_pca(both,128)
both = rpca.fit(max_iter=5, iter_print=1)

#print(f"Saving checkpoint")

#np.savetxt("both.csv", both, delimiter=",")

#print(f"Starting SVD")

#svd = TruncatedSVD(n_components=128, random_state=1) # 512 is possible
#both = svd.fit_transform(both)
print(f"Shape of both after SVD:  {both.shape}")
    
# Hstack the svd output with the important features
X_train_svd = both[:70988]
X_test_svd = both[70988:]
del both
#X = np.hstack([X, X0])
#Xt = np.hstack([Xt, X0t])
print(f"Reduced X shape:  {str(X_train_svd.shape):14} {X_train_svd.size*4/1024/1024/1024:2.3f} GByte")
print(f"Reduced Xt shape: {str(X_test_svd.shape):14} {X_test_svd.size*4/1024/1024/1024:2.3f} GByte")

%mkdir RPCA_data
%cd RPCA_data
np.savetxt("cite_train_RPCA.csv", X_train_svd, delimiter=",")
np.savetxt("cite_test_RPCA.csv", X_test_svd, delimiter=",")

/home/jovyan/kaggle/working


KeyboardInterrupt: 

In [2]:
%%time

%cd /home/jovyan/kaggle/working

X_train = scipy.sparse.load_npz('./sparse/train_cite_inputs_values.sparse.npz')

to_drop = np.genfromtxt("./sparse/drop_ids.csv", delimiter=',', dtype = int)
X_train = delete_columns_csr(X_train,to_drop)

print(f"X shape: {str(X_train.shape):14} {X_train.size*4/1024/1024/1024:2.3f} GByte")

with np.load('./sparse/train_cite_inputs_idxcol.npz',allow_pickle=True) as data:
    cell_index = data['index']

metadata_df = pd.read_csv('../input/open-problems-multimodal/metadata.csv', index_col='cell_id')
metadata_df = metadata_df[metadata_df.technology=="citeseq"]
meta = metadata_df.reindex(cell_index)

X_test = scipy.sparse.load_npz('./sparse/test_cite_inputs_values.sparse.npz')

X_test = delete_columns_csr(X_test,to_drop)

with np.load('./sparse/test_cite_inputs_idxcol.npz',allow_pickle=True) as data:
    cell_index_t = data['index']

meta_t = metadata_df.reindex(cell_index_t)

# Apply the singular value decomposition
both = scipy.sparse.vstack([X_train, X_test])
assert both.shape[0] == 119651
print(f"Shape of both before SVD: {both.shape}")

# rpca = r_pca.R_pca(both,128)
# both = rpca.fit(max_iter=5, iter_print=1)

from scipy.sparse import csr_matrix, linalg, diags

S = csr_matrix(both.shape)
Y = csr_matrix(both.shape)

mu = np.prod(both.shape) / (4 * linalg.norm(both, ord=1))
print(mu)
lmbda = 1 / np.sqrt(np.max(both.shape))
print(lmbda)
mu_inv = 1 / mu

iter = 0
err = np.Inf
Sk = S
Yk = Y
Lk = csr_matrix(both.shape)

_tol = 1E-7 * linalg.norm(both, ord='fro')
print(_tol)

/home/jovyan/kaggle/working
X shape: (70988, 21601) 1.283 GByte
Shape of both before SVD: (119651, 21601)
554.1250846866038
0.0028909583422683546
0.010944092187499999
CPU times: user 25 s, sys: 4.59 s, total: 29.6 s
Wall time: 29.6 s


In [3]:
%%time
U, S, V = linalg.svds(both,128)

CPU times: user 1h 1min 16s, sys: 2min 55s, total: 1h 4min 11s
Wall time: 19min 10s


In [4]:
%mkdir RPCA_data
%cd RPCA_data
np.savetxt("U1.csv", U, delimiter=",")
np.savetxt("S1.csv", S, delimiter=",")
np.savetxt("V1.csv", V, delimiter=",")

mkdir: cannot create directory ‘RPCA_data’: File exists
/home/jovyan/kaggle/working/RPCA_data


In [3]:
%cd /home/jovyan/kaggle/working/RPCA_data

U = np.genfromtxt("U1.csv", delimiter=',')
S = np.genfromtxt("S1.csv", delimiter=',')
V = np.genfromtxt("V1.csv", delimiter=',')

/home/jovyan/kaggle/working/RPCA_data


In [4]:
S = np.sign(S) * np.maximum((np.abs(S) - mu_inv), np.zeros(S.shape))

In [5]:
diags(S)

<128x128 sparse matrix of type '<class 'numpy.float64'>'
	with 128 stored elements (1 diagonals) in DIAgonal format>

In [6]:
L = U @ (diags(S) @ V)

In [7]:
del U,S,V

In [8]:
L

array([[ 5.71700132e-02,  1.41978933e-01, -1.00791496e-02, ...,
         3.54053240e-01,  2.17319636e+00,  4.84886417e-01],
       [ 5.18036978e-02,  2.64625574e-01,  4.12810565e-02, ...,
         4.56472119e-01,  2.06278105e+00,  2.96084534e-01],
       [ 5.68016423e-02,  1.32965517e-01, -5.24839109e-03, ...,
         1.51795016e+00,  7.31147382e-01,  7.78359985e-01],
       ...,
       [ 8.50840442e-02,  1.82267394e-01,  5.10552254e-02, ...,
         1.67171957e+00,  6.17618056e+00,  1.29554421e+00],
       [ 1.12634205e-01,  2.31315162e-01, -3.81147472e-02, ...,
         2.09271656e+00,  9.27427763e-01,  1.19581842e+00],
       [ 2.69219342e-02,  9.98982313e-02, -8.56521756e-03, ...,
         1.70526248e+00,  1.27825704e+00,  1.14548244e+00]])

In [9]:
both_dense = both.toarray()
del both

In [10]:
def shrink(M,tau):
    return np.sign(M) * np.maximum((np.abs(M) - tau), np.zeros(M.shape))
S = shrink(both_dense - L, mu_inv * lmbda)

In [11]:
Y = (both_dense - L - S)

In [12]:
err = np.linalg.norm(Y, ord='fro')
print(err)

0.26521953537997084


In [None]:
Y = mu*Y

In [None]:
U, Sd, V = linalg.svds(both_dense - S + mu_inv * Y,128)
np.savetxt("U2.csv", U, delimiter=",")
np.savetxt("S2.csv", Sd, delimiter=",")
np.savetxt("V2.csv", V, delimiter=",")

In [12]:
type(Y)

scipy.sparse._csr.csr_matrix

In [4]:
%cd /home/jovyan/kaggle/working/RPCA_data

def shrink(M,tau):
    return np.sign(M) * np.maximum((np.abs(M) - tau), np.zeros(M.shape))

if 'both' in locals():
    both_dense = both.toarray()
    del both

U = np.genfromtxt("U1.csv", delimiter=',')
Sd = np.genfromtxt("S1.csv", delimiter=',')
V = np.genfromtxt("V1.csv", delimiter=',')
L = U @ (diags(shrink(Sd, mu_inv)) @ V)
S = shrink(both_dense - L, mu_inv * lmbda)
Y = (both_dense - L - S)
del S,L
np.savetxt("Y1.csv", Y, delimiter=",")

/home/jovyan/kaggle/working/RPCA_data


OSError: [Errno 28] No space left on device

In [3]:
%cd /home/jovyan/kaggle/working/RPCA_data

def shrink(M,tau):
    return np.sign(M) * np.maximum((np.abs(M) - tau), np.zeros(M.shape))

# if 'both' in locals():
#     both_dense = both.toarray()
#     del both
    
both_dense = both.toarray()
U = np.genfromtxt("U1.csv", delimiter=',')
Sd = np.genfromtxt("S1.csv", delimiter=',')
V = np.genfromtxt("V1.csv", delimiter=',')
L = U @ (diags(shrink(Sd, mu_inv)) @ V)
print('here1')
S = shrink(both_dense - L, mu_inv * lmbda)
Y = (both_dense - L - S)
del S,L,U,Sd,V,both_dense    
#Y = np.genfromtxt("Y1.csv", delimiter=',')
U = np.genfromtxt("U2.csv", delimiter=',')
Sd = np.genfromtxt("S2.csv", delimiter=',')
V = np.genfromtxt("V2.csv", delimiter=',')
L = U @ (diags(shrink(Sd, mu_inv)) @ V)
print('here2')
L = both.toarray() - L + mu_inv * Y
S = shrink(L, mu_inv * lmbda)
print('here3')
Y = mu * (L - S)
print('here4')
err = np.linalg.norm(L - mu_inv * Y - S, ord='fro')
print(err)
# del L
# U, Sd, V = linalg.svds(both_dense - S + mu_inv * Y,128)
# np.savetxt("U3.csv", U, delimiter=",")
# np.savetxt("S3.csv", Sd, delimiter=",")
# np.savetxt("V3.csv", V, delimiter=",")

/home/jovyan/kaggle/working/RPCA_data
here1
here2
here3
here4
8.606694724962422e-19


In [None]:
print('here2')
S = shrink(both.toarray() - L + mu_inv * Y, mu_inv * lmbda)
print('here3')
Y = Y + mu * (both.toarray() - L - S)
print('here4')
err = np.linalg.norm(both.toarray() - L - S, ord='fro')
print(err)
# del L

In [4]:
%%time

%cd /home/jovyan/kaggle/working

# X_train = scipy.sparse.load_npz('./sparse/train_cite_inputs_values.sparse.npz')

# to_drop = np.genfromtxt("./sparse/drop_ids.csv", delimiter=',', dtype = int)
# X_train = delete_columns_csr(X_train,to_drop)

# print(f"X shape: {str(X_train.shape):14} {X_train.size*4/1024/1024/1024:2.3f} GByte")

with np.load('./sparse/train_cite_inputs_idxcol.npz',allow_pickle=True) as data:
    cell_index = data['index']

metadata_df = pd.read_csv('../input/open-problems-multimodal/metadata.csv', index_col='cell_id')
metadata_df = metadata_df[metadata_df.technology=="citeseq"]
meta = metadata_df.reindex(cell_index)

# X_test = scipy.sparse.load_npz('./sparse/test_cite_inputs_values.sparse.npz')

# X_test = delete_columns_csr(X_test,to_drop)

with np.load('./sparse/test_cite_inputs_idxcol.npz',allow_pickle=True) as data:
    cell_index_t = data['index']

meta_t = metadata_df.reindex(cell_index_t)

# # Apply the singular value decomposition
# both = scipy.sparse.vstack([X_train, X_test])
# assert both.shape[0] == 119651
# print(f"Shape of both before SVD: {both.shape}")

U = np.genfromtxt("./RPCA_data/U2.csv", delimiter=',')
Sd = np.genfromtxt("./RPCA_data/S2.csv", delimiter=',')
V = np.genfromtxt("./RPCA_data/V2.csv", delimiter=',')
both = U @ (diags(shrink(Sd, mu_inv)) @ V)

svd = TruncatedSVD(n_components=64, random_state=1) # 512 is possible
both = svd.fit_transform(both)
print(f"Shape of both after SVD:  {both.shape}")
    
# Hstack the svd output with the important features
X_train_svd = both[:70988]
X_test_svd = both[70988:]
del both
#X = np.hstack([X, X0])
#Xt = np.hstack([Xt, X0t])
print(f"Reduced X shape:  {str(X_train_svd.shape):14} {X_train_svd.size*4/1024/1024/1024:2.3f} GByte")
print(f"Reduced Xt shape: {str(X_test_svd.shape):14} {X_test_svd.size*4/1024/1024/1024:2.3f} GByte")

y_train = scipy.sparse.load_npz('./sparse/train_cite_targets_values.sparse.npz')

# Normalize the targets row-wise: This doesn't change the correlations,
# and negative_correlation_loss depends on it
y_train -= y_train.mean(axis=1).reshape(-1, 1)
y_train /= y_train.std(axis=1).reshape(-1, 1)
    
print(f"Y shape: {str(y_train.shape):14} {y_train.size*4/1024/1024/1024:2.3f} GByte")

/home/jovyan/kaggle/working
Shape of both after SVD:  (119651, 64)
Reduced X shape:  (70988, 64)    0.017 GByte
Reduced Xt shape: (48663, 64)    0.012 GByte
Y shape: (70988, 140)   0.037 GByte
CPU times: user 3min 15s, sys: 12.7 s, total: 3min 27s
Wall time: 45.5 s


In [5]:
LR_START = 0.01
BATCH_SIZE = 256
reg1 = 8e-5
reg2 = 2e-5
model = fit(X_train_svd,y_train)

2022-10-13 22:56:32.640165: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-13 22:56:32.641112: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-10-13 22:56:32.641130: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-10-13 22:56:32.642125: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow bin

Epoch 1/1000
181/181 - 3s - loss: -8.2832e-01 - negative_correlation_loss: -8.5247e-01 - val_loss: -8.5993e-01 - val_negative_correlation_loss: -8.7498e-01 - lr: 0.0100 - 3s/epoch - 14ms/step
Epoch 2/1000
181/181 - 1s - loss: -8.8358e-01 - negative_correlation_loss: -8.9415e-01 - val_loss: -8.7059e-01 - val_negative_correlation_loss: -8.7862e-01 - lr: 0.0100 - 1s/epoch - 7ms/step
Epoch 3/1000
181/181 - 1s - loss: -8.8971e-01 - negative_correlation_loss: -8.9663e-01 - val_loss: -8.7566e-01 - val_negative_correlation_loss: -8.8156e-01 - lr: 0.0100 - 1s/epoch - 8ms/step
Epoch 4/1000
181/181 - 1s - loss: -8.9187e-01 - negative_correlation_loss: -8.9741e-01 - val_loss: -8.7558e-01 - val_negative_correlation_loss: -8.8068e-01 - lr: 0.0100 - 1s/epoch - 7ms/step
Epoch 5/1000
181/181 - 1s - loss: -8.9271e-01 - negative_correlation_loss: -8.9766e-01 - val_loss: -8.7743e-01 - val_negative_correlation_loss: -8.8215e-01 - lr: 0.0100 - 1s/epoch - 7ms/step
Epoch 6/1000
181/181 - 1s - loss: -8.9352e-0

In [6]:
!pip install tables
!pip install --user magic-impute

Collecting magic-impute
  Downloading magic_impute-3.0.0-py3-none-any.whl (15 kB)
Collecting scprep>=1.0
  Downloading scprep-1.2.1-py3-none-any.whl (93 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.8/93.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting future
  Downloading future-0.18.2.tar.gz (829 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m829.2/829.2 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting tasklogger>=1.0.0
  Downloading tasklogger-1.2.0-py3-none-any.whl (14 kB)
Collecting graphtools>=1.4.0
  Downloading graphtools-1.5.2-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.3/45.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting pygsp>=0.5.1
  Downloading PyGSP-0.5.1-py2.py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m62.9 MB/s[0m eta 

In [7]:
%%time

%cd /home/jovyan/kaggle/working

X_train = scipy.sparse.load_npz('./sparse/train_cite_inputs_values.sparse.npz')

to_drop = np.genfromtxt("./sparse/drop_ids.csv", delimiter=',', dtype = int)
X_train = delete_columns_csr(X_train,to_drop)

print(f"X shape: {str(X_train.shape):14} {X_train.size*4/1024/1024/1024:2.3f} GByte")

with np.load('./sparse/train_cite_inputs_idxcol.npz',allow_pickle=True) as data:
    cell_index = data['index']

metadata_df = pd.read_csv('../input/open-problems-multimodal/metadata.csv', index_col='cell_id')
metadata_df = metadata_df[metadata_df.technology=="citeseq"]
meta = metadata_df.reindex(cell_index)

X_test = scipy.sparse.load_npz('./sparse/test_cite_inputs_values.sparse.npz')

X_test = delete_columns_csr(X_test,to_drop)

with np.load('./sparse/test_cite_inputs_idxcol.npz',allow_pickle=True) as data:
    cell_index_t = data['index']

meta_t = metadata_df.reindex(cell_index_t)

# Apply the singular value decomposition
both = scipy.sparse.vstack([X_train, X_test])
assert both.shape[0] == 119651

/home/jovyan/kaggle/working
X shape: (70988, 21601) 1.283 GByte
CPU times: user 21.8 s, sys: 2.69 s, total: 24.5 s
Wall time: 24.5 s


In [9]:
both_small = both[:10000]

In [11]:
both_small.shape

(10000, 21601)

In [19]:
%cd MAGIC/python

'/home/jovyan/kaggle/working'

In [18]:
!git clone git://github.com/KrishnaswamyLab/MAGIC.git
%cd MAGIC/python
!python setup.py install --user
import magic

Cloning into 'MAGIC'...
fatal: unable to connect to github.com:
github.com[0: 140.82.112.3]: errno=Connection timed out

[Errno 2] No such file or directory: 'MAGIC/python'
/home/jovyan/kaggle/working
python: can't open file '/home/jovyan/kaggle/working/setup.py': [Errno 2] No such file or directory


ModuleNotFoundError: No module named 'magic'

In [21]:
!conda install magic-impute


Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.

PackagesNotFoundError: The following packages are not available from current channels:

  - magic-impute

Current channels:

  - https://conda.anaconda.org/conda-forge/linux-64
  - https://conda.anaconda.org/conda-forge/noarch
  - https://repo.anaconda.com/pkgs/main/linux-64
  - https://repo.anaconda.com/pkgs/main/noarch
  - https://repo.anaconda.com/pkgs/r/linux-64
  - https://repo.anaconda.com/pkgs/r/noarch

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.




In [27]:
!git clone https://github.com/KrishnaswamyLab/MAGIC.git

Cloning into 'MAGIC'...
remote: Enumerating objects: 4436, done.[K
remote: Counting objects: 100% (359/359), done.[K
remote: Compressing objects: 100% (171/171), done.[K
remote: Total 4436 (delta 176), reused 330 (delta 175), pack-reused 4077[K
Receiving objects: 100% (4436/4436), 221.54 MiB | 72.13 MiB/s, done.
Resolving deltas: 100% (2314/2314), done.


In [1]:
!pip uninstall pandas -y

Found existing installation: pandas 1.3.5
Uninstalling pandas-1.3.5:
  Successfully uninstalled pandas-1.3.5


In [None]:
!conda uninstall pandas

Collecting package metadata (repodata.json): done
Solving environment: - 

In [4]:
from scipy.sparse import SparseArray

ImportError: cannot import name 'SparseArray' from 'scipy.sparse' (/srv/conda/envs/saturn/lib/python3.9/site-packages/scipy/sparse/__init__.py)

In [5]:
%cd /home/jovyan/kaggle/working
%cd MAGIC/python
#!pip install pandas 1.3.5
!python setup.py install --user

import magic
magic_operator = magic.MAGIC(random_state=32)
denoised=magic_operator.fit_transform(both_small)

/home/jovyan/kaggle/working
/home/jovyan/kaggle/working/MAGIC/python
running install
running bdist_egg
running egg_info
writing magic_impute.egg-info/PKG-INFO
writing dependency_links to magic_impute.egg-info/dependency_links.txt
writing requirements to magic_impute.egg-info/requires.txt
writing top-level names to magic_impute.egg-info/top_level.txt
reading manifest file 'magic_impute.egg-info/SOURCES.txt'
writing manifest file 'magic_impute.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
creating build/bdist.linux-x86_64/egg
creating build/bdist.linux-x86_64/egg/magic
copying build/lib/magic/version.py -> build/bdist.linux-x86_64/egg/magic
copying build/lib/magic/magic.py -> build/bdist.linux-x86_64/egg/magic
copying build/lib/magic/__init__.py -> build/bdist.linux-x86_64/egg/magic
copying build/lib/magic/plot.py -> build/bdist.linux-x86_64/egg/magic
copying build/lib/magic/utils.py -> build/bdist.linux-x86_64/egg/magi

AttributeError: module 'sparse' has no attribute 'SparseArray'

In [6]:
!pip install --user magic-impute

Collecting pandas>=0.25
  Using cached pandas-1.3.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.5 MB)
Installing collected packages: pandas
Successfully installed pandas-1.3.5


In [7]:
import magic

AttributeError: module 'sparse' has no attribute 'SparseArray'

In [8]:
import sparse

In [9]:
sparse.SparseArray

AttributeError: module 'sparse' has no attribute 'SparseArray'

In [2]:
!pip install magic-impute

Collecting scipy>=1.1.0
  Downloading scipy-1.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.8/33.8 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: scipy
Successfully installed scipy-1.9.2


In [4]:
import magic

In [11]:
import sparse

In [18]:
import sparse.SparseArray

ModuleNotFoundError: No module named 'sparse.SparseArray'

In [25]:
sparse.__builtins__()

AttributeError: module 'sparse' has no attribute '__builtins__'

In [37]:
!pip install sparse

Collecting sparse
  Using cached sparse-0.13.0-py2.py3-none-any.whl (77 kB)
Installing collected packages: sparse
Successfully installed sparse-0.13.0


In [47]:
!pip uninstall sparse -y

Found existing installation: sparse 0.13.0
Uninstalling sparse-0.13.0:
  Successfully uninstalled sparse-0.13.0


In [1]:
!conda install sparse -y

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.12.0
  latest version: 22.9.0

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /opt/saturncloud/envs/saturn

  added / updated specs:
    - sparse


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2022.9.24  |       ha878542_0         150 KB  conda-forge
    certifi-2022.9.24          |     pyhd8ed1ab_0         155 KB  conda-forge
    sparse-0.13.0              |     pyhd8ed1ab_0          59 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         364 KB

The following NEW packages will be INSTALLED:

  sparse             conda-forge/noarch::sparse-0.13.0-pyhd8ed1ab_0

The following packages will be UPDATED:

  

In [3]:
import os
import sparse
os.path.dirname(sparse.__file__)

'/srv/conda/envs/saturn/lib/python3.9/site-packages/sparse'

In [42]:
import os.path, pkgutil
import sparse
pkgpath = os.path.dirname(sparse.__file__)
print([name for _, name, _ in pkgutil.iter_modules([pkgpath])])

TypeError: expected str, bytes or os.PathLike object, not NoneType

In [27]:
!pip uninstall sparse -y

Found existing installation: sparse 0.13.0
Uninstalling sparse-0.13.0:
  Successfully uninstalled sparse-0.13.0


In [31]:
import pandas

In [32]:
import pkgutil
[name for _, name, _ in pkgutil.iter_modules(['pandas'])]

[]

In [23]:
dir(sparse)

['__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__']

In [22]:
SparseArray

NameError: name 'SparseArray' is not defined

In [20]:
from sparse import *

In [12]:
sparse.SparseArray

AttributeError: module 'sparse' has no attribute 'SparseArray'

In [9]:
!pip install sparse

Collecting sparse
  Downloading sparse-0.13.0-py2.py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.8/77.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sparse
Successfully installed sparse-0.13.0


In [8]:
import pydata.sparse as sparse

ModuleNotFoundError: No module named 'pydata.sparse'

import pydata

In [5]:
!pip install pydata 

Collecting pydata
  Downloading pydata-1.0.0-py3-none-any.whl (5.6 kB)
Collecting lxml
  Downloading lxml-4.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: lxml, pydata
Successfully installed lxml-4.9.1 pydata-1.0.0


In [None]:
!pip uninstall scipy

Found existing installation: scipy 1.8.1
Uninstalling scipy-1.8.1:
  Would remove:
    /opt/saturncloud/envs/saturn/lib/python3.9/site-packages/scipy-1.8.1.dist-info/*
    /opt/saturncloud/envs/saturn/lib/python3.9/site-packages/scipy/*
Proceed (Y/n)? 

In [1]:
!pip uninstall scipy -y

Found existing installation: scipy 1.8.1
Uninstalling scipy-1.8.1:
  Successfully uninstalled scipy-1.8.1


In [11]:
mda_data.shape

(10000, 20372)

In [12]:
both_small.shape

(10000, 21601)

In [10]:
import scprep
mda_data = scprep.filter.remove_empty_genes(both_small)

In [13]:
denoised.shape

(10000, 21601)

In [8]:
import magic
both_small = both[:10000]
magic_operator = magic.MAGIC(random_state=32)
denoised=magic_operator.fit_transform(both_small)

Calculating MAGIC...
  Running MAGIC on 10000 cells and 21601 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...




    Calculated PCA in 60.58 seconds.
    Calculating KNN search...
    Calculated KNN search in 11.29 seconds.
    Calculating affinities...
    Calculated affinities in 11.28 seconds.
  Calculated graph and diffusion operator in 83.16 seconds.
  Running MAGIC with `solver='exact'` on 21601-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.




  Calculating imputation...
  Calculated imputation in 7.92 seconds.
Calculated MAGIC in 91.87 seconds.


In [14]:
%pwd

'/home/jovyan/kaggle/working'

In [15]:
both.shape

(119651, 21601)

In [16]:
import scprep
import magic
both = scprep.filter.remove_empty_genes(both)
magic_operator = magic.MAGIC(random_state=32)
both = magic_operator.fit_transform(both)

svd = TruncatedSVD(n_components=64, random_state=1) # 512 is possible
both = svd.fit_transform(both)
print(f"Shape of both after SVD:  {both.shape}")
    
# Hstack the svd output with the important features
X_train_svd = both[:70988]
X_test_svd = both[70988:]
del both
%mkdir magic_data
%cd magic_data
np.savetxt("X_train_64.csv", X_train_svd, delimiter=",")
np.savetxt("X_test_64.csv", X_test_svd, delimiter=",")

Calculating MAGIC...
  Running MAGIC on 119651 cells and 21601 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 674.53 seconds.
    Calculating KNN search...
    Calculated KNN search in 2310.34 seconds.
    Calculating affinities...




    Calculated affinities in 2302.04 seconds.
  Calculated graph and diffusion operator in 5287.08 seconds.
  Running MAGIC with `solver='exact'` on 21601-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.




  Calculating imputation...
  Calculated imputation in 226.39 seconds.
Calculated MAGIC in 5523.07 seconds.
Shape of both after SVD:  (119651, 64)
/home/jovyan/kaggle/working/magic_data


In [17]:
%%time

%cd /home/jovyan/kaggle/working

X_train = scipy.sparse.load_npz('./sparse/train_cite_inputs_values.sparse.npz')

to_drop = np.genfromtxt("./sparse/drop_ids.csv", delimiter=',', dtype = int)
X_train = delete_columns_csr(X_train,to_drop)

print(f"X shape: {str(X_train.shape):14} {X_train.size*4/1024/1024/1024:2.3f} GByte")

with np.load('./sparse/train_cite_inputs_idxcol.npz',allow_pickle=True) as data:
    cell_index = data['index']

metadata_df = pd.read_csv('../input/open-problems-multimodal/metadata.csv', index_col='cell_id')
metadata_df = metadata_df[metadata_df.technology=="citeseq"]
meta = metadata_df.reindex(cell_index)

X_test = scipy.sparse.load_npz('./sparse/test_cite_inputs_values.sparse.npz')

X_test = delete_columns_csr(X_test,to_drop)

with np.load('./sparse/test_cite_inputs_idxcol.npz',allow_pickle=True) as data:
    cell_index_t = data['index']

meta_t = metadata_df.reindex(cell_index_t)

# Apply the singular value decomposition
both = scipy.sparse.vstack([X_train, X_test])
assert both.shape[0] == 119651

both = magic_operator.transform(both)

svd = TruncatedSVD(n_components=256, random_state=1) # 512 is possible
both_256 = svd.fit_transform(both)
print(f"Shape of both after SVD:  {both_256.shape}")

X_train_svd = both_256[:70988]
X_test_svd = both_256[70988:]

np.savetxt("X_train_256.csv", X_train_svd, delimiter=",")
np.savetxt("X_test_256.csv", X_test_svd, delimiter=",")

/home/jovyan/kaggle/working
X shape: (70988, 21601) 1.283 GByte
Running MAGIC with `solver='exact'` on 21601-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.




Calculating imputation...
Calculated imputation in 226.84 seconds.
Shape of both after SVD:  (119651, 256)
CPU times: user 10min 23s, sys: 1min 1s, total: 11min 25s
Wall time: 5min 26s


In [18]:
%pwd

'/home/jovyan/kaggle/working'

In [19]:
%cd magic_data

/home/jovyan/kaggle/working/magic_data


In [20]:
filename = 'magic_model.sav'
pickle.dump(magic_operator, open(filename, 'wb'))

OSError: [Errno 28] No space left on device

In [21]:
np.savetxt("X_train_256.csv", X_train_svd, delimiter=",")
np.savetxt("X_test_256.csv", X_test_svd, delimiter=",")

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


OSError: [Errno 28] No space left on device

In [24]:
%cd /home/jovyan/kaggle/working
y_train = scipy.sparse.load_npz('./sparse/train_cite_targets_values.sparse.npz')

# Normalize the targets row-wise: This doesn't change the correlations,
# and negative_correlation_loss depends on it
y_train -= y_train.mean(axis=1).reshape(-1, 1)
y_train /= y_train.std(axis=1).reshape(-1, 1)

/home/jovyan/kaggle/working


In [25]:
LR_START = 0.01
BATCH_SIZE = 256
reg1 = 8e-5
reg2 = 2e-5
model = fit(X_train_svd,y_train)

2022-10-14 03:13:05.516379: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-14 03:13:05.517295: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-10-14 03:13:05.517317: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-10-14 03:13:05.517573: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow bin

Epoch 1/1000
181/181 - 3s - loss: -8.2595e-01 - negative_correlation_loss: -8.5186e-01 - val_loss: -8.5754e-01 - val_negative_correlation_loss: -8.7247e-01 - lr: 0.0100 - 3s/epoch - 15ms/step
Epoch 2/1000
181/181 - 2s - loss: -8.7931e-01 - negative_correlation_loss: -8.9021e-01 - val_loss: -8.6614e-01 - val_negative_correlation_loss: -8.7429e-01 - lr: 0.0100 - 2s/epoch - 9ms/step
Epoch 3/1000
181/181 - 2s - loss: -8.8507e-01 - negative_correlation_loss: -8.9215e-01 - val_loss: -8.7085e-01 - val_negative_correlation_loss: -8.7700e-01 - lr: 0.0100 - 2s/epoch - 9ms/step
Epoch 4/1000
181/181 - 2s - loss: -8.8720e-01 - negative_correlation_loss: -8.9303e-01 - val_loss: -8.6951e-01 - val_negative_correlation_loss: -8.7484e-01 - lr: 0.0100 - 2s/epoch - 9ms/step
Epoch 5/1000
181/181 - 2s - loss: -8.8817e-01 - negative_correlation_loss: -8.9333e-01 - val_loss: -8.7246e-01 - val_negative_correlation_loss: -8.7734e-01 - lr: 0.0100 - 2s/epoch - 9ms/step
Epoch 6/1000
181/181 - 2s - loss: -8.8911e-0

In [26]:
%%time

%cd /home/jovyan/kaggle/working

X_train = scipy.sparse.load_npz('./sparse/train_cite_inputs_values.sparse.npz')

to_drop = np.genfromtxt("./sparse/drop_ids.csv", delimiter=',', dtype = int)
X_train = delete_columns_csr(X_train,to_drop)

print(f"X shape: {str(X_train.shape):14} {X_train.size*4/1024/1024/1024:2.3f} GByte")

with np.load('./sparse/train_cite_inputs_idxcol.npz',allow_pickle=True) as data:
    cell_index = data['index']

metadata_df = pd.read_csv('../input/open-problems-multimodal/metadata.csv', index_col='cell_id')
metadata_df = metadata_df[metadata_df.technology=="citeseq"]
meta = metadata_df.reindex(cell_index)

X_test = scipy.sparse.load_npz('./sparse/test_cite_inputs_values.sparse.npz')

X_test = delete_columns_csr(X_test,to_drop)

with np.load('./sparse/test_cite_inputs_idxcol.npz',allow_pickle=True) as data:
    cell_index_t = data['index']

meta_t = metadata_df.reindex(cell_index_t)

# Apply the singular value decomposition
both = scipy.sparse.vstack([X_train, X_test])
assert both.shape[0] == 119651

svd = TruncatedSVD(n_components=256, random_state=1) # 512 is possible
both_256 = svd.fit_transform(both)
print(f"Shape of both after SVD:  {both_256.shape}")

X_train_svd = both_256[:70988]
X_test_svd = both_256[70988:]


/home/jovyan/kaggle/working
X shape: (70988, 21601) 1.283 GByte
Shape of both after SVD:  (119651, 256)
CPU times: user 25min 30s, sys: 7.97 s, total: 25min 38s
Wall time: 24min 58s


In [27]:
LR_START = 0.01
BATCH_SIZE = 256
reg1 = 8e-5
reg2 = 2e-5
model = fit(X_train_svd,y_train)

Epoch 1/1000
181/181 - 2s - loss: -8.2551e-01 - negative_correlation_loss: -8.5061e-01 - val_loss: -8.5943e-01 - val_negative_correlation_loss: -8.7513e-01 - lr: 0.0100 - 2s/epoch - 13ms/step
Epoch 2/1000
181/181 - 2s - loss: -8.8192e-01 - negative_correlation_loss: -8.9427e-01 - val_loss: -8.6866e-01 - val_negative_correlation_loss: -8.7874e-01 - lr: 0.0100 - 2s/epoch - 9ms/step
Epoch 3/1000
181/181 - 2s - loss: -8.8779e-01 - negative_correlation_loss: -8.9670e-01 - val_loss: -8.7420e-01 - val_negative_correlation_loss: -8.8196e-01 - lr: 0.0100 - 2s/epoch - 9ms/step
Epoch 4/1000
181/181 - 2s - loss: -8.9012e-01 - negative_correlation_loss: -8.9755e-01 - val_loss: -8.7356e-01 - val_negative_correlation_loss: -8.8051e-01 - lr: 0.0100 - 2s/epoch - 9ms/step
Epoch 5/1000
181/181 - 2s - loss: -8.9129e-01 - negative_correlation_loss: -8.9795e-01 - val_loss: -8.7669e-01 - val_negative_correlation_loss: -8.8310e-01 - lr: 0.0100 - 2s/epoch - 9ms/step
Epoch 6/1000
181/181 - 2s - loss: -8.9212e-0

In [28]:
LR_START = 0.01
BATCH_SIZE = 256
reg1 = 8e-6
reg2 = 2e-6
model = fit(X_train_svd,y_train)

Epoch 1/1000
181/181 - 2s - loss: -8.5992e-01 - negative_correlation_loss: -8.6672e-01 - val_loss: -8.7606e-01 - val_negative_correlation_loss: -8.8229e-01 - lr: 0.0100 - 2s/epoch - 13ms/step
Epoch 2/1000
181/181 - 2s - loss: -8.9232e-01 - negative_correlation_loss: -8.9810e-01 - val_loss: -8.7818e-01 - val_negative_correlation_loss: -8.8357e-01 - lr: 0.0100 - 2s/epoch - 9ms/step
Epoch 3/1000
181/181 - 2s - loss: -8.9446e-01 - negative_correlation_loss: -8.9964e-01 - val_loss: -8.7823e-01 - val_negative_correlation_loss: -8.8309e-01 - lr: 0.0100 - 2s/epoch - 9ms/step
Epoch 4/1000
181/181 - 2s - loss: -8.9559e-01 - negative_correlation_loss: -9.0043e-01 - val_loss: -8.7927e-01 - val_negative_correlation_loss: -8.8392e-01 - lr: 0.0100 - 2s/epoch - 9ms/step
Epoch 5/1000
181/181 - 2s - loss: -8.9635e-01 - negative_correlation_loss: -9.0091e-01 - val_loss: -8.8146e-01 - val_negative_correlation_loss: -8.8585e-01 - lr: 0.0100 - 2s/epoch - 9ms/step
Epoch 6/1000
181/181 - 2s - loss: -8.9694e-0

In [29]:
LR_START = 0.01
BATCH_SIZE = 256
reg1 = 8e-4
reg2 = 2e-4
model = fit(X_train_svd,y_train)

Epoch 1/1000
181/181 - 2s - loss: -7.8600e-01 - negative_correlation_loss: -8.5516e-01 - val_loss: -8.6023e-01 - val_negative_correlation_loss: -8.7712e-01 - lr: 0.0100 - 2s/epoch - 13ms/step
Epoch 2/1000
181/181 - 2s - loss: -8.7947e-01 - negative_correlation_loss: -8.9287e-01 - val_loss: -8.6505e-01 - val_negative_correlation_loss: -8.7628e-01 - lr: 0.0100 - 2s/epoch - 9ms/step
Epoch 3/1000
181/181 - 2s - loss: -8.8318e-01 - negative_correlation_loss: -8.9377e-01 - val_loss: -8.6842e-01 - val_negative_correlation_loss: -8.7823e-01 - lr: 0.0100 - 2s/epoch - 9ms/step
Epoch 4/1000
181/181 - 2s - loss: -8.8460e-01 - negative_correlation_loss: -8.9429e-01 - val_loss: -8.6859e-01 - val_negative_correlation_loss: -8.7817e-01 - lr: 0.0100 - 2s/epoch - 9ms/step
Epoch 5/1000
181/181 - 2s - loss: -8.8494e-01 - negative_correlation_loss: -8.9434e-01 - val_loss: -8.6961e-01 - val_negative_correlation_loss: -8.7886e-01 - lr: 0.0100 - 2s/epoch - 9ms/step
Epoch 6/1000
181/181 - 2s - loss: -8.8580e-0

In [30]:
%pwd

'/home/jovyan/kaggle/working'

In [31]:
%cd magic_data
%ls

/home/jovyan/kaggle/working/magic_data
magic_model.sav  X_test_64.csv  X_train_256.csv  X_train_64.csv


In [32]:
%rm magic_model.sav

In [33]:
%cd ..
%mkdir pca64
%cd pca64
np.savetxt("X_train_64.csv", X_train_svd, delimiter=",")
np.savetxt("X_test_64.csv", X_test_svd, delimiter=",")

/home/jovyan/kaggle/working
/home/jovyan/kaggle/working/pca64


In [34]:
%cd ..
%pwd

/home/jovyan/kaggle/working


'/home/jovyan/kaggle/working'

def unique_row_view(data):
    b = np.ascontiguousarray(data).view(
        np.dtype((np.void, data.dtype.itemsize * data.shape[1]))
    )
    u = np.unique(b).view(data.dtype).reshape(-1, data.shape[1])
    return u
abc = np.array([[1,2,3],[2,3,4],[1,2,3]])
unique_row_view(abc)

In [38]:
abc = np.array([[0,2,3],[0,3,4],[0,4,5]])
scprep.filter.remove_empty_genes(abc)

array([[2, 3],
       [3, 4],
       [4, 5]])

In [39]:
%pwd

'/home/jovyan/kaggle/working'

In [45]:
def unique_row_view(data):
    b = np.ascontiguousarray(data).view(
        np.dtype((np.void, data.dtype.itemsize * data.shape[1]))
    )
    u = np.unique(b).view(data.dtype).reshape(-1, data.shape[1])
    return u

train = scipy.sparse.load_npz("./sparse/train_multi_inputs_values.sparse.npz")
test = scipy.sparse.load_npz("./sparse/test_multi_inputs_values.sparse.npz")
both = scipy.sparse.vstack([train, test])
both = scprep.filter.remove_empty_genes(both)

In [54]:
both.getformat()

'csr'

In [None]:
import scipy.sparse as sp

def sp_unique(sp_matrix, axis=0):
    ''' Returns a sparse matrix with the unique rows (axis=0)
    or columns (axis=1) of an input sparse matrix sp_matrix'''
    if axis == 1:
        sp_matrix = sp_matrix.T

    old_format = sp_matrix.getformat()
    dt = np.dtype(sp_matrix)
    ncols = sp_matrix.shape[1]

    if old_format != 'lil':
        sp_matrix = sp_matrix.tolil()

    _, ind = np.unique(sp_matrix.data + sp_matrix.rows, return_index=True)
    rows = sp_matrix.rows[ind]
    data = sp_matrix.data[ind]
    nrows_uniq = data.shape[0]

    sp_matrix = sp.lil_matrix((nrows_uniq, ncols), dtype=dt)  #  or sp_matrix.resize(nrows_uniq, ncols)
    sp_matrix.data = data
    sp_matrix.rows = rows

    ret = sp_matrix.asformat(old_format)
    if axis == 1:
        ret = ret.T        
    return ret

both = sp_unique(both)

In [None]:
pca_train = TruncatedSVD(n_components=128, random_state=1)
both = pca_train.fit_transform(both)

%mkdir multi_pca
%cd multi_pca

np.savetxt("both_64.csv", both, delimiter=",")

In [6]:
def remove_duplicate_rows(data):
    unique_row_indices, unique_columns = [], []
    for row_idx, row in enumerate(data):
        indices = row.indices.tolist()
        if indices not in unique_columns:
            unique_columns.append(indices)
            unique_row_indices.append(row_idx)
    return data[unique_row_indices]
remove_duplicate_rows(train)

<105942x228942 sparse matrix of type '<class 'numpy.float32'>'
	with 607301546 stored elements in Compressed Sparse Row format>

In [7]:
train

<105942x228942 sparse matrix of type '<class 'numpy.float32'>'
	with 607301546 stored elements in Compressed Sparse Row format>

In [5]:
train = scipy.sparse.load_npz("./sparse/train_multi_inputs_values.sparse.npz")

In [8]:
train = scipy.sparse.load_npz("./sparse/train_multi_targets_values.sparse.npz")
def remove_duplicate_rows(data):
    unique_row_indices, unique_columns = [], []
    for row_idx, row in enumerate(data):
        indices = row.indices.tolist()
        if indices not in unique_columns:
            unique_columns.append(indices)
            unique_row_indices.append(row_idx)
    return data[unique_row_indices]
remove_duplicate_rows(train)

<105942x23418 sparse matrix of type '<class 'numpy.float32'>'
	with 407024875 stored elements in Compressed Sparse Row format>

In [9]:
train

<105942x23418 sparse matrix of type '<class 'numpy.float32'>'
	with 407024875 stored elements in Compressed Sparse Row format>

In [10]:
%cd /home/jovyan/kaggle/working
import scprep

train = scipy.sparse.load_npz("./sparse/train_multi_inputs_values.sparse.npz")
test = scipy.sparse.load_npz("./sparse/test_multi_inputs_values.sparse.npz")
both = scipy.sparse.vstack([train, test])
both = scprep.filter.remove_empty_genes(both)
pca_train = TruncatedSVD(n_components=128, random_state=1)
both = pca_train.fit_transform(both)

%mkdir multi_pca
%cd multi_pca

np.savetxt("inputs_both_128.csv", both, delimiter=",")

/home/jovyan/kaggle/working
/home/jovyan/kaggle/working/multi_pca


FileNotFoundError: [Errno 2] No such file or directory: './sparse/train_multi_targets_values.sparse.npz'

In [13]:
%ls ./sparse/

drop_ids.csv                         train_cite_inputs_idxcol.npz
evaluation.parquet                   train_cite_inputs_values.sparse.npz
metadata.parquet                     train_cite_targets_idxcol.npz
sample_submission.parquet            train_cite_targets_values.sparse.npz
test_cite_inputs_idxcol.npz          train_multi_inputs_idxcol.npz
test_cite_inputs_values.sparse.npz   train_multi_inputs_values.sparse.npz
test_multi_inputs_idxcol.npz         train_multi_targets_idxcol.npz
test_multi_inputs_values.sparse.npz  train_multi_targets_values.sparse.npz


In [15]:
%cd /home/jovyan/kaggle/working

targets = scipy.sparse.load_npz("./sparse/train_multi_targets_values.sparse.npz")
targets = scprep.filter.remove_empty_genes(targets)
pca_train = TruncatedSVD(n_components=128, random_state=1)
targets = pca_train.fit_transform(targets)

%cd multi_pca

np.savetxt("targets_128.csv", targets, delimiter=",")

/home/jovyan/kaggle/working
/home/jovyan/kaggle/working/multi_pca


In [2]:
%cd /home/jovyan/kaggle/working
%cd multi_pca
%ls

/home/jovyan/kaggle/working
/home/jovyan/kaggle/working/multi_pca
inputs_both_128.csv  targets_128.csv
