In [40]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso


class Config:
    uc_path = 'dataset/GSE114374_Human_UC_expression_matrix.txt'
    hc_path = 'dataset/GSE114374_Human_HC_expression_matrix.txt'


def gt_genes():
    return ['IL23R', 'NOD2', 'TNF', 'IL1B', 'IL10', 'PTPN2', 'IRF5', 'ABCB1', 'IL6', 'HLA-DRB1']

        

def load_dataset(args, size=None):
    """
        uc, hc  : pd.DataFrame (samples x genes)
        i2g     : index to gene name
        g2i     : gene name to index
    """
    uc = pd.read_csv(args.uc_path, sep='\t').transpose()
    hc = pd.read_csv(args.hc_path, sep='\t').transpose()
    if size is not None:
        uc = uc.iloc[:size]
        hc = hc.iloc[:size]
    i2g = uc.columns
    g2i = {i2g[k]: k for k in range(len(i2g))}
    return uc, hc, i2g, g2i


def baseline_method(hc, uc):
    uc_stat = uc.mean(axis=0)
    hc_stat = hc.mean(axis=0)
    mean = np.abs((uc_stat - hc_stat).to_numpy())
    targets = mean
    return targets

def lasso_method(hc, uc, alpha=0.1):
    X = np.concatenate([uc.to_numpy(), hc.to_numpy()], axis=0)
    Y = np.concatenate([np.ones((uc.shape[0],)), np.zeros((hc.shape[0],))], axis=0)
    lasso = Lasso(alpha=alpha)
    lasso.fit(X,Y)
    return np.abs(lasso.coef_)


In [18]:
uc, hc, i2g, g2i = load_dataset(Config(), size=None)

In [37]:
def iou_score(pred, gt=None):
    if gt is None:
        gt = [g2i[i] for i in gt_genes()]
    s_pred = set(pred)
    s_gt = set(gt)
    return len(s_pred & s_gt) / len(s_pred | s_gt)


def rank(pred, gt=None):
    if gt is None:
        gt = [g2i[i] for i in gt_genes()]
    rank = len(pred) - np.argsort(np.argsort(pred))
    return rank[gt]   

def gt_stat():
    uc_stat = uc.describe().iloc[:,[g2i[i] for i in gt_genes()]]
    hc_stat = hc.describe().iloc[:,[g2i[i] for i in gt_genes()]]
    return uc_stat, hc_stat

In [41]:
lasso = lasso_method(hc, uc, alpha=0.1)
mean = baseline_method(hc, uc)

In [42]:
rank(lasso, [g2i[i] for i in gt_genes()])

array([12317, 19221,   735, 15499,  8617, 18898,  7257,  1325,  1878,
         546])

In [43]:
rank(mean, [g2i[i] for i in gt_genes()])

array([22102, 18904, 23091,  8641, 10579,  4930, 14653,  2093,   227,
          18])

In [48]:
len(g2i)

33694

In [49]:
import torch
import torch.nn as nn
import torch.nn.functional as F
class LassoClassification(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.Linear(33694, 1, bias=False)
        
    def forward(self, x):
        " x: [N, 33694] "
        x_norm = (x - x.mean(dim=0, keepdim=True)) / x.std(dim=0, keepdim=True)
        pred = F.sigmoid(self.fc(x_norm))
        return pred.flatten(), self.fc.weight

net = LassoClassification()
