In [1]:
import sys
import os
import pandas as pd
import numpy as np
# sys.path.append('/root/barcode/')
sys.path.append('../')
from BarcodeScanner import tree_and_clustering, base_barcode
from itertools import product, combinations
from sklearn.linear_model import LinearRegression
import timeit
from datasets import Dataset

def gen_X(num_var: int, sample_size : int):
    data_dictionary = {}
    for i in range(num_var):
        var_name = "x" + f"{i + 1}"
        data_dictionary[var_name] = list(np.random.binomial(1, .5, sample_size))
    return pd.DataFrame(data_dictionary)

def gen_full_X(num_var: int, sample_size :int):
    raw_X = gen_X(num_var = num_var, sample_size = sample_size)
    colnames = raw_X.columns
    for k in range(2, len(colnames)+ 1):
        interaction_generator = combinations(colnames, k)
        for interaction_tuple in interaction_generator:
            new_colname = "*".join(interaction_tuple)
            raw_X[new_colname] = raw_X[list(interaction_tuple)].apply(np.prod, axis = 1)
    return raw_X

def gen_barcode_dataloader(num_var:int, sample_size:int):
    raw_X = gen_X(num_var = num_var, sample_size = sample_size)
    colnames = [
        f"x{i+1}" for i in range(num_var)
    ]
    from datasets import Dataset

    dataset = Dataset.from_pandas(raw_X)
    def gen_z(examples):
        example_list = [examples[x] for x in colnames]
        df = pd.DataFrame(zip(*example_list), columns= colnames)
        barcodes = base_barcode.gen_barcode(df).reshape(-1).tolist()
        y = df.apply(lambda seq: 1 + seq.x1 + seq.x2 + seq.x1*seq.x3 + np.random.normal(), axis = 1).tolist()
        return {"z": barcodes, "y":y}
    dataset = dataset.map(gen_z, batched = True, remove_columns=colnames)
    return dataset

In [2]:
from itertools import product

def L(p):
    all_sets = list(set(product([0,1], repeat = p))); all_sets.sort()
    return np.array([base_barcode.barcode_to_beta(x) for x in all_sets]).astype(np.int8)


### Experiment 1: Memory consumption on entire pipeline

#### linear regression

In [4]:
p, n = 5, 5000

In [5]:
X =gen_full_X(p, n)

In [6]:
y = np.random.normal(size = n)

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
reg = LinearRegression()

In [9]:
reg.fit(X,y)

#### neural network

In [3]:
p, n = 5, 5000

In [4]:
input_dataset = gen_barcode_dataloader(p, n)

  0%|          | 0/5 [00:00<?, ?ba/s]

In [5]:
input_dataset = input_dataset.train_test_split(test_size = .2)

In [6]:
from torch.utils.data import Dataset, DataLoader

train_dataloader = DataLoader(input_dataset['train'], batch_size=126, shuffle=True)
test_dataloader = DataLoader(input_dataset['test'], batch_size=126, shuffle=True)

In [7]:
import torch
from torch import nn

class Lasso_Barcode(nn.Module):
    def __init__(self, num_variable):
        super().__init__()
        barcode_size = 2**num_variable
        embedding_weights = torch.from_numpy(L(num_variable).astype(float))
        self.embedding = nn.Embedding(barcode_size, barcode_size)
        self.embedding.weight = nn.Parameter(embedding_weights.to(torch.float32), requires_grad = False)
        self.linear = nn.Linear(barcode_size, 1, bias = False, dtype = torch.float32)

    def l1_reg(self):
        return torch.abs(self.linear.weight).sum()

    def forward(self, x):
        x = self.embedding(x)
        x = self.linear(x)
        return x, self.l1_reg()

In [8]:
lasso = Lasso_Barcode(p)

In [9]:
import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.Adam(lasso.parameters(), lr=0.001)


In [10]:
alpha = 0.3

In [21]:
num_epochs = 10
val_loss = np.inf
for epoch in range(num_epochs):
    # Forward pass
    lasso.train()
    for batch in train_dataloader:
        input_tensor = batch['z']
        output_tensor = batch['y'].to(torch.float32)
#         optimizer.zero_grad()
        assert input_tensor.size() == output_tensor.size()
    

        outputs, l1_reg = lasso(input_tensor)
        # loss = criterion(outputs, output_feature)
        
        loss = criterion(outputs, output_tensor) + alpha * l1_reg  # Total loss with L1 regularization
        loss.backward()
        optimizer.step()
        
    if epoch % 10 == 0:
        lasso.eval()
        losses = []
        for batch in test_dataloader:
            input_tensor = batch['z']
            output_tensor = batch['y'].to(torch.float32)
            assert input_tensor.size() == output_tensor.size()
            outputs, l1_reg = lasso(input_tensor)
            loss = criterion(outputs, output_tensor) + alpha * l1_reg  # Total loss with L1 regularization
            losses.append(loss.item())
        current_val_loss = np.mean(losses)
        if val_loss > current_val_loss:
            val_loss = current_val_loss
        else:
            break
        


In [22]:
lasso.linear.weight

Parameter containing:
tensor([[ 2.0701e+00, -1.8711e-01,  2.9036e-01,  1.8416e-01,  1.8360e-01,
          3.2203e-01,  5.1909e-02, -5.3827e-02,  2.6595e-01, -7.7950e-02,
         -3.6562e-02,  2.6297e-01, -1.9794e-03, -9.1933e-02, -4.0277e-01,
          4.0053e-01,  1.5431e-01, -1.6806e-01, -1.5014e-01, -1.3885e-01,
          1.1912e-01,  4.4234e-01, -1.5886e-01,  4.8904e-01, -3.0276e-01,
         -2.8123e-03,  8.0175e-02, -9.3240e-02, -3.1572e-01, -1.1820e-01,
         -1.2660e-01,  3.0994e-01]], requires_grad=True)

This time, please use the "from memory_profiler import profile" and profile decorator and run the python script and manually check the result


### Experiment 2: Speed

Choose the proper early stopping criteria for neural network