In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting requests>=2.19.0 (from datasets)
  Using cached requests-2.31.0-py3-none-any.whl (62 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m

In [3]:
import sys
import os
import pandas as pd
import numpy as np
sys.path.append('/root/barcode/')
sys.path.append('../')
from BarcodeScanner import tree_and_clustering, base_barcode
from itertools import product, combinations
from sklearn.linear_model import LinearRegression
import timeit
from datasets import Dataset

def gen_X(num_var: int, sample_size : int):
    data_dictionary = {}
    for i in range(num_var):
        var_name = "x" + f"{i + 1}"
        data_dictionary[var_name] = list(np.random.binomial(1, .5, sample_size))
    return pd.DataFrame(data_dictionary)

def gen_full_X(num_var: int, sample_size :int):
    raw_X = gen_X(num_var = num_var, sample_size = sample_size)
    colnames = raw_X.columns
    for k in range(2, len(colnames)+ 1):
        interaction_generator = combinations(colnames, k)
        for interaction_tuple in interaction_generator:
            new_colname = "*".join(interaction_tuple)
            raw_X[new_colname] = raw_X[list(interaction_tuple)].apply(np.prod, axis = 1)
    return raw_X

def gen_barcode_dataloader(num_var:int, sample_size:int):
    raw_X = gen_X(num_var = num_var, sample_size = sample_size)
    colnames = [
        f"x{i+1}" for i in range(num_var)
    ]
    from datasets import Dataset

    dataset = Dataset.from_pandas(raw_X)
    def gen_z(examples):
        example_list = [examples[x] for x in colnames]
        df = pd.DataFrame(zip(*example_list), columns= colnames)
        barcodes = base_barcode.gen_barcode(df).reshape(-1).tolist()
        y = df.apply(lambda seq: 1 + seq.x1 + seq.x2 + seq.x1*seq.x3 + np.random.normal(), axis = 1).tolist()
        return {"z": barcodes, "y":y}
    dataset = dataset.map(gen_z, batched = True, remove_columns=colnames)
    return dataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from itertools import product

def L(p):
    all_sets = list(set(product([0,1], repeat = p))); all_sets.sort()
    return np.array([base_barcode.barcode_to_beta(x) for x in all_sets]).astype(np.int8)


### Experiment 1: Memory consumption on entire pipeline

#### linear regression

In [86]:
p, n = 5, 1_000_000

In [87]:
X =gen_full_X(p, n)

In [88]:
X['y'] = np.random.normal(size = n)

In [89]:
X.to_csv('sample_dataset.csv', index = False)

In [90]:
file_size = os.path.getsize('sample_dataset.csv')
file_size/1024/1024

77.84959983825684

In [95]:
pd.read_csv('sample_dataset.csv'
).__sizeof__()/1024/1024

244.1407470703125

In [7]:
y = np.random.normal(size = n)

In [8]:
from sklearn.linear_model import LinearRegression

In [9]:
reg = LinearRegression()

In [10]:
reg.fit(X,y)

#### neural network

In [11]:
p, n = 5, 5000

In [12]:
input_dataset = gen_barcode_dataloader(p, n)

Map: 100%|██████████| 5000/5000 [00:00<00:00, 24446.69 examples/s]


In [13]:
input_dataset = input_dataset.train_test_split(test_size = .2)

In [14]:
from torch.utils.data import Dataset, DataLoader

train_dataloader = DataLoader(input_dataset['train'], batch_size=126, shuffle=True)
test_dataloader = DataLoader(input_dataset['test'], batch_size=126, shuffle=True)

In [15]:
import torch
from torch import nn

class Lasso_Barcode(nn.Module):
    def __init__(self, num_variable):
        super().__init__()
        barcode_size = 2**num_variable
        embedding_weights = torch.from_numpy(L(num_variable).astype(float))
        self.embedding = nn.Embedding(barcode_size, barcode_size)
        self.embedding.weight = nn.Parameter(embedding_weights.to(torch.float32), requires_grad = False)
        self.linear = nn.Linear(barcode_size, 1, bias = False, dtype = torch.float32)

    def l1_reg(self):
        return torch.abs(self.linear.weight).sum()

    def forward(self, x):
        x = self.embedding(x)
        x = self.linear(x)
        return x, self.l1_reg()

In [16]:
lasso = Lasso_Barcode(p)

In [17]:
import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.Adam(lasso.parameters(), lr=0.001)


In [18]:
alpha = 0.3

In [30]:
num_epochs = 10
val_loss = np.inf
for epoch in range(num_epochs):
    # Forward pass
    lasso.train()
    for batch in train_dataloader:
        input_tensor = batch['z']
        output_tensor = batch['y'].to(torch.float32)
        output_tensor = torch.reshape(output_tensor, (-1, ))
#         optimizer.zero_grad()
        assert input_tensor.size() == output_tensor.size()
    

        outputs, l1_reg = lasso(input_tensor)
        # loss = criterion(outputs, output_feature)
        
        loss = criterion(outputs, output_tensor) + alpha * l1_reg  # Total loss with L1 regularization
        loss.backward()
        optimizer.step()
        
    if epoch % 10 == 0:
        lasso.eval()
        losses = []
        for batch in test_dataloader:
            input_tensor = batch['z']
            output_tensor = batch['y'].to(torch.float32)
            output_tensor = torch.reshape(output_tensor, (-1, ))
            assert input_tensor.size() == output_tensor.size()
            outputs, l1_reg = lasso(input_tensor)
            loss = criterion(outputs, output_tensor) + alpha * l1_reg  # Total loss with L1 regularization
            losses.append(loss.item())
        current_val_loss = np.mean(losses)
        if val_loss > current_val_loss:
            val_loss = current_val_loss
        else:
            break
        


In [31]:
lasso.linear.weight

Parameter containing:
tensor([[ 1.1231,  0.6960,  0.7544,  0.6514,  0.7150,  0.7488, -0.3390, -0.3929,
         -0.3170, -0.4057, -0.2643,  0.0650, -0.4015, -0.1308, -0.2917,  0.0311,
          0.1803,  0.1104, -0.1903,  0.1278, -0.0584, -0.0703, -0.2474,  0.0523,
         -0.2329, -0.1193,  0.0064, -0.0453,  0.0490, -0.1698, -0.1807,  0.0346]],
       requires_grad=True)

This time, please use the "from memory_profiler import profile" and profile decorator and run the python script and manually check the result


In [85]:
!python lasso_memory_simulation_file.py -p 5 -n 1_000_000

Map (num_proc=10): 100%|███| 1000000/1000000 [00:07<00:00, 134004.44 examples/s]
100%|███████████████████████████████████████████| 10/10 [04:32<00:00, 27.29s/it]
Filename: /root/barcode/chapter_2_simulation/lasso_memory_simulation_file.py

Line #    Mem usage    Increment  Occurrences   Line Contents
   123    627.9 MiB    627.9 MiB           1   @profile
   124                                         def pipeline(p, n, input_dataset):
   125    627.9 MiB      0.0 MiB           1       device = torch.device('cuda')
   126    647.5 MiB     19.6 MiB           1       input_dataset = input_dataset.train_test_split(test_size = .2)
   127                                         
   128    647.5 MiB      0.0 MiB           1       train_dataloader = DataLoader(input_dataset['train'], batch_size=2**13, shuffle=True)
   129    647.5 MiB      0.0 MiB           1       test_dataloader = DataLoader(input_dataset['test'], batch_size=2**14, shuffle=True)
   130                                       

In [96]:
!python linear_model_simulation_file.py -p 5 -n 1_000_000

Filename: /root/barcode/chapter_2_simulation/linear_model_simulation_file.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    46    247.0 MiB    247.0 MiB           1   @profile
    47                                         def pipeline(p, n):
    48    715.4 MiB    468.4 MiB           1       df = pd.read_csv('sample_dataset.csv')
    49    952.3 MiB    236.9 MiB           1       X = df.loc[:, df.columns.str.contains('x')].to_numpy()
    50    952.3 MiB      0.0 MiB           1       y = df.y.to_numpy().reshape(-1,1)
    51    952.3 MiB      0.0 MiB           1       reg = LinearRegression()
    52    967.1 MiB     14.8 MiB           1       reg.fit(X, y)




### Experiment 2: Speed

Choose the proper early stopping criteria for neural network