In [2]:
import numpy as np
import pandas as pd

In [9]:
from google.cloud import storage
import os

def download_blob(bucket_name, source_blob_name, destination_file_name, client=None):
    """Download a single blob from a GCS bucket to a local file.

    Args:
        bucket_name: Name of the bucket that holds the blob.
        source_blob_name: Path of the blob inside the bucket.
        destination_file_name: Local path the blob contents are written to.
        client: Optional, already-constructed storage client. When omitted a
            new ``storage.Client()`` is created for this call (the previous
            behaviour). Passing one in lets a loop over many blobs reuse a
            single client instead of building one per file.

    Prints a confirmation line once the download call has returned.
    """
    # Only build a client when the caller did not supply one.
    storage_client = client if client is not None else storage.Client()
    blob = storage_client.bucket(bucket_name).blob(source_blob_name)

    blob.download_to_filename(destination_file_name)

    print(f"Downloaded {source_blob_name} to {destination_file_name}.")

# Example usage
bucket_name = "leo_melon_simple"

# Seven user_song_dt shards (numbered 0-6 with a 12-digit zero-padded suffix)
# plus the single song_normalized_data file. Replace with your actual file paths.
blobs_to_download = [
    f"exp0901/data/sample100/user_song_dt_{shard:012d}.csv" for shard in range(7)
] + ["exp0901/data/sample100/song_normalized_data_000000000000.csv"]
destination_folder = "../data/exp0901/sample100/raw"  # Replace with the local folder path

# Ensure the destination folder exists. exist_ok=True makes this a no-op when
# the folder is already there, so the cell can be re-run without a separate
# existence check.
os.makedirs(destination_folder, exist_ok=True)

for blob_name in blobs_to_download:
    # Create the full local path for each file (folder + the blob's base name)
    destination_file_path = os.path.join(destination_folder, os.path.basename(blob_name))
    # Download the file from GCS
    download_blob(bucket_name, blob_name, destination_file_path)


Downloaded exp0901/data/sample100/user_song_dt_000000000000.csv to ../data/exp0901/sample100/raw/user_song_dt_000000000000.csv.
Downloaded exp0901/data/sample100/user_song_dt_000000000001.csv to ../data/exp0901/sample100/raw/user_song_dt_000000000001.csv.
Downloaded exp0901/data/sample100/user_song_dt_000000000002.csv to ../data/exp0901/sample100/raw/user_song_dt_000000000002.csv.
Downloaded exp0901/data/sample100/user_song_dt_000000000003.csv to ../data/exp0901/sample100/raw/user_song_dt_000000000003.csv.
Downloaded exp0901/data/sample100/user_song_dt_000000000004.csv to ../data/exp0901/sample100/raw/user_song_dt_000000000004.csv.
Downloaded exp0901/data/sample100/user_song_dt_000000000005.csv to ../data/exp0901/sample100/raw/user_song_dt_000000000005.csv.
Downloaded exp0901/data/sample100/user_song_dt_000000000006.csv to ../data/exp0901/sample100/raw/user_song_dt_000000000006.csv.
Downloaded exp0901/data/sample100/song_normalized_data_000000000000.csv to ../data/exp0901/sample100/raw

In [7]:
import pandas as pd

# Local folder that holds the raw CSV files; this is the same path the
# download cell uses as its destination_folder.
fd = "../data/exp0901/sample100/raw"

In [10]:
# Paths of everything in the raw-data folder. os.listdir returns entries in
# arbitrary order, so sort them to make the shard order (and therefore the row
# order of anything concatenated from these files) the same on every run.
files = sorted(f"{fd}/{name}" for name in os.listdir(fd))

In [14]:
# Stack every user_song_dt shard into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it each shard keeps its own row labels
# and the combined frame ends up with duplicate index values.
usd_df = pd.concat(
    [pd.read_csv(f, header=0) for f in files if "user_song_dt" in f],
    ignore_index=True,
)

In [22]:
# Shape of the array of distinct user_id values, i.e. (number of unique users,)
np.unique(usd_df["user_id"]).shape

(40027,)

In [17]:
# Read the song_normalized_data CSV from the raw-data folder.
snd_df = pd.read_csv(
    "/".join([fd, "song_normalized_data_000000000000.csv"])
)

In [20]:
# Display the normalized_prev_number_of_plays column.
snd_df["normalized_prev_number_of_plays"]

0         4.997058e-04
1         2.785971e-04
2         3.494983e-04
3         3.385541e-05
4         4.209286e-05
              ...     
338750    6.377706e-07
338751    1.354216e-06
338752    9.308733e-06
338753    1.728765e-05
338754    1.913312e-06
Name: normalized_prev_number_of_plays, Length: 338755, dtype: float64

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class UserLogisticRegression(nn.Module):
    """Per-user linear model that returns one logit per (user, features) row.

    Each user owns one row of an embedding table; that row is used as the
    coefficient vector and dotted with the input features. A constant offset
    is then added. No sigmoid is applied: the output is the raw logit.

    Args:
        num_users: Number of rows in the coefficient table.
        num_features: Number of coefficients per user. Defaults to 2, the
            width that was previously hard-coded.
        fixed_constant: Constant added to every logit. Defaults to 0.5, the
            value that was previously hard-coded. It is stored as a plain
            attribute, not as an ``nn.Parameter``.
    """

    def __init__(self, num_users, num_features=2, fixed_constant=0.5):
        super().__init__()
        # Embedding layer to store coefficients for each user_id
        self.user_coefficients = nn.Embedding(num_users, num_features)
        # Fixed constant added to every logit
        self.fixed_constant = fixed_constant

    def forward(self, user_id, x):
        """
        user_id: Tensor of shape (batch_size,) with user IDs
        x: Tensor of shape (batch_size, num_features) with input features

        Returns a tensor of shape (batch_size,) holding the logits.
        """
        # Get the coefficients for the given user_ids
        user_coeffs = self.user_coefficients(user_id)

        # Dot product of user-specific coefficients and input features.
        # Summing over the last axis is the same as dim=1 for the documented
        # 2-D inputs, and keeps the reduction on the feature axis if extra
        # leading batch dimensions are present.
        logits = torch.sum(user_coeffs * x, dim=-1) + self.fixed_constant

        return logits

# Example usage: a model with one coefficient row for each of 1000 user ids
num_users = 1000
model = UserLogisticRegression(num_users)

# A toy batch: user ids 0, 1, 2 and one two-element feature vector per user
user_ids = torch.arange(3)
features = torch.tensor(
    [
        [1.0, 2.0],
        [2.0, 3.0],
        [3.0, 4.0],
    ]
)

# Run the batch through the model and print the resulting logits
logits = model(user_ids, features)
print(logits)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

class UserSongDataset(Dataset):
    """Dataset of the given positive (user, song) rows plus sampled negatives.

    For every input row, ``num_negatives`` extra rows are appended with the
    same user, a randomly drawn song the user does not appear with in
    ``user_song_data``, and label 0. All rows are generated once, in
    ``__init__``, and stored in ``self.dataset``.
    """

    def __init__(self, user_song_data, all_songs, num_negatives=1, seed=None):
        """
        user_song_data: List of tuples (user_id, song_id, x1, x2, y) where y = 1 for positive samples
        all_songs: List or set of all possible song_ids
        num_negatives: Number of negative samples to generate per positive sample
        seed: Optional seed for the negative sampling. When None (default) the
            global ``np.random`` state is used, so ``np.random.seed`` still
            controls the draw; when given, a private ``RandomState(seed)`` is used.

        Raises ValueError when negatives are requested but some user already
        appears with every song in ``all_songs`` (no negative can be drawn).
        """
        self.user_song_data = user_song_data
        self.all_songs = list(all_songs)
        self.num_negatives = num_negatives
        self.seed = seed

        # Array form of the catalogue, built once so that each draw is a plain
        # index lookup instead of converting the whole list on every sample.
        self._song_array = np.asarray(self.all_songs)

        # Create a dictionary of user_id to set of song_ids for fast lookup
        self.user_song_dict = {}
        for user_id, song_id, _, _, _ in user_song_data:
            self.user_song_dict.setdefault(user_id, set()).add(song_id)

        # Fail fast instead of spinning forever in the rejection-sampling loop.
        if num_negatives > 0:
            self._check_negatives_exist()

        # Precompute the dataset including negative samples
        self.dataset = self._generate_dataset()

    def _check_negatives_exist(self):
        """Raise ValueError if any user has no song left to use as a negative."""
        catalogue = set(self.all_songs)
        for user_id, played in self.user_song_dict.items():
            # A user with fewer songs than the catalogue must be missing at
            # least one of them, so the subset test is only needed otherwise.
            if len(played) >= len(catalogue) and catalogue <= played:
                raise ValueError(
                    f"Cannot draw negative samples for user {user_id!r}: "
                    "every song in all_songs is already a positive for this user."
                )

    def _generate_dataset(self):
        """Return the list of positive rows, each followed by its negative rows."""
        # The generator is created locally (not stored on self) so the dataset
        # object holds no module/RNG reference.
        rng = np.random if self.seed is None else np.random.RandomState(self.seed)
        n_songs = len(self._song_array)

        data = []
        for user_id, song_id, x1, x2, y in self.user_song_data:
            # Add the positive sample
            data.append((user_id, song_id, x1, x2, y))

            played = self.user_song_dict[user_id]
            # Generate negative samples by redrawing until the song is unseen
            for _ in range(self.num_negatives):
                negative_song_id = self._song_array[rng.randint(n_songs)]
                while negative_song_id in played:
                    negative_song_id = self._song_array[rng.randint(n_songs)]
                # NOTE(review): the negative row reuses x1/x2 from the positive
                # row. If those features depend on the song, they do not belong
                # to negative_song_id - confirm this is intended.
                data.append((user_id, negative_song_id, x1, x2, 0))  # y=0 for negative sample

        return data

    def __len__(self):
        """Number of rows, positives and negatives together."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return (user_id, song_id, features, label) tensors for row ``idx``.

        ``features`` is a float32 tensor built from ``[x1, x2]``.
        """
        user_id, song_id, x1, x2, y = self.dataset[idx]
        return torch.tensor(user_id), torch.tensor(song_id), torch.tensor([x1, x2], dtype=torch.float32), torch.tensor(y)

# Example usage
# Positive interactions, one (user_id, song_id, x1, x2, y) tuple per row
user_song_data = [
    (0, 101, 0.5, 0.3, 1),
    (0, 102, 0.6, 0.4, 1),
    (1, 101, 0.7, 0.2, 1),
]

# The full set of song ids that negatives may be drawn from
all_songs = {101, 102, 103, 104}

# Build the dataset with 2 negatives per positive, then wrap it in a
# shuffling DataLoader that yields batches of two rows
dataset = UserSongDataset(user_song_data, all_songs, num_negatives=2)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Walk the batches and print each field under its heading
for batch in dataloader:
    user_ids, song_ids, features, labels = batch
    for title, values in (
        ("User IDs:", user_ids),
        ("Song IDs:", song_ids),
        ("Features:", features),
        ("Labels:", labels),
    ):
        print(title, values)
    print()
