In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

class DataPreprocessor:
    """Encode, scale, split and persist the features of a music-plays dataset.

    The instance owns one fitted transformer per encoded column (three
    ``LabelEncoder``s, two ``TfidfVectorizer``s and one ``StandardScaler``) so
    that the exact same transformations can be saved to disk and re-applied
    later.
    """

    # Columns of the input frame that are carried over into the encoded frame.
    _BASE_COLUMNS = [
        'user_id_encoded', 'music_id_encoded', 'age', 'gender_encoded', 'duration', 'acousticness',
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'instrumentalness',
        'liveness', 'valence', 'tempo', 'time_signature', 'explicit', 'plays',
    ]

    # Columns standardised by ``feature_engineering``.
    _NUMERICAL_FEATURES = [
        'age', 'duration', 'acousticness', 'danceability', 'energy', 'key', 'loudness',
        'mode', 'speechiness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'time_signature', 'explicit',
    ]

    # Attribute names that are pickled to ``<directory>/<name>.pkl``.
    _ARTIFACT_NAMES = (
        'user_id_encoder',
        'music_id_encoder',
        'gender_encoder',
        'artist_tfidf_vectorizer',
        'genre_tfidf_vectorizer',
        'scaler',
    )

    def __init__(self, test_size=0.2, random_state=42):
        """
        Create unfitted transformers and store the split configuration.

        Args:
            test_size (float): Fraction of rows placed in the test split.
            random_state (int): Seed forwarded to ``train_test_split``.
        """
        self.test_size = test_size
        self.random_state = random_state
        self.user_id_encoder = LabelEncoder()
        self.music_id_encoder = LabelEncoder()
        self.gender_encoder = LabelEncoder()
        self.artist_tfidf_vectorizer = TfidfVectorizer()
        self.genre_tfidf_vectorizer = TfidfVectorizer()
        self.scaler = StandardScaler()

    def load_data(self, filepath):
        """
        Load data from a CSV file.

        Args:
            filepath (str): Path to the CSV file.

        Returns:
            pd.DataFrame: Loaded data.
        """
        return pd.read_csv(filepath)

    def encode_features(self, data, fit=True):
        """
        Encode categorical features using LabelEncoder and TF-IDF Vectorizer.

        Note that ``data`` is modified in place: the three ``*_encoded``
        columns are added to it.

        Args:
            data (pd.DataFrame): Data to encode. Must contain ``user_id``,
                ``music_id``, ``gender``, ``artist_name``, ``genre`` and the
                numeric columns listed in ``_BASE_COLUMNS``.
            fit (bool): When True (default) the transformers are fitted on
                ``data`` before transforming. Pass False to re-use already
                fitted (e.g. loaded) transformers.

        Returns:
            pd.DataFrame: Data with encoded features, indexed like ``data``.
        """
        def apply(transformer, column):
            if fit:
                return transformer.fit_transform(column)
            return transformer.transform(column)

        # Label Encoding
        data['user_id_encoded'] = apply(self.user_id_encoder, data['user_id'])
        data['music_id_encoded'] = apply(self.music_id_encoder, data['music_id'])
        data['gender_encoded'] = apply(self.gender_encoder, data['gender'])

        # TF-IDF Encoding for 'artist_name' and 'genre'
        artist_tfidf = apply(self.artist_tfidf_vectorizer, data['artist_name'])
        genre_tfidf = apply(self.genre_tfidf_vectorizer, data['genre'])

        # The TF-IDF frames must share ``data``'s index: ``pd.concat(axis=1)``
        # aligns on index labels, so a default RangeIndex here would misalign
        # rows (and introduce NaNs) whenever ``data`` was filtered or re-indexed.
        artist_tfidf_df = pd.DataFrame(
            artist_tfidf.toarray(),
            columns=[f'artist_tfidf_{i}' for i in range(artist_tfidf.shape[1])],
            index=data.index,
        )
        genre_tfidf_df = pd.DataFrame(
            genre_tfidf.toarray(),
            columns=[f'genre_tfidf_{i}' for i in range(genre_tfidf.shape[1])],
            index=data.index,
        )

        # Concatenate encoded DataFrames with the selected original columns
        return pd.concat([data[self._BASE_COLUMNS], artist_tfidf_df, genre_tfidf_df], axis=1)

    def feature_engineering(self, data_encoded, fit=True):
        """
        Perform feature scaling on numerical features.

        The scaled values overwrite the original columns of ``data_encoded``.

        Args:
            data_encoded (pd.DataFrame): Encoded data.
            fit (bool): When True (default) the scaler is fitted on
                ``data_encoded``; pass False to re-use a fitted scaler.

        Returns:
            pd.DataFrame: The same frame, with scaled numerical features.
        """
        columns = self._NUMERICAL_FEATURES
        if fit:
            scaled = self.scaler.fit_transform(data_encoded[columns])
        else:
            scaled = self.scaler.transform(data_encoded[columns])
        data_encoded[columns] = scaled
        return data_encoded

    def split_data(self, data_encoded, target_column='plays'):
        """
        Split data into training and testing sets.

        Args:
            data_encoded (pd.DataFrame): Encoded and scaled data.
            target_column (str): Name of the target column.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]: Train features, test features, train target, test target.
        """
        features = data_encoded.drop(columns=[target_column])
        target = data_encoded[target_column]
        return train_test_split(
            features,
            target,
            test_size=self.test_size,
            random_state=self.random_state,
        )

    def save_preprocessors(self, directory='models/'):
        """
        Save encoders and vectorizers to disk.

        One ``<name>.pkl`` file is written per transformer. The directory is
        created if it does not exist, and a trailing slash is optional.

        Args:
            directory (str): Directory where the models will be saved.
        """
        import os  # local import: this block must not depend on file-level imports

        if directory:
            os.makedirs(directory, exist_ok=True)
        for name in self._ARTIFACT_NAMES:
            with open(os.path.join(directory, f'{name}.pkl'), 'wb') as f:
                pickle.dump(getattr(self, name), f)

    def load_preprocessors(self, directory='models/'):
        """
        Load encoders and vectorizers from disk.

        Args:
            directory (str): Directory where the models are saved.
        """
        import os  # local import: this block must not depend on file-level imports

        for name in self._ARTIFACT_NAMES:
            with open(os.path.join(directory, f'{name}.pkl'), 'rb') as f:
                # SECURITY: unpickling executes arbitrary code; only load
                # files that were written by a trusted ``save_preprocessors``.
                setattr(self, name, pickle.load(f))

In [4]:
# model.py
import torch
import torch.nn as nn
import torch.nn.functional as F

class ListNetLoss(nn.Module):
    """ListNet-style listwise loss: cross-entropy between two softmax distributions.

    Each row of the inputs is treated as one list; both the predicted scores
    and the ground-truth scores are turned into probability distributions with
    a softmax over ``dim=1``.
    """

    def __init__(self, k=10):
        """
        Args:
            k (int): Stored on the instance for interface compatibility; the
                forward pass does not use it (the full list is always scored).
        """
        super().__init__()
        self.k = k

    def forward(self, y_pred, y_true):
        """
        Compute the mean cross-entropy over all lists in the batch.

        Args:
            y_pred (torch.Tensor): Predicted scores; softmax is taken over dim 1.
            y_true (torch.Tensor): Ground-truth scores, same shape as ``y_pred``.

        Returns:
            torch.Tensor: Scalar loss.
        """
        # log_softmax is computed in a numerically stable way; taking
        # log(softmax(x) + eps) instead caps the loss at -log(eps) and kills
        # the gradient for confidently wrong predictions.
        log_pred = F.log_softmax(y_pred, dim=1)
        target = F.softmax(y_true, dim=1)
        return -(target * log_pred).sum(dim=1).mean()

class HybridRecommender(nn.Module):
    """Score (user, item) pairs from learned ID embeddings plus side features.

    The user embedding, item embedding and the dense feature vector are
    concatenated and passed through an MLP of ``num_layers`` hidden blocks
    (Linear -> ReLU -> Dropout) followed by a single-output Linear layer.
    """

    def __init__(self, num_users, num_items, embedding_dim, num_features, num_layers, hidden_dims, dropout_prob):
        """
        Args:
            num_users (int): Size of the user embedding table.
            num_items (int): Size of the item embedding table.
            embedding_dim (int): Width of each embedding vector.
            num_features (int): Width of the dense feature vector.
            num_layers (int): Number of hidden blocks to build.
            hidden_dims (Sequence[int]): Hidden widths; the first
                ``num_layers`` entries are used.
            dropout_prob (float): Dropout probability after each hidden block.

        Raises:
            ValueError: If ``hidden_dims`` has fewer than ``num_layers`` entries.
        """
        super().__init__()
        if num_layers > len(hidden_dims):
            raise ValueError(
                f"num_layers={num_layers} requires at least that many hidden_dims, "
                f"got {len(hidden_dims)}"
            )

        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        layers = []
        input_dim = embedding_dim * 2 + num_features
        for hidden_dim in hidden_dims[:num_layers]:
            layers.append(nn.Linear(input_dim, hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(p=dropout_prob))
            input_dim = hidden_dim
        layers.append(nn.Linear(input_dim, 1))

        self.fc_layers = nn.Sequential(*layers)

    def forward(self, user_ids, item_ids, features):
        """
        Args:
            user_ids (torch.LongTensor): Encoded user indices, one per row.
            item_ids (torch.LongTensor): Encoded item indices, one per row.
            features (torch.Tensor): Dense features, concatenated along dim 1.

        Returns:
            torch.Tensor: One score per row, shape ``(batch,)``.
        """
        user_embedded = self.user_embedding(user_ids)
        item_embedded = self.item_embedding(item_ids)
        concat_features = torch.cat((user_embedded, item_embedded, features), dim=1)
        output = self.fc_layers(concat_features)
        # Drop only the trailing singleton dimension: a bare squeeze() would
        # also drop the batch dimension when the batch holds a single row.
        return output.squeeze(-1)

In [5]:
# train.py
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from preprocessing import DataPreprocessor
from tensorflow_docs.model import HybridRecommender, ListNetLoss
import pickle

def main():
    """Train the hybrid recommender end to end and report test NDCG@10.

    Steps: load and preprocess ``data/raw_data.csv``, persist the fitted
    preprocessors, train ``HybridRecommender`` with ``ListNetLoss`` for 30
    epochs, evaluate on the held-out split and save the weights to
    ``models/model.pth``.
    """
    import os  # local import keeps this entry point self-contained

    # Initialize the preprocessor
    preprocessor = DataPreprocessor(test_size=0.2, random_state=42)

    # Load and preprocess data
    data_filepath = 'data/raw_data.csv'  # Update with your actual data path
    data = preprocessor.load_data(data_filepath)
    data_encoded = preprocessor.encode_features(data)
    data_encoded = preprocessor.feature_engineering(data_encoded)
    train_df, test_df, train_target, test_target = preprocessor.split_data(data_encoded, target_column='plays')

    # Save preprocessors (make sure the output directory exists first)
    os.makedirs('models', exist_ok=True)
    preprocessor.save_preprocessors(directory='models/')

    id_columns = ['user_id_encoded', 'music_id_encoded']

    def to_tensors(features_df, target_series):
        """Split a feature frame into (user_ids, item_ids, dense, target) tensors."""
        # IDs are read as integers directly; routing them through a float32
        # tensor first would corrupt indices above 2**24.
        user_ids = torch.tensor(features_df['user_id_encoded'].to_numpy(), dtype=torch.long)
        item_ids = torch.tensor(features_df['music_id_encoded'].to_numpy(), dtype=torch.long)
        dense = torch.tensor(features_df.drop(columns=id_columns).to_numpy(), dtype=torch.float)
        target = torch.tensor(target_series.to_numpy(), dtype=torch.float)
        return user_ids, item_ids, dense, target

    train_user_ids, train_item_ids, train_dense, train_target_tensor = to_tensors(train_df, train_target)
    test_user_ids, test_item_ids, test_dense, test_target_tensor = to_tensors(test_df, test_target)

    # Initialize the model
    num_users = len(preprocessor.user_id_encoder.classes_)
    num_items = len(preprocessor.music_id_encoder.classes_)
    embedding_dim = 128
    num_layers = 3
    hidden_dims = [256, 128, 64]
    num_features = train_dense.shape[1]
    dropout_prob = 0.2

    model = HybridRecommender(num_users, num_items, embedding_dim, num_features, num_layers, hidden_dims, dropout_prob)

    # Define optimizer and scheduler
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    # Define loss function
    criterion = ListNetLoss(k=10)

    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    print(f"Training on device: {device}")

    # Create DataLoader for batch processing
    dataset = TensorDataset(train_user_ids, train_item_ids, train_dense, train_target_tensor)
    dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

    # Training loop
    num_epochs = 30
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        for batch_user_ids, batch_item_ids, batch_features, batch_target in dataloader:
            batch_user_ids = batch_user_ids.to(device)
            batch_item_ids = batch_item_ids.to(device)
            batch_features = batch_features.to(device)
            batch_target = batch_target.to(device)

            optimizer.zero_grad()
            predictions = model(batch_user_ids, batch_item_ids, batch_features)

            # Treat the whole batch as ONE list of shape (1, batch). The loss
            # applies softmax over dim=1; feeding it (batch, 1) tensors makes
            # every softmax equal to 1 and the loss identically zero.
            loss = criterion(predictions.reshape(1, -1), batch_target.reshape(1, -1))

            # Add regularization: sum of the L2 norms of all parameter tensors
            l2_reg = torch.tensor(0.).to(device)
            for param in model.parameters():
                l2_reg += torch.norm(param)
            loss += 1e-4 * l2_reg

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        scheduler.step()
        average_loss = epoch_loss / len(dataloader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}")

    # Evaluation with NDCG@10
    model.eval()
    with torch.no_grad():
        predictions = model(test_user_ids.to(device), test_item_ids.to(device), test_dense.to(device))
        ndcg_score = ndcg_at_k(test_target_tensor.cpu().numpy(), predictions.cpu().numpy(), k=10)
        print(f"Test NDCG@10: {ndcg_score:.4f}")

    # Save the trained model
    torch.save(model.state_dict(), 'models/model.pth')
    print("Model saved to 'models/model.pth'")

def ndcg_at_k(y_true, y_pred, k):
    """
    Calculate the Normalized Discounted Cumulative Gain (NDCG) at k.

    All entries are ranked together as a single list. The gain of an item is
    ``log2(y_true + 1)`` and the discount at 0-based rank ``r`` is
    ``log2(r + 2)``.

    Args:
        y_true (array-like): Ground truth scores (any shape; flattened).
        y_pred (array-like): Predicted scores (any shape; flattened).
        k (int): Rank cutoff.

    Returns:
        float: NDCG score, or 0.0 when the ideal DCG is zero, the input is
        empty, or ``k`` is not positive.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` hold a different number of items.
    """
    import numpy as np  # local import: numpy is not imported at file level

    y_true = np.asarray(y_true, dtype=float).reshape(-1)
    y_pred = np.asarray(y_pred, dtype=float).reshape(-1)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of items, "
            f"got {y_true.size} and {y_pred.size}"
        )
    if k <= 0 or y_true.size == 0:
        return 0.0

    # Rank items by predicted score, best first.
    order = np.argsort(y_pred)[::-1]
    discounts = np.log2(np.arange(y_true.size) + 2)[:k]

    gains = np.log2(y_true[order] + 1)[:k]
    dcg = np.sum(gains / discounts)

    # Best achievable DCG: items ranked by their true score.
    ideal_gains = np.log2(np.sort(y_true)[::-1] + 1)[:k]
    ideal_dcg = np.sum(ideal_gains / discounts)

    return float(dcg / ideal_dcg) if ideal_dcg != 0 else 0.0

# Run the training pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'preprocessing'

In [6]:
# inference.py
import torch
import pandas as pd
from preprocessing import DataPreprocessor
from tensorflow_docs.model import HybridRecommender
import pickle

def make_inference(model, user_id, item_id, features, user_encoder, item_encoder, device):
    """
    Perform inference to predict plays for a specific user-item pair.

    Args:
        model (nn.Module): Trained recommender model.
        user_id (str): The user ID.
        item_id (str): The item (music) ID.
        features (pd.Series): Feature vector for the user-item pair; any
            flat array-like of numbers is accepted as well.
        user_encoder (LabelEncoder): Fitted LabelEncoder for user IDs.
        item_encoder (LabelEncoder): Fitted LabelEncoder for item IDs.
        device (torch.device): Device where the model is loaded.

    Returns:
        float: Predicted number of plays.
    """
    model.eval()  # Set the model to evaluation mode
    user_id_encoded = torch.as_tensor(user_encoder.transform([user_id]), dtype=torch.long).to(device)
    item_id_encoded = torch.as_tensor(item_encoder.transform([item_id]), dtype=torch.long).to(device)

    # Convert a Series to a plain float array first so that its index labels
    # and dtype cannot interfere with tensor construction.
    if isinstance(features, pd.Series):
        features = features.to_numpy(dtype='float32')
    features_tensor = torch.as_tensor(features, dtype=torch.float).reshape(1, -1).to(device)

    with torch.no_grad():
        prediction = model(user_id_encoded, item_id_encoded, features_tensor)

    # The model may return either a 0-d scalar or a 1-element vector for a
    # single row; flattening first makes the indexing valid for both.
    return float(prediction.reshape(-1)[0].cpu())

def get_recommendations(model, user_id, data_encoded, user_id_encoder, item_encoder, device, top_k=10):
    """
    Generate top-k recommendations for a given user.

    Every item known to ``item_encoder`` is scored for the user and the
    ``top_k`` highest-scoring items are returned, best first.

    Args:
        model (nn.Module): Trained recommender model.
        user_id (str): The user ID for whom to generate recommendations.
        data_encoded (pd.DataFrame): The preprocessed and encoded dataset.
        user_id_encoder (LabelEncoder): Fitted LabelEncoder for user IDs.
        item_encoder (LabelEncoder): Fitted LabelEncoder for item IDs.
        device (torch.device): Device where the model is loaded.
        top_k (int): Number of top recommendations to return; capped at the
            number of known items.

    Returns:
        List[str]: List of recommended music IDs.

    Raises:
        ValueError: If the user is unknown to the encoder or has no rows in
            ``data_encoded``.
    """
    model.eval()  # Set model to evaluation mode

    # Encode the user ID
    try:
        user_id_encoded = user_id_encoder.transform([user_id])[0]
    except ValueError as err:
        raise ValueError(f"User ID {user_id} not found in encoder.") from err

    # Extract the feature vector from the user's first row. Both ID columns
    # and the target are removed so the width matches what the model was
    # trained on (the IDs are fed through the embedding layers instead).
    user_rows = data_encoded[data_encoded['user_id_encoded'] == user_id_encoded]
    if user_rows.empty:
        raise ValueError(f"User ID {user_id} has no rows in the encoded data.")
    feature_row = user_rows.drop(
        columns=['user_id_encoded', 'music_id_encoded', 'plays'], errors='ignore'
    ).iloc[0]
    # NOTE(review): this row also carries the item-side columns of whichever
    # track the user's first row refers to, and they are reused for every
    # candidate item below - confirm whether per-item features should be
    # looked up instead.
    user_features = torch.as_tensor(
        feature_row.to_numpy(dtype='float32'), dtype=torch.float
    ).unsqueeze(0).to(device)

    # Score the user against every known item
    num_items = len(item_encoder.classes_)
    item_ids = torch.arange(num_items, dtype=torch.long, device=device)
    user_ids = torch.full((num_items,), int(user_id_encoded), dtype=torch.long, device=device)

    # Disable gradient computation
    with torch.no_grad():
        scores = model(user_ids, item_ids, user_features.repeat(num_items, 1))

    # Flatten so a single-item catalogue (0-d output) is handled, and cap k
    # because torch.topk raises when k exceeds the number of scores.
    scores = scores.reshape(-1)
    k = max(0, min(top_k, scores.numel()))
    _, top_indices = torch.topk(scores, k)
    top_item_ids_encoded = top_indices.cpu().numpy()

    # Convert encoded item IDs back to original IDs
    top_item_ids = item_encoder.inverse_transform(top_item_ids_encoded)

    return top_item_ids.tolist()

ModuleNotFoundError: No module named 'preprocessing'