In [291]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, precision_score, accuracy_score, recall_score, f1_score, r2_score, ndcg_score, roc_curve, auc, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime as dt
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import random
import numpy as np
import joblib
import pickle
import warnings
warnings.filterwarnings('ignore')

In [292]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/BT4301/Dataset/old dataset

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/10UxIGmI1nRKsdfd_xARinm-FmI8_NeVG/BT4301/Dataset/old dataset


In [293]:
class KindleDataset(Dataset):
    def __init__(self, customer_file='customer_clean.xlsx', product_file='product_clean.xlsx', seq_length=10, mode='train'):
        self.seq_length = seq_length
        self.mode = mode

        self.data, self.label_encoders, self.title_tfidf = self.load_and_preprocess_data(customer_file, product_file)

        self.user_sequences, self.targets = self.create_sequences_with_ratings()

        if mode == 'eval':
            self.user_ids = self.data['user_id'].unique()
            self.user_sequences = self.user_sequences[[i for i, seq in enumerate(self.user_sequences) if len(seq) >= 2]]
            self.targets = self.targets[[i for i, seq in enumerate(self.user_sequences) if len(seq) >= 2]]
        else:
            self.user_ids = self.data['user_id'].unique()

        self.num_books = len(self.label_encoders['parent_asin'].classes_)
        self.num_categories = len(self.label_encoders['categories'].classes_)

    def load_and_preprocess_data(self, customer_file, product_file, min_purchases=5):
        review_data = pd.read_excel(customer_file)
        review_data = review_data.drop(columns=['title', 'verified_purchase', 'text', 'asin', 'helpful_vote'])
        metadata = pd.read_excel(product_file)
        metadata = metadata.drop(columns=['subtitle', 'language'])
        self.asin_to_category = dict(zip(review_data['parent_asin'], metadata['categories']))
        data = pd.merge(review_data, metadata, on=['parent_asin'])

        # Filter users based on min_purchases
        user_counts = data.groupby('user_id')['parent_asin'].nunique()
        valid_users = user_counts[user_counts >= min_purchases].index
        data = data[data['user_id'].isin(valid_users)]

        # Preprocessing steps
        data['rating'] = data['rating'].astype(float)
        data['price'] = pd.to_numeric(data['price'], errors='coerce')
        data['price'] = data['price'].fillna(0)
        data['price'] = data['price'].astype(float)
        data['print_length'] = data['print_length'].fillna(0)
        data['timestamp'] = pd.to_datetime(data['timestamp'], unit='ms')
        data['publication_date'] = pd.to_datetime(data['publication_date'], errors='coerce')
        data['days_since_publication'] = (dt.now() - data['publication_date']).dt.days
        data['days_since_publication'].fillna(data['days_since_publication'].mean(), inplace=True)

        # Encode categorical variables
        label_encoders = {}
        for col in ['user_id', 'parent_asin', 'categories', 'author']:
            label_encoders[col] = LabelEncoder()
            data[col] = label_encoders[col].fit_transform(data[col].fillna('Unknown'))

        # Normalize numerical features
        self.scaler = MinMaxScaler()
        numeric_cols = ['rating', 'average_rating', 'rating_number', 'price', 'days_since_publication', 'print_length']
        data[numeric_cols] = self.scaler.fit_transform(data[numeric_cols])

        # Apply TF-IDF on text data
        data['title'] = data['title'].fillna('Unknown')
        vectorizer_title = TfidfVectorizer(max_features=50)
        title_tfidf = vectorizer_title.fit_transform(data['title']).toarray()

        return data, label_encoders, title_tfidf

    def create_sequences_with_ratings(self):
        sequences = []
        targets = []
        numeric_cols = [column for column in self.data.columns.tolist()
                        if self.data[column].dtype in ('int64', 'float64')
                        and column not in ['rating', 'user_id', 'parent_asin']]
        groups = self.data.sort_values('timestamp').groupby('user_id')
        seq_length = self.seq_length

        for user_id, group in groups:
            if len(group) >= self.seq_length:
                # Reset index to ensure it starts from 0
                group = group.reset_index(drop=True)
                for i in range(len(group) - seq_length + 1):
                    numeric_seq = group[numeric_cols].iloc[i:i + seq_length].values
                    title_seq = self.title_tfidf[group.index[i:i + seq_length]]

                    # Ensure all categorical features are converted to numeric
                    parent_asin_seq = group['parent_asin'].iloc[i:i + seq_length].values.reshape(-1, 1).astype(np.float32)
                    categories_seq = group['categories'].iloc[i:i + seq_length].values.reshape(-1, 1).astype(np.float32)
                    author_seq = group['author'].iloc[i:i + seq_length].values.reshape(-1, 1).astype(np.float32)

                    seq = np.concatenate([numeric_seq, parent_asin_seq, categories_seq,
                                          author_seq, title_seq], axis=1)
                    sequences.append(seq)

                    # Target is the rating for the current book
                    target_val = group['rating'].iloc[i + seq_length - 1]
                    targets.append(target_val)

        return torch.tensor(np.array(sequences), dtype=torch.float32), torch.tensor(np.array(targets), dtype=torch.float32)

    def __len__(self):
        if self.mode == 'train':
            return len(self.user_sequences)
        else:
            return len(self.user_ids)

    def __getitem__(self, idx):
        if self.mode == 'train':
            user_sequence = self.user_sequences[idx]
            target = self.targets[idx]
        else:
            user_sequence = self.user_sequences[idx]
            target = self.targets[idx]

        # Assign values to input_seq and labels
        input_seq = user_sequence
        labels = target

        # Convert to PyTorch tensors
        input_seq = torch.tensor(input_seq, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.float32)

        return input_seq, labels

In [294]:
class LSTMRecommender(nn.Module):
    def __init__(self, num_books, embedding_dim, hidden_size, num_layers, input_size):
        super(LSTMRecommender, self).__init__()
        self.book_embedding = nn.Embedding(num_books, embedding_dim)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_books)

    def forward(self, x):
        num_features = x[:, :, :6]
        parent_asin = x[:, :, 6].long()
        categories = x[:, :, 7].long()
        author = x[:, :, 8].long()
        title_tfidf = x[:, :, 9:]

        # Get embeddings for categorical features
        parent_asin_emb = self.book_embedding(parent_asin)

        # Concatenate all features
        all_features = torch.cat([num_features, parent_asin_emb, title_tfidf,
                                  categories.unsqueeze(-1).float(), author.unsqueeze(-1).float()], dim=2)

        input_size = all_features.shape[-1]
        self.lstm = nn.LSTM(input_size, self.lstm.hidden_size, self.lstm.num_layers, batch_first=True)
        lstm_out, _ = self.lstm(all_features)

        # Get output for the last time step and predict for all books
        output = self.fc(lstm_out[:, -1, :])

        # Reshape output if it's squeezed to a single value
        if output.dim() == 1:
            output = output.unsqueeze(0)

        # If the model predicts the same rating for all books, `topk` will fail
        output = output + torch.randn_like(output) * 1e-6

        # Apply sigmoid activation to ensure output is within a valid range
        output = torch.sigmoid(output)

        return output

In [295]:
def train_model(model, train_loader, num_epochs=5, learning_rate=0.001, batch_size=64):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    for epoch in range(num_epochs):
        total_loss = 0
        num_batches = 0

        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)

            # Get predicted rating for the target book
            target_book_indices = batch_X[:, -1, 6].long()  # Get parent_asin from input
            predicted_ratings = outputs.gather(1, target_book_indices.unsqueeze(1)).squeeze(1)

            loss = criterion(predicted_ratings, batch_y)  # Calculate loss for target book only
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        avg_loss = total_loss / num_batches
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

In [296]:
def predict(model, input_sequence, book_encoder, top_k=5):
    model.eval()
    with torch.no_grad():

        input_sequence = torch.tensor(input_sequence, dtype=torch.float32).unsqueeze(0) #
        predicted_ratings = model(input_sequence)
        _, top_k_indices = torch.topk(predicted_ratings.squeeze(), top_k)
        recommended_asins = book_encoder.inverse_transform(top_k_indices.cpu().numpy())

    return list(map(str, recommended_asins))  # Return the recommended ASINs as strings

In [297]:
def evaluate_model(model, eval_loader, book_encoder, top_k=5):
    model.eval()
    hits = 0
    total = 0
    ndcg_sum = 0

    with torch.no_grad():
        for input_sequence, target_rating in eval_loader:
            predicted_ratings = model(input_sequence)
            _, top_k_indices = torch.topk(predicted_ratings.squeeze(), top_k)

            # Iterate over each sample in the batch
            for i in range(input_sequence.shape[0]):
                target_book_id = input_sequence[i, -1, -4].long().item()

                if target_book_id in top_k_indices.cpu().numpy():
                    hits += 1
                    rank = np.where(top_k_indices.cpu().numpy() == target_book_id)[0][0] + 1
                    ndcg_sum += 1 / np.log2(rank + 1)

            total += input_sequence.shape[0]

    # Calculate overall metrics
    hit_rate = hits / total if total > 0 else 0
    ndcg = ndcg_sum / total if total > 0 else 0

    print(f"Evaluation Results!")
    print(f"Hits: {hits}/{total}")
    print(f"Hit Rate (Top {top_k}): {hit_rate:.4f}")
    print(f"NDCG (Top {top_k}): {ndcg:.4f}")

    return hit_rate, ndcg

In [298]:
def main():

    # Create train and eval datasets
    train_dataset = KindleDataset(mode='train')
    eval_dataset = KindleDataset(mode='eval')

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    eval_loader = DataLoader(eval_dataset, batch_size=64, shuffle=False)

    # Get necessary information from the dataset
    num_books = train_dataset.num_books
    num_categories = train_dataset.num_categories
    input_size = train_dataset.data.shape[1]
    embedding_dim = 64
    hidden_size = 64
    num_layers = 2
    num_epochs = 10
    learning_rate = 0.01
    batch_size = 64

    # Instantiate the LSTMRecommender model
    model = LSTMRecommender(num_books, embedding_dim, hidden_size, num_layers, input_size)

    # Train the model
    print("Training the model...")
    train_model(model, train_loader)  # Use train_loader for training

    # Evaluate the model
    print("\nEvaluating the model...")
    evaluate_model(model, eval_loader, train_dataset.label_encoders['parent_asin'], top_k=10)

    # Save the model and components
    torch.save(model.state_dict(), 'lstm_model.pth')
    for encoder_name, encoder in train_dataset.label_encoders.items():
        joblib.dump(encoder, f'{encoder_name}_encoder.pkl')
    joblib.dump(train_dataset.title_tfidf, 'title_tfidf_vectorizer.pkl')
    joblib.dump(train_dataset.scaler, 'scaler.pkl')
    with open('asin_to_category.pkl', 'wb') as f:
        pickle.dump(train_dataset.asin_to_category, f)
    print("Model and components saved successfully!")

In [299]:
if __name__ == "__main__":
    main()

Training the model...
Epoch [1/5], Loss: 0.1491
Epoch [2/5], Loss: 0.1551
Epoch [3/5], Loss: 0.1560
Epoch [4/5], Loss: 0.1562
Epoch [5/5], Loss: 0.1544

Evaluating the model...
Evaluation Results!
Hits: 0/32
Hit Rate (Top 10): 0.0000
NDCG (Top 10): 0.0000
Model and components saved successfully!


In [300]:
# Load the trained model
train_dataset = KindleDataset(mode='train')

num_books = train_dataset.num_books
num_categories = train_dataset.num_categories
input_size = train_dataset.data.shape[1]
embedding_dim = 64
hidden_size = 64
num_layers = 2
num_epochs = 10
learning_rate = 0.01
batch_size = 64

# Instantiate the LSTMRecommender model
model = LSTMRecommender(num_books, embedding_dim, hidden_size, num_layers, input_size)

model.eval()  # Set the model to evaluation mode

# Load the book encoder
book_encoder = joblib.load('parent_asin_encoder.pkl')

In [301]:
eval_dataset = KindleDataset(mode='eval')
sample_input, _ = eval_dataset[0]

In [302]:
recommendations = predict(model, sample_input, book_encoder, top_k=5)
print(recommendations)

['B013P2ETBU', 'B003TZLMOG', 'B083RLXTLP', 'B08YGVGPFV', 'B09P38QMHM']
