In [60]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, precision_score, accuracy_score, recall_score, f1_score, r2_score, ndcg_score, roc_curve, auc, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime as dt

import warnings
warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd drive/MyDrive/BT4301/Dataset

/content/drive/.shortcut-targets-by-id/10UxIGmI1nRKsdfd_xARinm-FmI8_NeVG/BT4301/Dataset


# Data Preprocessing

In [4]:
def load_and_preprocess_data(review_data_path, metadata_path):
    review_data = pd.read_csv(review_data_path)
    metadata = pd.read_csv(metadata_path)
    data = pd.merge(review_data, metadata, on=['parent_asin'])

    # Preprocessing steps
    data['rating'] = data['rating'].astype(float)
    data.drop(columns=['verified_purchase'], inplace=True)
    data['price'] = pd.to_numeric(data['price'], errors='coerce')
    data['price'] = data['price'].fillna(0)
    data['price'] = data['price'].astype(float)
    data['timestamp'] = pd.to_datetime(data['timestamp'], unit='ms')
    data['publication_date'] = pd.to_datetime(data['publication_date'], errors='coerce')
    data['days_since_publication'] = (dt.now() - data['publication_date']).dt.days
    data['days_since_publication'].fillna(data['days_since_publication'].mean(), inplace=True)

    # Encode categorical variables
    label_encoders = {}
    for col in ['user_id', 'parent_asin', 'genre', 'author', 'language']:
        label_encoders[col] = LabelEncoder()
        data[col] = label_encoders[col].fit_transform(data[col].fillna('Unknown'))

    # Normalize numerical features
    scaler = MinMaxScaler()
    numeric_cols = ['rating', 'average_rating', 'rating_number', 'price', 'days_since_publication']
    data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

    # Apply TF-IDF on text data
    data[['title', 'publisher']] = data[['title', 'publisher']].fillna('Unknown')
    vectorizer_title = TfidfVectorizer(max_features=50)
    title_tfidf = vectorizer_title.fit_transform(data['title']).toarray()
    vectorizer_publisher = TfidfVectorizer(max_features=50)
    publisher_tfidf = vectorizer_publisher.fit_transform(data['publisher']).toarray()

    return data, label_encoders, title_tfidf, publisher_tfidf

In [11]:
# Data loading and preprocessing
data, label_encoders, title_tfidf, publisher_tfidf = load_and_preprocess_data(
    review_data_path="book_reviews_with_sentiment_sample10k_segmented_withuserhistory.csv",
    metadata_path="old dataset/books_metadata.csv"
)

In [5]:
def create_sequences_with_ratings(data, seq_length, title_tfidf, publisher_tfidf):
    sequences = []
    targets = []
    numeric_cols = [column for column in data.columns.tolist()
                    if data[column].dtype in ('int64', 'float64')
                    and column not in ['rating', 'user_id', 'parent_asin', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4']]
    groups = data.sort_values('timestamp').groupby('user_id')

    for user_id, group in groups:
        if len(group) >= seq_length:
            for i in range(len(group) - seq_length + 1):
                numeric_seq = group[numeric_cols].iloc[i:i + seq_length].values
                title_seq = title_tfidf[group.index[i:i + seq_length]]
                publisher_seq = publisher_tfidf[group.index[i:i + seq_length]]

                # Ensure all categorical features are converted to numeric
                parent_asin_seq = group['parent_asin'].iloc[i:i + seq_length].values.reshape(-1, 1).astype(np.float32)
                genre_seq = group['genre'].iloc[i:i + seq_length].values.reshape(-1, 1).astype(np.float32)
                author_seq = group['author'].iloc[i:i + seq_length].values.reshape(-1, 1).astype(np.float32)
                language_seq = group['language'].iloc[i:i + seq_length].values.reshape(-1, 1).astype(np.float32)

                seq = np.concatenate([numeric_seq, parent_asin_seq, genre_seq,
                                      author_seq, language_seq, title_seq,
                                      publisher_seq], axis=1)
                sequences.append(seq)

                # Target is the rating for the current book
                target_val = group['rating'].iloc[i + seq_length - 1]
                targets.append(target_val)

    return torch.tensor(np.array(sequences), dtype=torch.float32), torch.tensor(np.array(targets), dtype=torch.float32)

In [12]:
# Create sequences
X, y = create_sequences_with_ratings(data, seq_length, title_tfidf, publisher_tfidf)

In [13]:
# Train-test split with time-based validation
split_idx = int(0.8 * len(X))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# LSTM Model

In [31]:
class LSTMRecommender(nn.Module):
    def __init__(self, num_books, embedding_dim, hidden_size, num_layers, input_size):
        super(LSTMRecommender, self).__init__()
        self.book_embedding = nn.Embedding(num_books, embedding_dim)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)  # Use input_size here
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        num_features = x[:, :, :156] # Numerical features
        parent_asin = x[:, :, 156].long() # parent_asin
        genre = x[:, :, 157].long() # genre
        author = x[:, :, 158].long() # author
        language = x[:, :, 159].long() # language
        title_tfidf = x[:, :, 160:210] # title_tfidf
        publisher_tfidf = x[:, :, 210:] # publisher_tfidf

        # Get embeddings for categorical features
        parent_asin_emb = self.book_embedding(parent_asin)

        # Concatenate all features
        # Ensure genre, author, and language are converted to float and have the correct dimensions
        all_features = torch.cat([num_features, parent_asin_emb, title_tfidf, publisher_tfidf,
                                  genre.unsqueeze(-1).float(), author.unsqueeze(-1).float(),
                                  language.unsqueeze(-1).float()], dim=2)

        # Pass through LSTM
        lstm_out, _ = self.lstm(all_features)

        # Get output for the last time step
        output = self.fc(lstm_out[:, -1, :])

        return output

In [32]:
def train_model(model, X_train, y_train, num_epochs, learning_rate, batch_size):
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    for epoch in range(num_epochs):
        total_loss = 0  # Accumulate loss for the epoch
        num_batches = 0  # Count batches

        for i in range(0, len(X_train), batch_size):
            batch_X = X_train[i:i + batch_size]
            batch_y = y_train[i:i + batch_size]

            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        avg_loss = total_loss / num_batches  # Calculate average loss for the epoch
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

In [33]:
def predict_rating(model, input_sequence):
    with torch.no_grad():
        output = model(input_sequence.unsqueeze(0)) # Add batch dimension
        predicted_rating = output.item()

    return predicted_rating

In [72]:
def evaluate_model(model, X_test, y_test, threshold=0.5):
    """Evaluates the model on the test set."""
    y_pred = [predict_rating(model, seq) for seq in X_test]

    if np.isnan(y_pred).any():
        y_pred = np.nan_to_num(y_pred)

    y_test = y_test.numpy()
    y_pred = np.array(y_pred)

    y_pred_rank = np.argsort(np.argsort(-y_pred)) + 1
    ndcg = ndcg_score(y_test.reshape(1, -1), y_pred_rank.reshape(1, -1))

    # Reshape y_test and y_pred to be 2D arrays
    y_test = y_test.reshape(-1, 1)
    y_pred = y_pred.reshape(-1, 1)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    y_test_binary = (y_test >= threshold).astype(int)
    y_pred_binary = (y_pred >= threshold).astype(int)

    precision = precision_score(y_test_binary, y_pred_binary)
    recall = recall_score(y_test_binary, y_pred_binary)
    f1 = f1_score(y_test_binary, y_pred_binary)
    auc = roc_auc_score(y_test_binary, y_pred_binary)
    accuracy = accuracy_score(y_test_binary, y_pred_binary)

    print(f"MSE: {mse}\nRMSE: {rmse}\nMAE: {mae}\nR2: {r2}\nPrecision: {precision}\nNDCG: {ndcg}\nRecall: {recall}\nF1: {f1}\nAUC: {auc}\nAccuracy: {accuracy}")

# Execution

In [39]:
embedding_dim = 64
hidden_size = 64
num_layers = 2
num_epochs = 10
learning_rate = 0.01
batch_size = 64
seq_length = 10

In [40]:
# Model instantiation and training
num_books = data['parent_asin'].nunique()
numeric_cols = [column for column in data.columns.tolist()
                if data[column].dtype in ('int64', 'float64')
                and column not in ['rating', 'user_id', 'parent_asin', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4']]
input_size = len(numeric_cols) + embedding_dim + 50 + 50 + 3

model = LSTMRecommender(num_books, embedding_dim, hidden_size, num_layers, input_size)
train_model(model, X_train, y_train, num_epochs, learning_rate, batch_size)

Epoch [1/10], Loss: nan
Epoch [2/10], Loss: nan
Epoch [3/10], Loss: nan
Epoch [4/10], Loss: nan
Epoch [5/10], Loss: nan
Epoch [6/10], Loss: nan
Epoch [7/10], Loss: nan
Epoch [8/10], Loss: nan
Epoch [9/10], Loss: nan
Epoch [10/10], Loss: nan


In [73]:
# Evaluation
evaluate_model(model, X_test, y_test)

MSE: 0.8311034142140722
RMSE: 0.9116487340056323
MAE: 0.8847702750620536
R2: -16.212464569598023
Precision: 0.0
NDCG: 0.9853275702399711
Recall: 0.0
F1: 0.0
AUC: 0.5
Accuracy: 0.04559039562332202
