In [28]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
BASE_DIR = "/content/drive/MyDrive"
JSON_PATH = f"{BASE_DIR}/Digital_Music.jsonl.gz"
STOPWORDS_PATH = f"{BASE_DIR}/stopwords.txt"
PUNCT_PATH = f"{BASE_DIR}/punctuations.txt"
SAVE_DIR = f"{BASE_DIR}/processed"

WORD2VEC_PATH = f"{BASE_DIR}/glove.6B.300d.txt"

MODEL_SAVE_PATH = f"{BASE_DIR}/models/deepconn.pt"

In [30]:
import argparse
import os
import sys
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import WordPunctTokenizer

def process_dataset(json_path, select_cols, train_rate, csv_path):
    print('#### Read the json file...')
    if json_path.endswith('gz'):
        df = pd.read_json(json_path, lines=True, compression='gzip')
    else:
        df = pd.read_json(json_path, lines=True)

    df = df[select_cols]
    df.columns = ['userID', 'itemID', 'review', 'rating']

    df['userID'] = df.groupby(df['userID']).ngroup()
    df['itemID'] = df.groupby(df['itemID']).ngroup()

    with open(STOPWORDS_PATH) as f:
        stop_words = set(f.read().splitlines())
    with open(PUNCT_PATH) as f:
        punctuations = set(f.read().splitlines())

    def clean_review(review):
        review = review.lower()
        for p in punctuations:
            review = review.replace(p, ' ')
        review = WordPunctTokenizer().tokenize(review)
        review = [w for w in review if w not in stop_words]
        return ' '.join(review)

    df = df.drop(df[[not isinstance(x, str) or len(x) == 0 for x in df['review']]].index)
    df['review'] = df['review'].apply(clean_review)

    train, valid = train_test_split(df, test_size=1 - train_rate, random_state=3)
    valid, test = train_test_split(valid, test_size=0.5, random_state=4)

    os.makedirs(csv_path, exist_ok=True)
    train.to_csv(os.path.join(csv_path, 'train.csv'), index=False, header=False)
    valid.to_csv(os.path.join(csv_path, 'valid.csv'), index=False, header=False)
    test.to_csv(os.path.join(csv_path, 'test.csv'), index=False, header=False)

    print(f'#### Split and saved dataset as csv: train {len(train)}, valid {len(valid)}, test {len(test)}')
    print(f'#### Total: {len(df)} reviews, {len(df.groupby("userID"))} users, {len(df.groupby("itemID"))} items.')

    return train, valid, test


In [31]:
import pandas as pd

# Load only a few rows to inspect structure
df_sample = pd.read_json('/content/drive/MyDrive/Digital_Music.jsonl.gz',
                         lines=True, compression='gzip')

print(df_sample.columns.tolist())
df_sample.head()


['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase']


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5,Nice,If i had a dollar for how many times I have pl...,[],B004RQ2IRG,B004RQ2IRG,AFUOYIZBU3MTBOLYKOJE5Z35MBDA,2021-04-21 02:36:53.292,0,True
1,5,Excellent,awesome sound - cant wait to see them in perso...,[],B0026UZEI0,B0026UZEI0,AHGAOIZVODNHYMNCBV4DECZH42UQ,2011-06-15 19:52:05.000,0,True
2,5,Great service,This is a great cd. Good music and plays well....,[],B0055JSYHC,B0055JSYHC,AFGEM6BXCYHUILEOA3P2ZYBEF2TA,2021-03-15 20:06:33.006,0,True
3,1,No good,"These are not real German singers, they have a...",[],B000F9SMUQ,B000F9SMUQ,AH3OG6QD6EDJGZRVCFKV4B66VWNQ,2014-07-13 02:49:01.000,0,True
4,3,"Cool concept, so-so execution...",I first heard this playing in a Nagoya shop an...,[],B0049D1WVK,B0049D1WVK,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,2011-06-25 19:19:55.000,0,False


In [32]:
train_df, valid_df, test_df = process_dataset(
    JSON_PATH,
    ['user_id', 'asin', 'text', 'rating'],
    train_rate=0.8,
    csv_path=SAVE_DIR
)


#### Read the json file...
#### Split and saved dataset as csv: train 104340, valid 13043, test 13043
#### Total: 130426 reviews, 100944 users, 70516 items.


In [33]:
import time
import pandas as pd
import torch
from torch.utils.data import Dataset

def date(f='%Y-%m-%d %H:%M:%S'):
    return time.strftime(f, time.localtime())

def load_embedding(word2vec_file):
    with open(word2vec_file, encoding='utf-8') as f:
        word_emb = []
        word_dict = {}

        word_emb.append([0])
        word_dict['<UNK>'] = 0

        for line in f.readlines():
            tokens = line.split(' ')
            word_emb.append([float(i) for i in tokens[1:]])
            word_dict[tokens[0]] = len(word_dict)

        word_emb[0] = [0] * len(word_emb[1])
    return word_emb, word_dict

def predict_mse(model, dataloader, device):
    mse, sample_count = 0, 0
    with torch.no_grad():
        for batch in dataloader:
            user_reviews, item_reviews, ratings = map(lambda x: x.to(device), batch)
            pred = model(user_reviews, item_reviews)
            mse += torch.nn.functional.mse_loss(pred, ratings, reduction='sum').item()
            sample_count += len(ratings)
    return mse / sample_count

class DeepCoNNDataset(Dataset):
    def __init__(self, data_path, word_dict, config, retain_rui=True):
        self.word_dict = word_dict
        self.config = config
        self.retain_rui = retain_rui
        self.PAD_WORD_idx = self.word_dict[config.PAD_WORD]
        self.review_length = config.review_length
        self.review_count = config.review_count
        self.lowest_r_count = config.lowest_review_count

        df = pd.read_csv(data_path, header=None, names=['userID','itemID','review','rating'])
        df['review'] = df['review'].apply(self._review2id)

        self.sparse_idx = set()
        user_reviews = self._get_reviews(df)
        item_reviews = self._get_reviews(df, 'itemID', 'userID')

        rating = torch.Tensor(df['rating'].to_list()).view(-1, 1)

        keep_idx = [i for i in range(len(rating)) if i not in self.sparse_idx]
        self.user_reviews = user_reviews[keep_idx]
        self.item_reviews = item_reviews[keep_idx]
        self.rating = rating[keep_idx]

    def __getitem__(self, idx):
        return self.user_reviews[idx], self.item_reviews[idx], self.rating[idx]

    def __len__(self):
        return self.rating.shape[0]

    def _review2id(self, review):
        if not isinstance(review, str):
            return []
        wids = []
        for word in review.split():
            wids.append(self.word_dict.get(word, self.PAD_WORD_idx))
        return wids

    def _get_reviews(self, df, lead='userID', costar='itemID'):
        reviews_by_lead = dict(list(df[[costar, 'review']].groupby(df[lead])))
        lead_reviews = []

        for idx, (lead_id, costar_id) in enumerate(zip(df[lead], df[costar])):
            df_data = reviews_by_lead[lead_id]
            if self.retain_rui:
                reviews = df_data['review'].to_list()
            else:
                reviews = df_data['review'][df_data[costar] != costar_id].to_list()

            if len(reviews) < self.lowest_r_count:
                self.sparse_idx.add(idx)

            reviews = self._adjust_review_list(reviews, self.review_length, self.review_count)
            lead_reviews.append(reviews)

        return torch.LongTensor(lead_reviews)

    def _adjust_review_list(self, reviews, r_length, r_count):
        reviews = reviews[:r_count] + [[self.PAD_WORD_idx]*r_length]*(r_count-len(reviews))
        reviews = [r[:r_length] + [0]*(r_length-len(r)) for r in reviews]
        return reviews


In [34]:
import torch
from torch import nn

class CNN(nn.Module):
    def __init__(self, config, word_dim):
        super().__init__()
        self.kernel_count = config.kernel_count
        self.review_count = config.review_count

        self.conv = nn.Sequential(
            nn.Conv1d(word_dim, config.kernel_count, config.kernel_size,
                      padding=(config.kernel_size-1)//2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(1, config.review_length)),
            nn.Dropout(config.dropout_prob)
        )

        self.linear = nn.Sequential(
            nn.Linear(config.kernel_count * config.review_count,
                      config.cnn_out_dim),
            nn.ReLU(),
            nn.Dropout(config.dropout_prob)
        )

    def forward(self, vec):
        latent = self.conv(vec.permute(0,2,1))
        latent = self.linear(latent.reshape(-1, self.kernel_count * self.review_count))
        return latent

class FactorizationMachine(nn.Module):
    def __init__(self, p, k):
        super().__init__()
        self.v = nn.Parameter(torch.rand(p,k)/10)
        self.linear = nn.Linear(p,1, bias=True)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        linear_part = self.linear(x)
        inter_part1 = torch.mm(x, self.v) ** 2
        inter_part2 = torch.mm(x**2, self.v**2)
        pair_interactions = torch.sum(inter_part1 - inter_part2, dim=1, keepdim=True)
        pair_interactions = self.dropout(pair_interactions)
        return linear_part + 0.5 * pair_interactions

class DeepCoNN(nn.Module):
    def __init__(self, config, word_emb):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.Tensor(word_emb))
        self.cnn_u = CNN(config, word_dim=self.embedding.embedding_dim)
        self.cnn_i = CNN(config, word_dim=self.embedding.embedding_dim)
        self.fm = FactorizationMachine(config.cnn_out_dim*2, 10)

    def forward(self, user_review, item_review):
        n = user_review.shape[0] * user_review.shape[1]

        user_review = user_review.reshape(n, -1)
        item_review = item_review.reshape(n, -1)

        u_vec = self.embedding(user_review)
        i_vec = self.embedding(item_review)

        user_latent = self.cnn_u(u_vec)
        item_latent = self.cnn_i(i_vec)

        concat = torch.cat((user_latent, item_latent), dim=1)
        return self.fm(concat)


In [35]:
class Config:
    PAD_WORD = "<UNK>"
    word2vec_file = WORD2VEC_PATH

    train_file = f"{SAVE_DIR}/train.csv"
    valid_file = f"{SAVE_DIR}/valid.csv"
    test_file = f"{SAVE_DIR}/test.csv"

    review_length = 40
    review_count = 10
    lowest_review_count = 1

    kernel_count = 100
    kernel_size = 3
    dropout_prob = 0.5
    cnn_out_dim = 50

    learning_rate = 2e-3
    learning_rate_decay = 0.99
    l2_regularization = 1e-5
    train_epochs = 20
    batch_size = 128

    device = "cuda" if torch.cuda.is_available() else "cpu"

    model_file = MODEL_SAVE_PATH


In [36]:
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
import os

def train(train_dataloader, valid_dataloader, model, config, model_path):
    print(f"{date()}## Start the training")
    train_mse = predict_mse(model, train_dataloader, config.device)
    valid_mse = predict_mse(model, valid_dataloader, config.device)
    print(f"{date()}#### Initial train mse {train_mse:.6f}, validation mse {valid_mse:.6f}")

    opt = torch.optim.Adam(model.parameters(), config.learning_rate,
                           weight_decay=config.l2_regularization)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(opt, config.learning_rate_decay)

    best_loss = float("inf")

    for epoch in range(config.train_epochs):
        model.train()
        total_loss = 0
        total_samples = 0

        for batch in train_dataloader:
            user_reviews, item_reviews, ratings = map(lambda x: x.to(config.device), batch)
            pred = model(user_reviews, item_reviews)
            loss = F.mse_loss(pred, ratings, reduction='sum')

            opt.zero_grad()
            loss.backward()
            opt.step()

            total_loss += loss.item()
            total_samples += len(pred)

        scheduler.step()

        model.eval()
        valid_mse = predict_mse(model, valid_dataloader, config.device)
        train_loss = total_loss / total_samples
        print(f"{date()}#### Epoch {epoch}; train mse {train_loss:.6f}; valid mse {valid_mse:.6f}")

        if valid_mse < best_loss:
            best_loss = valid_mse
            torch.save(model.state_dict(), model_path)

    print(f"{date()}## Training complete!")


In [37]:
config = Config()
word_emb, word_dict = load_embedding(config.word2vec_file)

train_ds = DeepCoNNDataset(config.train_file, word_dict, config)
valid_ds = DeepCoNNDataset(config.valid_file, word_dict, config, retain_rui=False)
test_ds = DeepCoNNDataset(config.test_file, word_dict, config, retain_rui=False)

train_loader = DataLoader(train_ds, batch_size=config.batch_size, shuffle=True)
valid_loader = DataLoader(valid_ds, batch_size=config.batch_size)
test_loader = DataLoader(test_ds, batch_size=config.batch_size)

model = DeepCoNN(config, word_emb).to(config.device)

os.makedirs(os.path.dirname(config.model_file), exist_ok=True)

train(train_loader, valid_loader, model, config, config.model_file)


2025-12-04 21:55:37## Start the training
2025-12-04 21:55:43#### Initial train mse 18.353775, validation mse 19.930627
2025-12-04 21:55:52#### Epoch 0; train mse 3.547456; valid mse 0.796637
2025-12-04 21:56:02#### Epoch 1; train mse 1.392096; valid mse 0.681907
2025-12-04 21:56:12#### Epoch 2; train mse 1.202635; valid mse 0.790914
2025-12-04 21:56:21#### Epoch 3; train mse 1.054365; valid mse 0.796280
2025-12-04 21:56:30#### Epoch 4; train mse 0.937446; valid mse 0.760346
2025-12-04 21:56:39#### Epoch 5; train mse 0.851450; valid mse 0.751205
2025-12-04 21:56:47#### Epoch 6; train mse 0.776468; valid mse 0.771678
2025-12-04 21:56:56#### Epoch 7; train mse 0.720963; valid mse 0.753000
2025-12-04 21:57:05#### Epoch 8; train mse 0.690743; valid mse 0.774045
2025-12-04 21:57:14#### Epoch 9; train mse 0.668069; valid mse 0.701788
2025-12-04 21:57:23#### Epoch 10; train mse 0.647937; valid mse 0.643174
2025-12-04 21:57:33#### Epoch 11; train mse 0.632460; valid mse 0.637819
2025-12-04 21:5

In [38]:
# import torch.serialization
# torch.serialization.add_safe_globals([DeepCoNN])
config = Config()
word_emb, word_dict = load_embedding(config.word2vec_file)
model = DeepCoNN(config, word_emb).to(config.device)
model.load_state_dict(torch.load(config.model_file))

# best_model = torch.load(config.model_file, weigts_only=False)
test_mse = predict_mse(model, test_loader, config.device)
print("Test MSE =", test_mse)


Test MSE = 0.9313329757620755


In [39]:
class LSTMRecommender(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, embedding_matrix=None, dropout=0.3):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        if embedding_matrix is not None:
            self.embedding.weight.data.copy_(embedding_matrix)
            self.embedding.weight.requires_grad = False

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )

        self.fc = nn.Linear(hidden_dim * 2, 1)

    def encode(self, x):
        """
        x shape: (B, R, L) â†’ flatten identical to DeepCoNN
        """

        B, R, L = x.shape
        x = x.reshape(B * R, L)

        emb = self.embedding(x)

        out, (h, c) = self.lstm(emb)
        final = h[-1]


        final = final.reshape(B, R, self.hidden_dim)

        final = final.mean(dim=1)

        return final

    def forward(self, user_reviews, item_reviews):
        u = self.encode(user_reviews)
        i = self.encode(item_reviews)

        concat = torch.cat([u, i], dim=1)
        return self.fc(concat)


In [40]:
word_emb, word_dict = load_embedding(config.word2vec_file)

embedding_matrix = torch.tensor(word_emb, dtype=torch.float32)
vocab_size, embedding_dim = embedding_matrix.shape

hidden_dim = 128
num_layers = 1

lstm_model = LSTMRecommender(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    embedding_matrix=embedding_matrix,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    dropout=0.2
).to(config.device)

print(lstm_model)


LSTMRecommender(
  (embedding): Embedding(400001, 300)
  (lstm): LSTM(300, 128, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)




In [41]:
train(train_loader, valid_loader, lstm_model, config, "lstm_recommender.pt")

2025-12-04 22:00:09## Start the training
2025-12-04 22:00:19#### Initial train mse 21.725424, validation mse 22.481798
2025-12-04 22:00:39#### Epoch 0; train mse 1.250038; valid mse 0.753593
2025-12-04 22:01:01#### Epoch 1; train mse 1.040353; valid mse 0.603189
2025-12-04 22:01:22#### Epoch 2; train mse 0.857900; valid mse 0.559598
2025-12-04 22:01:44#### Epoch 3; train mse 0.742492; valid mse 0.676079
2025-12-04 22:02:04#### Epoch 4; train mse 0.683897; valid mse 0.599923
2025-12-04 22:02:25#### Epoch 5; train mse 0.632091; valid mse 0.713930
2025-12-04 22:02:45#### Epoch 6; train mse 0.594055; valid mse 0.680540
2025-12-04 22:03:05#### Epoch 7; train mse 0.556322; valid mse 0.697598
2025-12-04 22:03:26#### Epoch 8; train mse 0.523958; valid mse 0.769524
2025-12-04 22:03:46#### Epoch 9; train mse 0.499952; valid mse 0.736893
2025-12-04 22:04:07#### Epoch 10; train mse 0.470407; valid mse 0.754257
2025-12-04 22:04:27#### Epoch 11; train mse 0.448087; valid mse 0.886634
2025-12-04 22:0

In [42]:
config = Config()
word_emb, word_dict = load_embedding(config.word2vec_file)

embedding_matrix = torch.tensor(word_emb, dtype=torch.float32)
vocab_size, embedding_dim = embedding_matrix.shape

hidden_dim = 128
num_layers = 1

lstm_model = LSTMRecommender(vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    embedding_matrix=embedding_matrix,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    dropout=0.2
).to(config.device)

lstm_model.load_state_dict(torch.load("lstm_recommender.pt"))

# best_model = torch.load(config.model_file, weigts_only=False)
test_mse = predict_mse(model, test_loader, config.device)
print("Test MSE =", test_mse)

Test MSE = 0.8634280200418272
