# CS608 Project 2: Amazon Fashion Recommendation

## Text Modality - Other models

### Setup

In [1]:
!pip install --quiet cornac

In [2]:
# Mount Google Drive at /content/gdrive; the CSV files read below live under this mount point.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import sys 
import cornac
import numpy as np
import pandas as pd

from cornac.eval_methods import RatioSplit, BaseMethod
from cornac.models import CTR, HFT, CDL
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer

%tensorflow_version 1.x
import tensorflow as tf

# Seed passed as `seed=` to every model and evaluation method in this notebook.
SEED = 42
# Verbosity flag passed as `verbose=` to every model and evaluation method in this notebook.
VERBOSE = True

TensorFlow 1.x selected.


In [4]:
# Load the three project tables from Drive, in this order:
#   1. training reviews, 2. "seen" test reviews, 3. item metadata.
reviews_train, reviews_test_seen, meta = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/gdrive/MyDrive/MITB/CS608/Project 2/train_review.csv',
        '/content/gdrive/MyDrive/MITB/CS608/Project 2/test_seen_review.csv',
        '/content/gdrive/MyDrive/MITB/CS608/Project 2/amazon_meta_full_43k.csv',
    )
)

In [5]:
# Show the first five training reviews to check the loaded columns.
reviews_train.head(n=5)

Unnamed: 0,overall,reviewTime,reviewerID,asin,style,reviewText,summary,unixReviewTime
0,5.0,"03 22, 2016",A2MOB79WI93FDN,6040972467,"{'Size:': ' X-Large', 'Color:': ' Pink'}",love it,Five Stars,1458604800
1,3.0,"11 19, 2016",A16O44MQ3ZS7CL,6040972467,"{'Size:': ' 4/6', 'Color:': ' Pink'}",This was too big for me in the bust area (and ...,Eh. Lol. Too big for my little size.,1479513600
2,3.0,"09 17, 2016",A14ZLDIG0XD6AL,6040972467,"{'Size:': ' 12/14', 'Color:': ' White and Blue'}","This dress fit fairly well, except that I'm pr...",Nice dress - fairly deep V,1474070400
3,3.0,"08 21, 2016",A339XOY8DU2UWE,6040972467,"{'Size:': ' 12/14', 'Color:': ' White And Red'}","The fit on this is awesome, and I love the des...","The fit on this is awesome, and I love the design",1471737600
4,5.0,"08 19, 2016",A2BLA3KO8HGANN,6040972467,"{'Size:': ' 16', 'Color:': ' White And Red'}",I am very pleased with this dress! I wasn't su...,Perfect Comfortable Summer Dress!,1471564800


In [8]:
# Reduce both review tables to the (user, item, rating) triple and rename the
# Amazon column names to user_id / item_id / rating.
reviews_only_train, reviews_only_test_seen = (
    frame[['reviewerID', 'asin', 'overall']].rename(
        columns={'reviewerID': 'user_id', 'asin': 'item_id', 'overall': 'rating'}
    )
    for frame in (reviews_train, reviews_test_seen)
)

reviews_only_train.head()

Unnamed: 0,user_id,item_id,rating
0,A2MOB79WI93FDN,6040972467,5.0
1,A16O44MQ3ZS7CL,6040972467,3.0
2,A14ZLDIG0XD6AL,6040972467,3.0
3,A339XOY8DU2UWE,6040972467,3.0
4,A2BLA3KO8HGANN,6040972467,5.0


In [9]:
# Turn each (user_id, item_id, rating) frame into a plain list of records;
# these lists are later passed as train_data / test_data to BaseMethod.from_splits.
reviews_dataset_train = [*reviews_only_train.to_records(index=False)]
reviews_dataset_test_seen = [*reviews_only_test_seen.to_records(index=False)]

In [10]:
# Convert metadata and review text to Cornac format (parallel lists: corpus[i] belongs to ids[i]).

# Item-side corpora taken from the metadata table; `meta_ids` is the id list shared by all three.
# NOTE(review): astype(str) turns missing values into the literal string "nan" - confirm
# that is acceptable or clean the columns upstream.
meta_ids = list(meta['asin'])

meta_title = list(meta['title'].astype(str))

meta_description = list(meta['description'].astype(str))
meta_description_kw = list(meta['description_keywords'].astype(str))

# Review text used as an item-side corpus.
# The cells below pass `review_text` / `review_ids_item` to TextModality as corpus / ids,
# which expects one document per id. The raw review table has one row per review, so an
# item with several reviews would show up under the same id many times. All reviews of an
# item are therefore concatenated into a single document (items kept in first-seen order).
_reviews_by_item = (
    reviews_train['reviewText']
    .astype(str)
    .groupby(reviews_train['asin'], sort=False)
    .agg(' '.join)
)
review_text = list(_reviews_by_item)
review_ids_item = list(_reviews_by_item.index)

# Reviewer id of every review row (one entry per row of `reviews_train`).
# This list is per-review and is NOT index-aligned with the per-item `review_text` above.
review_ids_user = list(reviews_train['reviewerID'])

In [11]:
# Metrics reported by every experiment: RMSE, followed by NCRR, Recall and NDCG,
# each evaluated at cut-offs 20 and 50.
eval_metrics = [cornac.metrics.RMSE()] + [
    metric_cls(k=cutoff)
    for metric_cls in (cornac.metrics.NCRR, cornac.metrics.Recall, cornac.metrics.NDCG)
    for cutoff in (20, 50)
]

### Other models

#### Collaborative Deep Learning (CDL)

In [14]:
# Collaborative Deep Learning, using item descriptions as the text corpus.
cdl = CDL(
    k=50,
    autoencoder_structure=[100],
    max_iter=20,
    vocab_size=8000,  # same size as max_vocab of the text modality below
    seed=SEED,
    verbose=VERBOSE,
)

# Interaction splits and the item-side corpus with its ids.
train_data, test_data = reviews_dataset_train, reviews_dataset_test_seen
docs, item_ids = meta_description, meta_ids

item_text_modality = TextModality(
    corpus=docs,
    ids=item_ids,
    tokenizer=BaseTokenizer(sep=" ", stop_words="english"),
    max_vocab=8000,
    max_doc_freq=0.8,
)

# Evaluate on the pre-made train / test split.
eval_method = BaseMethod.from_splits(
    train_data=train_data,
    test_data=test_data,
    item_text=item_text_modality,
    exclude_unknowns=True,
    seed=SEED,
    verbose=VERBOSE,
)

cornac.Experiment(
    eval_method=eval_method,
    models=[cdl],
    metrics=eval_metrics,
).run()

#### Hidden Factors as Topics (HFT)

In [None]:
# Hidden Factors as Topics, using review text (keyed by the reviewed item's asin) as the corpus.
hft = HFT(
    k=20,
    max_iter=20,
    grad_iter=20,
    l2_reg=0.001,
    lambda_text=0.01,
    vocab_size=8000,  # same size as max_vocab of the text modality below
    seed=SEED,
    verbose=VERBOSE,
)

# Interaction splits and the item-side corpus with its ids.
train_data = reviews_dataset_train
test_data = reviews_dataset_test_seen
docs = review_text
item_ids = review_ids_item

item_text_modality = TextModality(
    corpus=docs,
    ids=item_ids,
    tokenizer=BaseTokenizer(sep=" ", stop_words="english"),
    max_vocab=8000,
    max_doc_freq=0.8,
)

# Evaluate on the pre-made train / test split.
eval_method = BaseMethod.from_splits(
    train_data=train_data,
    test_data=test_data,
    exclude_unknowns=True,
    item_text=item_text_modality,
    verbose=VERBOSE,
    seed=SEED,
)

# Run against the evaluation method built just above. The previous version passed
# `ratio_split`, a name that is never defined in this notebook (NameError).
cornac.Experiment(eval_method=eval_method, models=[hft], metrics=eval_metrics).run()

### Other analysis

Baseline CTR model (with smaller k and max_iter)

In [13]:
# Baseline for the comparisons below: CTR with a small k and few iterations,
# using item descriptions as the text corpus.
ctr = CTR(k=10, max_iter=5, seed=SEED, verbose=VERBOSE)

# Interaction splits and the item-side corpus with its ids.
train_data, test_data = reviews_dataset_train, reviews_dataset_test_seen
docs, item_ids = meta_description, meta_ids

item_text_modality = TextModality(
    corpus=docs,
    ids=item_ids,
    tokenizer=BaseTokenizer(sep=" ", stop_words="english"),
    max_vocab=8000,
    max_doc_freq=0.8,
)

# Evaluate on the pre-made train / test split.
eval_method = BaseMethod.from_splits(
    train_data=train_data,
    test_data=test_data,
    item_text=item_text_modality,
    exclude_unknowns=True,
    seed=SEED,
    verbose=VERBOSE,
)

cornac.Experiment(
    eval_method=eval_method,
    models=[ctr],
    metrics=eval_metrics,
).run()

rating_threshold = 1.0
exclude_unknowns = True




---
Training data:
Number of users = 505047
Number of items = 42362
Number of ratings = 1053169
Max rating = 5.0
Min rating = 1.0
Global mean = 4.1




---
Test data:
Number of users = 168775
Number of items = 27914
Number of ratings = 191113
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 505047
Total items = 42362

[CTR] Training started!


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Learning completed!

[CTR] Evaluation started!


HBox(children=(FloatProgress(value=0.0, description='Rating', max=191113.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Ranking', max=168775.0, style=ProgressStyle(description_w…



TEST:
...
    |   RMSE | NCRR@20 | NCRR@50 | NDCG@20 | NDCG@50 | Recall@20 | Recall@50 | Train (s) |  Test (s)
--- + ------ + ------- + ------- + ------- + ------- + --------- + --------- + --------- + ---------
CTR | 2.7584 |  0.0814 |  0.0825 |  0.1028 |  0.1095 |    0.1725 |    0.2046 |  380.4465 | 1719.6546



1. Using item title as corpus instead of item description

In [14]:
# Variant 1: same CTR settings as the baseline, but item titles replace the
# item descriptions as the text corpus.
# NOTE(review): the metrics recorded for this run are identical to the baseline's
# to four decimals - confirm that the corpus change actually influences the model.
ctr = CTR(k=10, max_iter=5, seed=SEED, verbose=VERBOSE)

# Interaction splits and the item-side corpus with its ids.
train_data, test_data = reviews_dataset_train, reviews_dataset_test_seen
docs, item_ids = meta_title, meta_ids

item_text_modality = TextModality(
    corpus=docs,
    ids=item_ids,
    tokenizer=BaseTokenizer(sep=" ", stop_words="english"),
    max_vocab=8000,
    max_doc_freq=0.8,
)

# Evaluate on the pre-made train / test split.
eval_method = BaseMethod.from_splits(
    train_data=train_data,
    test_data=test_data,
    item_text=item_text_modality,
    exclude_unknowns=True,
    seed=SEED,
    verbose=VERBOSE,
)

cornac.Experiment(
    eval_method=eval_method,
    models=[ctr],
    metrics=eval_metrics,
).run()

rating_threshold = 1.0
exclude_unknowns = True




---
Training data:
Number of users = 505047
Number of items = 42362
Number of ratings = 1053169
Max rating = 5.0
Min rating = 1.0
Global mean = 4.1




---
Test data:
Number of users = 168775
Number of items = 27914
Number of ratings = 191113
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 505047
Total items = 42362

[CTR] Training started!


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Learning completed!

[CTR] Evaluation started!


HBox(children=(FloatProgress(value=0.0, description='Rating', max=191113.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Ranking', max=168775.0, style=ProgressStyle(description_w…



TEST:
...
    |   RMSE | NCRR@20 | NCRR@50 | NDCG@20 | NDCG@50 | Recall@20 | Recall@50 | Train (s) |  Test (s)
--- + ------ + ------- + ------- + ------- + ------- + --------- + --------- + --------- + ---------
CTR | 2.7584 |  0.0814 |  0.0825 |  0.1028 |  0.1095 |    0.1725 |    0.2046 |  367.0597 | 1722.0537



2. Running CTR model on review text instead of item description

In [15]:
# Variant 2: same CTR settings as the baseline, but user review text (keyed by the
# reviewed item's asin) replaces the item descriptions as the text corpus.
ctr = CTR(k=10, max_iter=5, seed=SEED, verbose=VERBOSE)

# Interaction splits and the item-side corpus with its ids.
train_data, test_data = reviews_dataset_train, reviews_dataset_test_seen
docs, item_ids = review_text, review_ids_item

item_text_modality = TextModality(
    corpus=docs,
    ids=item_ids,
    tokenizer=BaseTokenizer(sep=" ", stop_words="english"),
    max_vocab=8000,
    max_doc_freq=0.8,
)

# Evaluate on the pre-made train / test split.
eval_method = BaseMethod.from_splits(
    train_data=train_data,
    test_data=test_data,
    item_text=item_text_modality,
    exclude_unknowns=True,
    seed=SEED,
    verbose=VERBOSE,
)

cornac.Experiment(
    eval_method=eval_method,
    models=[ctr],
    metrics=eval_metrics,
).run()

rating_threshold = 1.0
exclude_unknowns = True




---
Training data:
Number of users = 505047
Number of items = 42362
Number of ratings = 1053169
Max rating = 5.0
Min rating = 1.0
Global mean = 4.1




---
Test data:
Number of users = 168775
Number of items = 27914
Number of ratings = 191113
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 505047
Total items = 42362

[CTR] Training started!


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Learning completed!

[CTR] Evaluation started!


HBox(children=(FloatProgress(value=0.0, description='Rating', max=191113.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Ranking', max=168775.0, style=ProgressStyle(description_w…



TEST:
...
    |   RMSE | NCRR@20 | NCRR@50 | NDCG@20 | NDCG@50 | Recall@20 | Recall@50 | Train (s) |  Test (s)
--- + ------ + ------- + ------- + ------- + ------- + --------- + --------- + --------- + ---------
CTR | 2.7584 |  0.0814 |  0.0825 |  0.1028 |  0.1095 |    0.1725 |    0.2046 |  346.3319 | 1723.0263



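3. Using keywords extracted from the item descriptions by a pre-trained NLP model as the corpus, instead of the raw descriptions (the keywords are still tokenized with Cornac's built-in tokenizer)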

In [16]:
# Variant 3: same CTR settings as the baseline, but the `description_keywords`
# metadata column replaces the raw item descriptions as the text corpus.
ctr = CTR(k=10, max_iter=5, seed=SEED, verbose=VERBOSE)

# Interaction splits and the item-side corpus with its ids.
train_data, test_data = reviews_dataset_train, reviews_dataset_test_seen
docs, item_ids = meta_description_kw, meta_ids

item_text_modality = TextModality(
    corpus=docs,
    ids=item_ids,
    tokenizer=BaseTokenizer(sep=" ", stop_words="english"),
    max_vocab=8000,
    max_doc_freq=0.8,
)

# Evaluate on the pre-made train / test split.
eval_method = BaseMethod.from_splits(
    train_data=train_data,
    test_data=test_data,
    item_text=item_text_modality,
    exclude_unknowns=True,
    seed=SEED,
    verbose=VERBOSE,
)

cornac.Experiment(
    eval_method=eval_method,
    models=[ctr],
    metrics=eval_metrics,
).run()

rating_threshold = 1.0
exclude_unknowns = True




---
Training data:
Number of users = 505047
Number of items = 42362
Number of ratings = 1053169
Max rating = 5.0
Min rating = 1.0
Global mean = 4.1




---
Test data:
Number of users = 168775
Number of items = 27914
Number of ratings = 191113
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 505047
Total items = 42362

[CTR] Training started!


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Learning completed!

[CTR] Evaluation started!


HBox(children=(FloatProgress(value=0.0, description='Rating', max=191113.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Ranking', max=168775.0, style=ProgressStyle(description_w…



TEST:
...
    |   RMSE | NCRR@20 | NCRR@50 | NDCG@20 | NDCG@50 | Recall@20 | Recall@50 | Train (s) |  Test (s)
--- + ------ + ------- + ------- + ------- + ------- + --------- + --------- + --------- + ---------
CTR | 2.7584 |  0.0814 |  0.0825 |  0.1028 |  0.1095 |    0.1725 |    0.2046 |  345.6827 | 1745.0642

