In [1]:
import gzip
import json
from typing import Any, Iterator

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer

# Table of Contents

- [Step A.1](#Step-A.1)
- [Step A.2](#Step-A.2)
- [Step A.3](#Step-A.3)

In [2]:
# download the “small” 5-core dataset for the category "Digital Music"
# dataset source: https://nijianmo.github.io/amazon/index.html

!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Digital_Music_5.json.gz -P data/

--2022-02-10 20:51:39--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Digital_Music_5.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19408584 (19M) [application/octet-stream]
Saving to: ‘data/Digital_Music_5.json.gz’


2022-02-10 20:51:53 (1.28 MB/s) - ‘data/Digital_Music_5.json.gz’ saved [19408584/19408584]



In [3]:
# download the metadata for this dataset
# dataset source: https://nijianmo.github.io/amazon/index.html

!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_Digital_Music.json.gz -P data/

--2022-02-10 20:51:54--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_Digital_Music.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12367273 (12M) [application/octet-stream]
Saving to: ‘data/meta_Digital_Music.json.gz’


2022-02-10 20:52:03 (1.30 MB/s) - ‘data/meta_Digital_Music.json.gz’ saved [12367273/12367273]



## Step A.1

The 5-core dataset for the category "Digital Music" subset of the [Amazon Review data](https://nijianmo.github.io/amazon/index.html) in which all users and items have at least 5 reviews.

The format is one-review-per-line in JSON, with the following attributes:

- `reviewerID` - ID of the reviewer, e.g. A2SUAM1J3GNN3B
- `asin` - ID of the product, e.g. 0000013714
- `reviewerName` - name of the reviewer
- `vote` - helpful votes of the review
- `style` - a disctionary of the product metadata, e.g., "Format" is "Hardcover"
- `reviewText` - text of the review
- `overall` - rating of the product
- `summary` - summary of the review
- `verified`- whether the review has been verified (boolean)
- `unixReviewTime` - time of the review (unix time)
- `reviewTime` - time of the review (raw)
- `image` - images that users post after they have received the product

Metadata includes descriptions, price, sales-rank, brand info, and co-purchasing links:

- `asin` - ID of the product, e.g. 0000031852
- `title` - name of the product
- `feature` - bullet-point format features of the product
- `description` - description of the product
- `price` - price in US dollars (at time of crawl)
- `imageURL` - url of the product image
- `imageURLHighRes` - url of the high resolution product image
- `related` - related products (also bought, also viewed, bought together, buy after viewing)
- `salesRank` - sales rank information
- `brand` - brand name
- `categories` - list of categories the product belongs to
- `tech1` - the first technical detail table of the product
- `tech2` - the second technical detail table of the product
- `similar_item` - similar product table
- $\dots$

In [4]:
def inspect_df(df: pd.DataFrame, n: int = 5) -> pd.DataFrame:
    """Helper method to easily inspect DataFrames."""

    print(f"shape: {df.shape}")

    return df.head(n)

In [5]:
def parse(filepath: str) -> Iterator[dict]:
    file_obj = gzip.open(filepath, "rb")
    for line in file_obj:
        yield json.loads(line)

In [6]:
def file_to_dataframe(filepath: str) -> pd.DataFrame:
    i = 0
    df = {}
    for d in parse(filepath):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient="index")

In [7]:
review_data = file_to_dataframe("data/Digital_Music_5.json.gz")

inspect_df(review_data)

shape: (169781, 12)


Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,5.0,3.0,True,"06 3, 2013",A2TYZ821XXK2YZ,3426958910,{'Format:': ' Audio CD'},Garrett,"This is awesome to listen to, A must-have for ...",Slayer Rules!,1370217600,
1,5.0,,True,"10 11, 2014",A3OFSREZADFUDY,3426958910,{'Format:': ' Audio CD'},Ad,bien,Five Stars,1412985600,
2,5.0,,True,"02 11, 2014",A2VAMODP8M77NG,3426958910,{'Format:': ' Audio CD'},JTGabq,It was great to hear the old stuff again and I...,SLAYER!!!!!!!!!!!!!!!!!!!!!,1392076800,
3,4.0,3.0,False,"12 7, 2013",AAKSLZ9IDTEH0,3426958910,{'Format:': ' Audio CD'},john F&#039;n doe,well best of's are a bit poison normally but t...,slayer greatest hits! you mean everything righ...,1386374400,
4,5.0,,True,"06 12, 2016",A3OH43OZJLKI09,5557706259,{'Format:': ' Audio CD'},melinda a goodman,What can I say? This is Casting Crowns!!!This ...,"This is a good, blessing filled",1465689600,


In [8]:
list(review_data.columns)

['overall',
 'vote',
 'verified',
 'reviewTime',
 'reviewerID',
 'asin',
 'style',
 'reviewerName',
 'reviewText',
 'summary',
 'unixReviewTime',
 'image']

In [9]:
review_data.loc[2]

overall                                                         5.0
vote                                                            NaN
verified                                                       True
reviewTime                                              02 11, 2014
reviewerID                                           A2VAMODP8M77NG
asin                                                     3426958910
style                                      {'Format:': ' Audio CD'}
reviewerName                                                 JTGabq
reviewText        It was great to hear the old stuff again and I...
summary                                 SLAYER!!!!!!!!!!!!!!!!!!!!!
unixReviewTime                                           1392076800
image                                                           NaN
Name: 2, dtype: object

In [10]:
metadata = file_to_dataframe("data/meta_Digital_Music.json.gz")

inspect_df(metadata)

shape: (74347, 19)


Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,[],,[],,Master Collection Volume One,"[B000002UEN, B000008LD5, B01J804JKE, 747403435...",,John Michael Talbot,[],"58,291 in CDs & Vinyl (","[B000002UEN, B000008LD5, 7474034352, B000008LD...","<img src=""https://images-na.ssl-images-amazon....",,,$18.99,1377647,[],[],
1,[],,[],,Hymns Collection: Hymns 1 &amp; 2,"[5558154950, B00014K5V4]",,Second Chapter of Acts,[],"93,164 in CDs & Vinyl (","[B000008KJ3, B000008KJ0, 5558154950, B000UN8KZ...","<img src=""https://images-na.ssl-images-amazon....",,,,1529145,[],[],
2,[],,[],,Early Works - Don Francisco,"[B00004RC05, B003H8F4NA, B003ZFVHPO, B003JMP1Z...",,Don Francisco,[],"875,825 in CDs & Vinyl (","[B003H8F4NA, B003ZFVHPO, B003JMP1ZK, B00004RC0...","<img src=""https://images-na.ssl-images-amazon....",,,,1527134,[],[],
3,[],,[],,So You Wanna Go Back to Egypt,"[B0000275QQ, 0001393774, 0001388312, B0016CP2G...",,Keith Green,[],"203,263 in CDs & Vinyl (","[B00000I7JO, B0016CP2GS, 0001393774, B0000275Q...","<img src=""https://images-na.ssl-images-amazon....",,,$13.01,1388703,[],[],
4,[],,[1. Losing Game 2. I Can't Wait 3. Didn't He S...,,Early Works - Dallas Holm,"[B0002N4JP2, 0760131694, B00002EQ79, B00150K8J...",,Dallas Holm,[],"399,269 in CDs & Vinyl (","[B0002N4JP2, 0760131694, B00150K8JC, B003MTXNV...","<img src=""https://images-na.ssl-images-amazon....",,,,1526146,[],[],


In [11]:
list(metadata.columns)

['category',
 'tech1',
 'description',
 'fit',
 'title',
 'also_buy',
 'tech2',
 'brand',
 'feature',
 'rank',
 'also_view',
 'main_cat',
 'similar_item',
 'date',
 'price',
 'asin',
 'imageURL',
 'imageURLHighRes',
 'details']

In [12]:
metadata[metadata["asin"] == review_data.loc[2]["asin"]]

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
343,[],,[],,Slayer - Greatest Hits 2 CD Set,[],,Slayer,[],"231,252 in CDs &amp; Vinyl (","[B00EBJ62GC, B00BV0W7ZE, B000062YAY, B00EBWHW5...","<img src=""https://images-na.ssl-images-amazon....",,,,3426958910,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,


In [13]:
# for the content-based RecSys, we need both the review rating & title, description attrs - so an inner join
data = pd.merge(review_data, metadata, how="inner", on="asin")

inspect_df(data)

shape: (2431, 30)


Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,...,feature,rank,also_view,main_cat,similar_item,date,price,imageURL,imageURLHighRes,details
0,5.0,3.0,True,"06 3, 2013",A2TYZ821XXK2YZ,3426958910,{'Format:': ' Audio CD'},Garrett,"This is awesome to listen to, A must-have for ...",Slayer Rules!,...,[],"231,252 in CDs &amp; Vinyl (","[B00EBJ62GC, B00BV0W7ZE, B000062YAY, B00EBWHW5...","<img src=""https://images-na.ssl-images-amazon....",,,,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
1,5.0,,True,"10 11, 2014",A3OFSREZADFUDY,3426958910,{'Format:': ' Audio CD'},Ad,bien,Five Stars,...,[],"231,252 in CDs &amp; Vinyl (","[B00EBJ62GC, B00BV0W7ZE, B000062YAY, B00EBWHW5...","<img src=""https://images-na.ssl-images-amazon....",,,,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
2,5.0,,True,"02 11, 2014",A2VAMODP8M77NG,3426958910,{'Format:': ' Audio CD'},JTGabq,It was great to hear the old stuff again and I...,SLAYER!!!!!!!!!!!!!!!!!!!!!,...,[],"231,252 in CDs &amp; Vinyl (","[B00EBJ62GC, B00BV0W7ZE, B000062YAY, B00EBWHW5...","<img src=""https://images-na.ssl-images-amazon....",,,,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
3,4.0,3.0,False,"12 7, 2013",AAKSLZ9IDTEH0,3426958910,{'Format:': ' Audio CD'},john F&#039;n doe,well best of's are a bit poison normally but t...,slayer greatest hits! you mean everything righ...,...,[],"231,252 in CDs &amp; Vinyl (","[B00EBJ62GC, B00BV0W7ZE, B000062YAY, B00EBWHW5...","<img src=""https://images-na.ssl-images-amazon....",,,,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
4,5.0,,True,"01 7, 2015",A3HUD6U7RWX8E8,3426958910,{'Format:': ' Audio CD'},Kevin Ross,Excellent.,Five Stars,...,[],"231,252 in CDs &amp; Vinyl (","[B00EBJ62GC, B00BV0W7ZE, B000062YAY, B00EBWHW5...","<img src=""https://images-na.ssl-images-amazon....",,,,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,


## Step A.2

All users (aka reviewers) and items (aka products) have at least 5 reviews.

Our objective is to construct “item profiles” for the items, based on information available on their metadata.

In [14]:
content = data.copy()

In [15]:
content["title"].map(lambda x: isinstance(x, str)).value_counts()

True    2431
Name: title, dtype: int64

In [16]:
content["description"].map(lambda x: isinstance(x, list)).value_counts()

True    2431
Name: description, dtype: int64

In [17]:
content["description"]

0                []
1                []
2                []
3                []
4                []
           ...     
2426    [Brand new]
2427    [Brand new]
2428    [Brand new]
2429    [Brand new]
2430    [Brand new]
Name: description, Length: 2431, dtype: object

In [18]:
content[content["description"].map(len) > 1]["description"]

15                                          [Book by, , ]
16                                          [Book by, , ]
17                                          [Book by, , ]
18                                          [Book by, , ]
19                                          [Book by, , ]
                              ...                        
1489    [2 complete shows, once missing reels filled w...
1490    [2 complete shows, once missing reels filled w...
1491    [2 complete shows, once missing reels filled w...
1492    [2 complete shows, once missing reels filled w...
1493    [2 complete shows, once missing reels filled w...
Name: description, Length: 202, dtype: object

In [19]:
content.loc[1490]["description"]

['2 complete shows, once missing reels filled with never-before-heard material, a previously unheard and quite majestic 1969 "Dark Star," a first-ever version of "New Speedway Boogie," a second-ever version of "Mason\'s Children". The first major return of material to the vault since 2005\'s Houseboat Tapes, <i>Dave\'s Picks Volume 6</i> features 2 complete shows with never-before-heard material from 2/2/70 Fox Theater, St. Louis, MO and 12/20/69 Fillmore Auditorium, San Francisco, CA. Long a mystery, the riddle of the 2/2/70 set list has now been solved and the music restored to its proper - and spectacular - sequence. Pigpen fiends, ask and you shall receive. Both shows feature loads of Pigpen including a major 35+ minute "Lovelight" from 12/20/69. Recorded by Bear with an excellent mix and sound quality, this very special release feels like a return to the classics.',
 '<b>Track listing:</b>',
 '<b>Disc 1:</b>',
 '<i>Fox Theatre, St. Louis, MO 2/2/70:</i>',
 '1. Casey Jones',
 '2. M

In [20]:
def concatenate_list_field(field: list) -> str:

    if not isinstance(field, list):
        return TypeError

    return " ".join(field)

In [21]:
content["description"] = content["description"].map(concatenate_list_field)

In [22]:
transformer = FeatureUnion(
    [
        (
            "title_tfidf",
            Pipeline(
                [
                    (
                        "extract_field",
                        FunctionTransformer(lambda x: x["title"], validate=False),
                    ),
                    ("tfidf", TfidfVectorizer()),
                ]
            ),
        ),
        (
            "description_tfidf",
            Pipeline(
                [
                    (
                        "extract_field",
                        FunctionTransformer(lambda x: x["description"], validate=False),
                    ),
                    ("tfidf", TfidfVectorizer()),
                ]
            ),
        ),
    ]
)

In [23]:
transformer.fit(content)

FeatureUnion(transformer_list=[('title_tfidf',
                                Pipeline(steps=[('extract_field',
                                                 FunctionTransformer(func=<function <lambda> at 0x7f8264d78820>)),
                                                ('tfidf', TfidfVectorizer())])),
                               ('description_tfidf',
                                Pipeline(steps=[('extract_field',
                                                 FunctionTransformer(func=<function <lambda> at 0x7f8264d78790>)),
                                                ('tfidf',
                                                 TfidfVectorizer())]))])

In [24]:
title_vocab = transformer.transformer_list[0][1].steps[1][1].get_feature_names_out()
description_vocab = (
    transformer.transformer_list[1][1].steps[1][1].get_feature_names_out()
)

In [25]:
title_vocab[:30]

array(['02', '04', '05', '09', '10', '11', '11086', '12', '13', '17',
       '1953', '1954', '1955', '1956', '1957', '1960', '1961', '1978',
       '1983', '1986', '1987', '2002', '2003', '2009', '2010', '2011',
       '2012', '2013', '2014', '2015'], dtype=object)

In [26]:
description_vocab[:30]

array(['00', '000', '01', '02', '03', '039', '04', '05', '06', '07', '08',
       '09', '10', '100', '101', '11', '1143', '12', '12060', '13', '14',
       '15', '16', '17', '18', '19', '1960', '1964', '1967', '1969'],
      dtype=object)

In [27]:
content_tfidf = transformer.transform(content).toarray()

content_tfidf.shape

(2431, 3093)

## Step A.3

In [28]:
# Note that the tf-idf functionality in sklearn.feature_extraction.text can produce normalized vectors,
# in which case cosine_similarity is equivalent to linear_kernel, only slower.

cosine_sims = linear_kernel(content_tfidf, content_tfidf)

cosine_sims.shape

(2431, 2431)

In [29]:
type(cosine_sims)

numpy.ndarray

In [30]:
def get_recommendations(
    df: pd.DataFrame,
    column: str,
    value: str,
    cosine_similarities: np.ndarray,
    limit: int = 10,
) -> pd.DataFrame:
    """Return a dataframe of content recommendations based on TF-IDF cosine similarity.

    Args:
        df (object): Pandas dataframe containing the text data.
        column (string): Name of column used, i.e. 'title'.
        value (string): Name of title to get recommendations for.
        cosine_similarities (array): matrix with cosine similarities.
        limit (int, optional): Optional limit on number of recommendations to return.

    Returns:
        Pandas dataframe.
    """

    # Return indices for the target dataframe column and drop any duplicates
    indices = pd.Series(df.index, index=df[column]).drop_duplicates()

    # Get the index for the target value
    target_index = indices[value]

    # Get the cosine similarity scores for the target value
    cosine_similarity_scores = list(enumerate(cosine_similarities[target_index]))

    # Sort the cosine similarities in order of closest similarity
    cosine_similarity_scores = sorted(cosine_similarity_scores, key=lambda x: x[1], reverse=True)

    # Return tuple of the requested closest scores excluding the target item and index
    cosine_similarity_scores = cosine_similarity_scores[1 : limit + 1]

    # Extract the tuple values
    index = (x[0] for x in cosine_similarity_scores)
    scores = (x[1] for x in cosine_similarity_scores)

    # Get the indices for the closest items
    recommendation_indices = [i[0] for i in cosine_similarity_scores]

    # Get the actutal recommendations
    recommendations = df[column].iloc[recommendation_indices]

    # Return a dataframe
    df = pd.DataFrame(
        list(zip(index, recommendations, scores)),
        columns=["index", "recommendation", "cosine_similarity_score"],
    )

    return df

In [31]:
get_recommendations(data, 'title', 'Live At The El Mocambo', cosine_sims)

Unnamed: 0,index,recommendation,cosine_similarity_score
0,1266,AT THE BBC,0.339637
1,1267,AT THE BBC,0.339637
2,1268,AT THE BBC,0.339637
3,1269,AT THE BBC,0.339637
4,1270,AT THE BBC,0.339637
5,1261,Queen - A Night At the Opera - The Prophet's Song,0.221807
6,1262,Queen - A Night At the Opera - The Prophet's Song,0.221807
7,1263,Queen - A Night At the Opera - The Prophet's Song,0.221807
8,1264,Queen - A Night At the Opera - The Prophet's Song,0.221807
9,1265,Queen - A Night At the Opera - The Prophet's Song,0.221807
