In [8]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
import pickle

In [2]:
'''
https://medium.com/@r.kosse/building-a-embedding-recommender-system-with-python-and-lightfm-e18b3df16e88
'''

'\nhttps://medium.com/@r.kosse/building-a-embedding-recommender-system-with-python-and-lightfm-e18b3df16e88\n'

In [3]:
reviews = pd.read_json("mard/mard_reviews.json" , lines=True)
# want to convert to form: user | item | rating | timestamp
reviews=reviews.drop(['helpful', 'summary','reviewTime', 'reviewerName'],axis=1)
reviews=reviews[["reviewerID", 'amazon-id', "overall" ,"unixReviewTime", 'reviewText']]


new_column_names = {'reviewerID': 'user', 'amazon-id': 'item', 'overall': 'rating', "unixReviewTime": "timestamp", "reviewText": "text"}
reviews.rename(columns=new_column_names, inplace=True)
reviews.head()

Unnamed: 0,user,item,rating,timestamp,text
0,A1OFY4ATO7D13W,26197898,5,1355702400,Buy this album. Now. Don't worry about the re...
1,A2KH83L1F70QR8,26197898,5,1358121600,The Sudden Passion did a great job with this o...
2,A1KGXC7IRLVJR3,615205399,5,1214438400,I received this CD as a gift a few weeks ago f...
3,A1BT6LQ9NY6RO3,615205399,5,1214352000,I am a beginner and have tried a couple of med...
4,A206OKO2FE2IPL,615205399,5,1229212800,This is coming from a person that didn't belie...


In [14]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


def parse_sent(review, tokenizer, remove_stopwords=False):
    '''
    Parse text into sentences
    '''
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(clean_text(raw_sentence))
            
    return sentences


In [19]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

def clean_text(text):
    '''
    Cleaning pipeline:
    - lowercase
    - remove punctuation
    - remove unnecessary whitespace
    - urls
    - remove non characters
    - remove stop words
    - lemmatization
    '''
    text = text.lower()
    
    # Removing punctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    
    # remove unnecessary whitespace
    text = re.sub("\s+"," ",text)
    
    # remove urls
    pattern = re.compile(r'https?://\S+|www\.\S+')
    text = pattern.sub('', text)
    
    # remove non-character
    text = re.sub("[^a-zA-Z]", " ", text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Removing stop words
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Joining tokens back into a sentence
    cleaned_text = " ".join(tokens)
    
    return cleaned_text

def get_review_embedding(review_text, model, embed_len=300):
    '''
    get the embedding for an individual review
    returns the mean word 2 vec score applied o each word
    '''
    
    text = clean_text(review_text)
    words = [word for word in word_tokenize(text) if word in model.wv]
    
    if len(words) == 0: 
        return np.array([]) # array of size 0
    
    word_embeddings = [model.wv[word] for word in words]
    mean_embedding = np.mean(word_embeddings, axis=0)
   
    if len(mean_embedding) != embed_len:
        return np.array([])
    
    return mean_embedding

def get_album_embedding(album_reviews, model):
    '''
    for each review given to an album, 
    return the mean embedding
    '''
    review_embeddings = [get_review_embedding(review, model) for review in album_reviews if len(word_tokenize(review)) >= 1]
    
    review_embeddings = [e for e in review_embeddings if e.size>0]
    
    
    if len(review_embeddings) == 0:
        return None
    
    return np.mean(review_embeddings, axis=0)

    
def generate_embeddings():
    w2v = w2v = Word2Vec.load("w2v_1")
    items = reviews.groupby('item')
    item_embeddings = {}
    n = len(items)
    num_none = 0

    for i, (iid, group) in enumerate(items): # group all reviews by album
        print(f"Processing item {i} of {n}; Num None: {num_none}.", end="\r")
        album_reviews = group['text']
        embedding = get_album_embedding(album_reviews, w2v)
        if embedding is None: 
            num_none += 1
        item_embeddings[iid] = embedding
    return item_embeddings
    
    
    

In [20]:
ie = generate_embeddings()

Processing item 64636 of 64637; Num None: 9.

9

In [23]:
import pickle
with open('item_embeddings.pkl', 'wb') as file:
    pickle.dump(ie, file)

In [9]:
with open('item_embeddings.pkl', 'rb') as file:
    item_embeddings = pickle.load(file)
print(len(item_embeddings))
item_embeddings = {item: e for item, e in item_embeddings.items() if e is not None}
print(len(item_embeddings))

64637
64628


In [10]:
content = pd.read_json("mard/mard_metadata.json", lines=True)
content

Unnamed: 0,price,artist-mbid,imUrl,confidence,categories,release-group-mbid,amazon-id,root-genre,title,artist,label,artist_url,first-release-year,release-mbid,songs,salesRank,related,brand
0,14.23,6ce5815b-b277-48b0-94c9-a87a8f0422b3,http://ecx.images-amazon.com/images/I/6139OLQU...,1.0,"[[CDs & Vinyl, Alternative Rock, Indie & Lo-Fi...",b05be7e9-6bc4-422b-8198-32945b0d1c76,0026197898,Alternative Rock,Southern Fashion,The Sudden Passion,Black Hearts & Pink Parts Records,/s?ie=UTF8&field-artist=The+Sudden+Passion&sea...,2012,359b564b-3db4-44ad-9481-0b45f24aa53e,[{'mbid': '308af531-5fa7-4dd8-969c-6bb7a204eb1...,,,
1,29.98,,http://ecx.images-amazon.com/images/I/51rKlhSe...,,"[[CDs & Vinyl, Alternative Rock, American Alte...",,1902593375,Alternative Rock,Become the Media,Jello Biafra,AK Press,/Jello-Biafra/e/B000APYL8M,,,,{'Music': 902130},,
2,7.18,eb0cd72f-0235-47ec-9d0b-f244a2afc3b7,http://ecx.images-amazon.com/images/I/51x8skEG...,1.0,"[[CDs & Vinyl, Alternative Rock]]",bc2acb6a-4998-3cd3-97e2-61025ed8a562,630251228X,Alternative Rock,Exile,Geoffrey Oryema,emi france,/Geoffrey-Oryema/e/B000APWJ7M,1990,2bbfcc1d-d1a3-460c-bb45-ca1b2b48c8c4,[{'mbid': '2995847a-95ce-4182-aebd-92499e98f73...,{'Music': 530408},"{'also_bought': ['B000000HOX', 'B008B2IJ02', '...",
3,16.18,,http://ecx.images-amazon.com/images/I/217zrfsw...,,"[[CDs & Vinyl, Alternative Rock]]",,B00000007O,Alternative Rock,Stories of the New West,Kent 3,Super Electro,/Kent-3/e/B000APW5GM,,,,{'Music': 776497},,
4,,b0a16d0c-3d99-487d-be88-902131ad03d3,http://ecx.images-amazon.com/images/I/51OK3665...,1.0,"[[CDs & Vinyl, Alternative Rock, Indie & Lo-Fi...",410326cf-4a38-35d3-93d9-a63f8fb77b5b,B0000004JL,Alternative Rock,Wesley Willis,Wesley Willis,Fuse Records,/Wesley-Willis/e/B000APWAAI,1995,d22e418c-679d-42c0-991f-7ca12df854e8,[{'mbid': '0dccf187-7b1e-47a0-8b44-1844b9243ee...,{'Music': 844130},"{'also_viewed': ['B00004Y6SW', 'B000002MBE', '...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65561,,,http://ecx.images-amazon.com/images/I/51gUgd65...,,"[[CDs & Vinyl, Folk]]",,B00IDQOHRK,Folk,Letters from the Moon,Eva and the Vagabond Tales,Eva and the Vagabond Tales,/s?ie=UTF8&field-artist=Eva+and+the+Vagabond+T...,,,,{'Music': 1287639},,
65562,,,http://ecx.images-amazon.com/images/I/51scxHVX...,,"[[CDs & Vinyl, Folk]]",,B00JGBB1QQ,Folk,Tales from the Bonus Round,Steve Schalchlin,Bonus Round,/s?ie=UTF8&field-artist=Steve+Schalchlin&searc...,,,,{'Music': 689056},,
65563,,67776697-0ce5-476a-a763-b3d0548bad41,http://ecx.images-amazon.com/images/I/61K-QN%2...,1.0,"[[CDs & Vinyl, Folk, Contemporary Folk]]",ec1679c5-f305-3417-8a5f-22b53daac37c,B00JH1E0K4,Folk,Echo,After Jack,Travianna Records,/s?ie=UTF8&field-artist=After+Jack&search-alia...,,8f9c7963-d86d-4f92-88d7-989448bc2aee,[{'mbid': '276034ea-e4ba-433f-bfde-c1c03753009...,{'Music': 313501},"{'also_viewed': ['B00IGL2O9K', 'B00IMUXK2K', '...",
65564,,,http://ecx.images-amazon.com/images/I/51TtZJ6-...,,"[[CDs & Vinyl, Folk]]",,B00JJCCQTI,Folk,Let's Move the World,Lila Garrett & Keaton Simons,Lila Garrett & Keaton Simons,/s?ie=UTF8&field-artist=Lila+Garrett+%26+Keato...,,,,,,


In [23]:
def euclidean_distance(embedding1, embedding2):
    return np.linalg.norm(embedding1 - embedding2)

def find_top_n_closest_items(target_item, embeddings_dict, n=10):
    distances = {}
    target_embedding = embeddings_dict.get(target_item)

    if target_embedding is None:
        return f"Embedding not found for item: {target_item}"

    for item, embedding in embeddings_dict.items():
        if item != target_item:
            distance = euclidean_distance(target_embedding, embedding)
            distances[item] = distance

    # Sort items based on distances in ascending order
    sorted_items = sorted(distances.items(), key=lambda x: x[1])

    # Return the top n closest items
    return sorted_items[:n]

In [26]:
def get_closest(item):
    item_info = content[content['amazon-id'] == item][['artist', 'title', 'root-genre']].values[0]

    top_n_closest_items = find_top_n_closest_items(item, item_embeddings)

    print(f"Top 3 closest items to '{item_info}':")
    for i, (item, distance) in enumerate(top_n_closest_items):
        info = content[content['amazon-id'] == item][['artist', 'title', 'root-genre']].values[0]
        print(f"{i+1}. {info} {distance}")


In [27]:
slayer = "B000006ZYC"
get_closest(slayer)

Top 3 closest items to '['Slayer' 'Live: Decade Of Aggression' 'Metal']':
1. ['Sacred Warrior' 'Live at Cornerstone 2001' 'Metal'] 2.151486873626709
2. ['Mayhem' 'Mediolanum Capta Est' 'Metal'] 2.2158005237579346
3. ['Black Sabbath' 'Live at Last' 'Metal'] 2.3488423824310303
4. ['Sparta' 'Live At Zona Rosa 3.19.04' 'Alternative Rock'] 2.4381980895996094
5. ['Budgie' 'Heavier Than Air: Live at BBC' 'Metal'] 2.4512858390808105
6. ["Spock's Beard" 'Live at the Whisky/Nearfest' 'Rock'] 2.4569091796875
7. ['George Clinton' "Live...and Kickin'" 'R&B'] 2.4826107025146484
8. ['N.O.T.A.' 'Give Em Enough Dope' 'Alternative Rock'] 2.4835610389709473
9. ['Iron Maiden' 'Beast Over Hammersmith' 'Metal'] 2.4838857650756836
10. ['Emerson, Lake & Palmer'
 'Welcome Back My Friends to the Show That Never Ends Ladies and Gentlemen'
 'Rock'] 2.489287853240967


In [28]:
chet = 'B00004TA4G'
get_closest(chet)

Top 3 closest items to '['Chet Baker' 'Autumn in New York' 'Jazz']':
1. ['Connie Haines' 'Kiss the Boys Goodbye' 'Pop'] 2.136223554611206
2. ['Glenn Miller'
 'Glenn Miller in Hollywood: Sun Valley Serenade & Orchestra Wives - Music From the Original Soundtracks'
 'Jazz'] 2.2679011821746826
3. ['Spike Jones' 'Musical Madness' 'Jazz'] 2.3578929901123047
4. [nan nan 'Pop'] 2.3986496925354004
5. ['Fats Waller' 'Here Tis' 'Jazz'] 2.4072611331939697
6. ['Pete Seeger' 'Folk Songs' 'Folk'] 2.409882068634033
7. ['Marie' 'Integrale Pathe 1970 - 1975' 'Pop'] 2.410788059234619
8. ['Tony Martin' 'The Best of Tony Martin: The Mercury Years' 'Pop'] 2.4423179626464844
9. ['Dave Brubeck' 'Bossa Nova U.S.A. ' 'Jazz'] 2.4443013668060303
10. ['Fats Waller' 'Handful of Fats' 'Jazz'] 2.451003313064575


In [29]:
queen = "B000E6GCX4"
get_closest(queen)

Top 3 closest items to '['Queen' 'Greatest Hits' 'Rock']':
1. ['Hollies' 'Russian Roulette' 'Pop'] 2.2795207500457764
2. ['David Hasselhoff' 'Magic Collection: David Hasselhoff' 'Pop'] 2.3149170875549316
3. ['V/A' "70's Mania" 'Rock'] 2.3160479068756104
4. ['Evelyn Champagne King' 'So Romantic' 'Dance & Electronic'] 2.353106737136841
5. ['Various Artists' 'Music of the Year: 1978' 'R&B'] 2.3551247119903564
6. ['Icehouse' 'Primitive Man' 'Alternative Rock'] 2.358474016189575
7. ['Bee Gees' 'Children of the World' 'Dance & Electronic'] 2.368941307067871
8. ['Echo & The Bunnymen' 'Crocodiles' 'Alternative Rock'] 2.3692543506622314
9. ['Import' 'Vol. 1-N.W.O.B.H.M. Rarities' 'Metal'] 2.3743627071380615
10. ['Arabesque' 'Best of V.1' 'Dance & Electronic'] 2.379255533218384


In [30]:
skynyrd = "B00004RCW1"
get_closest(skynyrd)

Top 3 closest items to '['Lynyrd Skynyrd' 'Lynyrd Skynyrd - All Time Greatest Hits' 'Rock']':
1. ['Waylon Jennings' 'Ultimate Waylon Jennings' 'Country'] 1.2009938955307007
2. ['Various' 'Monster Ballads - Platinum Edition, 2 Disc Set' 'Jazz'] 1.2048357725143433
3. [nan nan 'Rock'] 1.2303394079208374
4. ['C. W. McCall' 'Wolf Creek Pass' 'Country'] 1.263244390487671
5. ['Westlife' 'Westlife - Unbreakable: Greatest Hits 1' 'Pop'] 1.2739216089248657
6. ['Cher' "If I Could Turn Back Time: Cher's Greatest Hits" 'Pop'] 1.2954102754592896
7. ['Garth Brooks' 'Garth Brooks' 'Country'] 1.3180370330810547
8. ['Freddie Jackson' 'Greatest Hits' 'R&B'] 1.3234996795654297
9. ['Merle Travis' 'In Boston 1959' 'Country'] 1.3396061658859253
10. ['Loose Ends' 'Best of Loose Ends' 'R&B'] 1.354641079902649


In [32]:
maroon = "B000001G9O"
get_closest(maroon)

Top 3 closest items to '['Bela Bartok'
 'Bela Bartok: The 6 String Quartets - Emerson String Quartet' 'Classical']':
1. ['Arnold Schoenberg' 'Schoenberg: Complete String Quartets' 'Classical'] 1.094196081161499
2. ['Franz Schubert' 'Schubert: The Late String Quartets' 'Classical'] 1.1819310188293457
3. ['Voces String Quartet' 'Arriaga: Complete String Quartets' 'Classical'] 1.2311424016952515
4. ['Ludwig van Beethoven' 'Beethoven: Complete String Quartets' 'Classical'] 1.296218991279602
5. ['Ludwig van Beethoven' 'Beethoven: Complete String Quartets' 'Classical'] 1.2991857528686523
6. ['Ludwig Van Beethoven' 'Beethoven: The Complete String Quartets'
 'Classical'] 1.3009204864501953
7. ['Dmitri Shostakovich'
 'Shostakovich: Complete String Quartets , Piano Quintet' 'Classical'] 1.330405354499817
8. ['Antonin Dvorak' 'Dvorak: String Quartets Opp. 51 And 105' 'Classical'] 1.332666039466858
9. ['Wolfgang A. Mozart'
 'Mozart: String Quartets Nos. 21 - 23 , K. 575, 589, 590' 'Classical'] 1.3