# Basic recommender system

This is an example of a basic recommender system.

Load the required libraries:

In [1]:
import html
import numbers
import re
import warnings

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

Define some functions for use later in the code:

In [2]:
# apply a text cleaning function to column reviewText
def clean(text):
    # convert html escapes to characters
    text = html.unescape(text)
    # tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text)
    # markdown URLs
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text)
    # text in code or brackets
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # standalone sequences of specials
    text = re.sub(r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' ', text)
    # standalone sequences of hyphens
    text = re.sub(r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' ', text)
    # sequences of whitespaces
    text = re.sub(r'\s+', ' ', text)
    # make lower case
    text = text.lower()
    
    return text.strip()

# define the recommender system process
def get_recommendations(title, cosine_sim, indices):
        global res
        # Get the index of the movie that matches the title
        idx = indices[t]
        # Get the pairwise similarity scores
        sim_scores = list(enumerate(cosine_sim[idx]))
        # Sort the movies based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        # Get the scores for 10 most similar movies
        sim_scores = sim_scores[0:11]
        # Get the movie indices
        movie_indices = [i[0] for i in sim_scores]
        # Return the top 10 most similar movies
        res = data['reviewText'].iloc[movie_indices]
        res = res.to_frame()
        return res

Load the data, clean the review text, and vectorise it with TF-IDF:

In [3]:
data = pd.read_json("C:\\Users\\kelvi\\Desktop\\reviews_Musical_Instruments_5.json.gz", lines=True)

# select the relevant categories for text and how we want to try and train NLP ml
data = data[['reviewText', 'overall']]

# clean the text data
data['reviewText'] = data['reviewText'].apply(clean)

# get indices
indices = pd.Series(data.index, index=data['reviewText']).drop_duplicates()

movie_plots = data['reviewText']

# use tf-idf vectoriser
tfidf = TfidfVectorizer(stop_words='english')

# Construct the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(movie_plots)

# Generate the cosine similarity matrix - linear_kernel or cosine_similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix) # weight given to word occurences
#cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix) # irrespective of word occurences

Next, choose a phrase to search for in the reviews, take the first review that contains it, and retrieve the top 10 reviews most similar to that review:

In [7]:
chkr = "great guitar" # the phrase to search for
chk = len(chkr)

t = data['reviewText'].str.contains(chkr) # find if reviews contain this phrase - TRUE/FALSE
t = t.to_frame() # make it a dataframe
t = t[t['reviewText']==True] # filter to only those that are TRUE
t = t.index[0] # return/keep only the first TRUE result

# Generate recommendations
print(" linked to: "+chkr)
print(get_recommendations("money", cosine_sim, indices))

 linked to: great guitar
                                             reviewText
317   strings are not just strings. guitar strings a...
3278  these are the best guitar strings i have used ...
316   martin makes great sounding strings. i decided...
2876                                       good strings
3302                                  very good strings
3840                                       good strings
3250  martin strings, just wish i could afford a mar...
588   .i bought a ukulele to give as a present. i kn...
1422  these acoustic strings are used on a laguna ac...
2956                                  best strings ever
4122  these acoustic strings are used on a laguna ac...
