# Basic recommender system

This is an example of a basic recommender system.

Load the required libraries:

In [1]:
import html
import re

import joblib
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import normalize

Define some functions for use later in the code:

In [2]:
# text-cleaning helper, applied to the reviewText column further down
def clean(text):
    """Normalise one raw review string.

    The steps run in a fixed order: decode HTML entities, blank out tags,
    reduce markdown links to their label, blank out any remaining
    square-bracketed text, blank out free-standing runs of special
    characters, collapse whitespace, then lower-case and trim.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, lower-cased text without leading/trailing whitespace.
    """
    # (pattern, replacement) pairs; order matters because later rules rely on
    # what earlier ones have already removed.
    substitutions = (
        # tags like <tab>
        (r'<[^<>]*>', ' '),
        # markdown URLs: keep the link label only
        (r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1'),
        # text in code or brackets
        (r'\[[^\[\]]*\]', ' '),
        # standalone sequences of specials
        (r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' '),
        # standalone sequences of hyphens
        (r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' '),
        # sequences of whitespaces
        (r'\s+', ' '),
    )

    # convert html escapes to characters before any pattern is applied
    cleaned = html.unescape(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.lower().strip()

# define the recommender system process
def get_recommendations(title, cosine_sim, indices, top_n=10):
    """Return the reviews most similar to the review matched by ``title``.

    The function is self-contained: it uses only its arguments and neither
    reads nor writes notebook-level variables.

    Parameters
    ----------
    title : str
        Review text or search phrase. A review whose text equals ``title`` is
        preferred; otherwise the first review that contains ``title`` as a
        literal substring (no regex interpretation) is used.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the similarities between
        review ``i`` and every review.
    indices : pandas.Series
        Series whose index holds the review texts, in the same row order as
        ``cosine_sim``, and whose values are the matching row numbers.
    top_n : int, default 10
        Number of similar reviews to return in addition to the matched review.

    Returns
    -------
    pandas.DataFrame
        Columns ``reviewText`` and ``similarity``, sorted by descending
        similarity, with up to ``top_n + 1`` rows. The matched review is
        included, normally as the first row.

    Raises
    ------
    KeyError
        If no review equals or contains ``title``.
    """
    review_texts = indices.index
    # String view used only for matching; the returned texts come from
    # ``review_texts`` unchanged.
    searchable = pd.Series(review_texts.astype(str))

    exact_hits = np.flatnonzero((searchable == title).to_numpy())
    if exact_hits.size:
        position = exact_hits[0]
    else:
        contains_mask = searchable.str.contains(title, regex=False, na=False)
        partial_hits = np.flatnonzero(contains_mask.to_numpy())
        if partial_hits.size == 0:
            raise KeyError(f"no review matches {title!r}")
        position = partial_hits[0]

    # Row of the similarity matrix that belongs to the matched review
    idx = indices.iloc[position]
    scores = np.asarray(cosine_sim[idx], dtype=float)

    # Stable sort on the negated scores: descending similarity, ties kept in
    # their original row order.
    ranked = np.argsort(-scores, kind='stable')[:top_n + 1]

    return pd.DataFrame({
        'reviewText': np.asarray(review_texts, dtype=object)[ranked],
        'similarity': scores[ranked],
    })

Load the data, clean the review text, vectorise it and build the similarity matrix:

In [3]:
# Location of the gzipped JSON-lines review file. This path is specific to one
# machine; point it at your own copy of the file.
DATA_PATH = "C:\\Users\\kelvi\\Desktop\\reviews_Musical_Instruments_5.json.gz"
data = pd.read_json(DATA_PATH, lines=True)

# keep only the review text and the `overall` column; .copy() makes this an
# independent frame so the column assignment below does not trigger
# SettingWithCopyWarning
data = data[['reviewText', 'overall']].copy()

# clean the text data
data['reviewText'] = data['reviewText'].apply(clean)

# map review text -> row label
# NOTE: drop_duplicates() de-duplicates the Series *values* (the row labels),
# not the review texts in its index, so repeated review texts are kept.
indices = pd.Series(data.index, index=data['reviewText']).drop_duplicates()

movie_plots = data['reviewText']

# Vectorise the reviews. Despite the variable name, the active vectoriser is a
# CountVectorizer (raw term counts); swap in the commented line for TF-IDF weights.
tfidf = CountVectorizer(stop_words='english')#, max_features = 300)
#tfidf = TfidfVectorizer(stop_words='english')

# Construct the document-term matrix
tfidf_matrix = tfidf.fit_transform(movie_plots)

# to save space, reduce the matrix to 100 dimensions; the seed is passed to
# the estimator directly so the result is reproducible without touching
# NumPy's global random state
shrunk_matrix = TruncatedSVD(n_components=100, random_state=0).fit_transform(tfidf_matrix)

# normalise the SVD output (each row scaled to unit L2 norm, sklearn's default)
shrunk_norm_matrix = normalize(shrunk_matrix)

# save the document-term matrix
#joblib.dump(tfidf_matrix,"vec.pkl")

# load the document-term matrix
#tfidf_matrix = joblib.load("vec.pkl")

# Generate the cosine similarity matrix - linear_kernel or cosine_similarity
#cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
#cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# cosine sim of shrunk matrix: rows are unit-length, so the dot product of two
# rows is their cosine similarity
cosine_sim = shrunk_norm_matrix @ shrunk_norm_matrix.T

# save cosine sim
#joblib.dump(cosine_sim, "sim_matrix.pkl")

# load cosine sim
#cosine_sim = joblib.load("sim_matrix.pkl")

Next, choose a phrase to search for in the reviews, take the first review that contains it, and retrieve the 10 reviews most similar to that review:

In [4]:
chkr = "great guitar" # the phrase to search for

# Row labels of every review that contains the phrase. regex=False matches the
# phrase literally, so characters such as "+" or "(" in a phrase are not
# interpreted as regular-expression syntax.
contains_phrase = data['reviewText'].str.contains(chkr, regex=False)
matches = data.index[contains_phrase.to_numpy()]
if matches.empty:
    raise ValueError(f"no review contains the phrase {chkr!r}")
t = matches[0] # index label of the first matching review

# Generate recommendations
print(" linked to: "+chkr)
print(get_recommendations(chkr, cosine_sim, indices))

 linked to: great guitar
                                           reviewText  similarity
0   strings are not just strings. guitar strings a...    1.000000
1   not very substantial. it began to tear on my g...    0.879715
2   i bought a guitar and for a while used the str...    0.869009
3   these strings are beyond amazing. switching to...    0.857657
4   i'm primarily a guitar player, but i do have a...    0.853160
5   these strings are a little more money than the...    0.849725
6   i tried these strings on my zager guitar. the ...    0.840344
7   sturdy, well-designed and frets the strings we...    0.837488
8   these are the best guitar strings i have used ...    0.826279
9   i am yet another one of those "forever beginne...    0.820800
10  these are great guitar strings and are all tha...    0.820313
