# Content-Based Recommender

In [12]:
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the article metadata from the sample wiki CSV. The preview displayed by
# this cell shows one row per article with a `title` and an `overview` column.
# low_memory=False makes pandas parse the file in a single pass so that column
# dtypes are inferred consistently instead of chunk by chunk.
metadata = pd.read_csv("../datasets/SamplewikiDB-v1.csv", low_memory = False)

# Display the first three rows as a quick sanity check of the load.
metadata.head(3)

Unnamed: 0,title,overview
0,Toronto Raptors,The Toronto Raptors are a Canadian professiona...
1,National Basketball Association,The National Basketball Association (NBA) is a...
2,Larry O'Brien Trophy,The Larry O'Brien NBA Championship Trophy is t...


In [13]:
def computeCosineSim():
    """Build the pairwise similarity matrix over all article overviews.

    Reads the module-level ``metadata`` DataFrame. Missing ``overview`` values
    are replaced with empty strings and written back to that same frame, after
    which the overviews are turned into TF-IDF vectors (English stop words such
    as 'the' and 'a' are ignored).

    Returns:
        tuple: ``(cosine_sim, metadata)``. ``cosine_sim[i][j]`` is the
        similarity score between the overviews of rows ``i`` and ``j``;
        ``metadata`` is the frame the scores were computed from.
    """
    # The vectorizer needs text in every row, so blank out missing overviews.
    metadata['overview'] = metadata['overview'].fillna('')

    # Fit the vocabulary and build the TF-IDF matrix in one step.
    vectorizer = TfidfVectorizer(stop_words='english')
    overview_vectors = vectorizer.fit_transform(metadata['overview'])

    # TfidfVectorizer L2-normalises each row by default, so a plain dot product
    # already equals the cosine similarity. linear_kernel() computes exactly
    # that dot product and is faster than cosine_similarity().
    return linear_kernel(overview_vectors, overview_vectors), metadata



In [16]:
def getContentBasedRecs(titlesArray=None, top_n=3):
    """Recommend the articles most similar to a set of liked article titles.

    For every liked title, all other articles are ranked by their similarity
    score (as returned by ``computeCosineSim``) against that title's row, and
    the ``top_n`` best matches are collected. The liked article itself is
    excluded explicitly rather than by assuming it always ranks first, which
    does not hold for rows with tied or all-zero scores.

    Args:
        titlesArray: Iterable of article titles the user liked (e.g. gathered
            during onboarding). Defaults to the three sample titles this
            function was originally hard-coded with.
        top_n: Number of recommendations collected per liked title. Defaults
            to 3.

    Returns:
        list[str]: Recommended titles with spaces replaced by underscores,
        grouped in the order of ``titlesArray``. The list may contain repeats
        and may contain titles that are themselves in ``titlesArray``.

    Raises:
        KeyError: If a liked title does not appear in the ``title`` column.
        ValueError: If ``top_n`` is negative.
    """
    if titlesArray is None:
        titlesArray = ['Toronto Raptors', 'Toronto Maple Leafs', 'Tyler, the Creator']
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    cosine_sim, metadata = computeCosineSim()
    all_titles = metadata['title'].tolist()

    # Reverse map: title -> row position in the similarity matrix. Positions
    # (not index labels) are used because cosine_sim is addressed by position.
    # When a title occurs more than once, the first occurrence wins.
    title_to_row = {}
    for row, title in enumerate(all_titles):
        title_to_row.setdefault(title, row)

    similarArticles = []
    for liked_title in titlesArray:
        liked_row = title_to_row[liked_title]
        scores = cosine_sim[liked_row]

        # Rank every other article by descending similarity. sorted() is
        # stable, so articles with equal scores keep their original row order.
        ranked_rows = sorted(
            (row for row in range(len(scores)) if row != liked_row),
            key=lambda row: scores[row],
            reverse=True,
        )

        # Keep the best matches, formatted with underscores instead of spaces.
        similarArticles.extend(
            all_titles[row].replace(' ', '_') for row in ranked_rows[:top_n]
        )

    return similarArticles

# Run the recommender with its default inputs and keep the result in a
# module-level name so later cells can reuse it.
similarArticles = getContentBasedRecs()

# Show the recommended article titles (spaces replaced by underscores).
print(similarArticles)

['Toronto_Maple_Leafs', 'Atlanta_Hawks', 'Boston_Bruins', 'Toronto_Raptors', 'Boston_Bruins', 'Edmonton_Oilers', 'A_Tribe_Called_Quest', 'Run-DMC', 'Jay-z']
