In [5]:
import pandas as pd
import numpy as np
import shutil
import tempfile
import urllib.request
from urllib.parse import quote 
import requests
from bs4 import BeautifulSoup
import sys
from IPython.core.interactiveshell import InteractiveShell
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Lift the pandas / numpy display limits so frames and arrays are printed in full.
pd.set_option("display.max_rows", None, "display.max_columns", None)
np.set_printoptions(threshold=sys.maxsize)


# scrape_df = scrape_df.drop(['query'], axis=1)
resource_catalog = pd.read_csv("resource_catalog_with_FOE.csv", dtype=object)
#resource_catalog['title'] = resource_catalog['title'].str.strip()

# NLP Recommender
#----------------
resource_catalog['title'].head()

# Extract features from the titles:
# compute a Term Frequency-Inverse Document Frequency (TF-IDF) vector for each title.
# Each row of the resulting matrix represents a title, each column a word in the title vocabulary.


# Build the vocabulary and the TF-IDF matrix.
tfidf = TfidfVectorizer(stop_words='english')
# A missing title (NaN) would make the vectorizer raise, so treat it as an empty document.
tfidf_matrix = tfidf.fit_transform(resource_catalog['title'].fillna(''))
tfidf_matrix.shape

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installs.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)
[str(name) for name in feature_names[3000:3010]]


# Compute the cosine similarity matrix.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity.
# NOTE: the result is a dense (n_titles, n_titles) array; at ~25k titles that is
# several GB of RAM if the values are float64.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

cosine_sim.shape
cosine_sim[1]

# Map each title to its row position in the catalog.
# Titles that occur more than once map to several positions.
indices = pd.Series(resource_catalog.index, index=resource_catalog['title'])
indices[:10]

0                       pathology of the head and neck
1    bacterial pathogenesis a subject collection fr...
2    the use of the laryngoscope in diseases of the...
3    oxford american handbook of physical medicine ...
4                                             diogenes
Name: title, dtype: object

(24755, 15738)

['colombo',
 'colonial',
 'colonialism',
 'colonialists',
 'colonies',
 'colonisation',
 'colonisers',
 'colonists',
 'colonized',
 'colonizer']

(24755, 24755)

array([0.        , 1.        , 0.        , 0.08732155, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.07141624,
       0.        , 0.        , 0.        , 0.        , 0.     

title
pathology of the head and neck                                                                  0
bacterial pathogenesis a subject collection from cold spring harbor perspectives in medicine    1
the use of the laryngoscope in diseases of the throat with an appendix on rhinoscopy            2
oxford american handbook of physical medicine and rehabilitation                                3
diogenes                                                                                        4
post colonial studies the key concepts                                                          5
feminist cultural studies                                                                       6
transnational tourism experiences at gallipoli                                                  7
organisational behaviour core concepts and applications                                         8
gm                                                                                              9
dtype: int64

In [6]:
# Function takes in book title as input and outputs the courses associated with the most similar books
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the courses attached to the titles most similar to ``title``.

    Looks up ``title`` in the module-level ``indices`` Series, ranks every
    catalog row by its similarity to that row, and returns the ``course_name``
    values of the best matches from the module-level ``resource_catalog``.
    A header line followed by a dashed underline is printed as a side effect.

    Parameters
    ----------
    title : str
        Title to look up; must be present in ``indices``.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix indexed by catalog row position.
    top_n : int, optional
        Number of similar titles to consider (default 10).

    Returns
    -------
    pandas.Series
        Up to ``top_n`` course names, de-duplicated, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in the catalog.
    """
    if title not in indices.index:
        raise KeyError(f"Title not found in resource catalog: {title!r}")

    # A title that occurs several times in the catalog makes the lookup return
    # a Series instead of a scalar; use the first occurrence as the query row.
    match = indices[title]
    idx = int(match.iloc[0]) if isinstance(match, pd.Series) else int(match)

    # Rank all titles by their similarity to the query row. sorted() is stable,
    # so ties keep catalog order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Drop the query row explicitly rather than assuming it sorts first: with
    # duplicate titles (tied at 1.0) or an all-stop-word title it may not.
    title_indices = [row for row, _ in ranked if row != idx][:top_n]

    print("Suggested courses for", title)
    # Underline the header; no trailing newline, matching the original output.
    print('-' * (len("Suggested courses for ") + len(title)), end="")

    # Several similar titles can belong to the same course, hence the de-duplication.
    return resource_catalog["course_name"].iloc[title_indices].drop_duplicates()



In [7]:
# Example query: courses related to a cultural-studies title.
title = "feminist cultural studies"
get_recommendations(title=title)

Suggested courses for feminist cultural studies
-----------------------------------------------

2502     communications and cultures in the global era
595                                ideas and identity 
347                             advanced criminal law 
2348                            doing cultural studies
281                            communication research 
2790                                 the public sphere
9412            food and drink in contemporary society
15529                                  youth cultures 
21290                        contemporary media theory
7372              people corporates and globalisation 
Name: course_name, dtype: object

In [8]:
# Example query: courses related to a computer-security title.
title = "computer security and cryptography"
get_recommendations(title=title)

Suggested courses for computer security and cryptography
--------------------------------------------------------

4408             computer security 
23112     it professional practice 
23989     web and mobile computing 
4852                 web analytics 
Name: course_name, dtype: object