### Huỳnh Nhật Hào - 18520714

In [None]:

from typing import Union, Optional, Dict, List, Text, Type
import string
import collections
import logging 
import time

import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics import pairwise
from sklearn import decomposition
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

# Fetch the NLTK resources used by the preprocessing code: the stop-word
# list and the WordNet data required by the lemmatizer.
for _resource in ('stopwords', 'wordnet'):
    nltk.download(_resource)

# Stored as a set so that membership tests during token filtering are cheap.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Root logger: emit INFO and above through a stream handler, prefixing every
# message with its timestamp.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(r'%(asctime)s - %(message)s'))
logger.addHandler(handler)


In [None]:
class ContentBasedRS:
    """Content-based recommender built on tf-idf features of movie overviews.

    Overviews are normalised, vectorised with a fixed-vocabulary tf-idf
    model, reduced with truncated SVD, and compared pairwise with a linear
    kernel (dot product). Recommendations are the movies whose reduced
    vectors have the largest dot product with the query movie's vector.
    """

    def __init__(self, data: Dict[str, Dict[str, str]],
                 vocab: Dict[str, int], logger: logging.Logger,
                 num_features: int = 300) -> None:
        """
        Args:
            data: mapping from movie id to a record holding the movie title
                and its overview, for example
                ``{'1': {'name': 'some name', 'overview': 'some overview'}}``.
                Records are copied during preprocessing, so the caller's
                dictionaries are not modified.
            vocab: mapping from term to its column index in the tf-idf
                feature matrix.
            logger: logger used for progress and warning messages.
            num_features: number of components kept by the truncated SVD.
        """

        self.data = data
        self.vocab = vocab
        self.logger = logger
        # Re-keys self.data to 0..len(data)-1 and normalises every overview.
        self.process_data()
        # Title -> row index. When several movies share a title the last one
        # seen wins.
        self.movies_name = {value['name']: index
                            for index, value in self.data.items()}

        # tf-idf features of all overviews
        self.content_features = None

        # dimension reduced features of tf-idf features
        self.reduced_features = None

        # pairwise dot products of the reduced features; filled in by
        # learn_features()
        self.cosine_matrix = None

        self.vectorizer = text.TfidfVectorizer(vocabulary=self.vocab)
        self.num_features = num_features
        self.svd = decomposition.TruncatedSVD(n_components=num_features,
                                              random_state=42)

    def learn_features(self) -> None:
        """Fit the tf-idf model and the truncated SVD on all overviews.

        Populates ``content_features`` (tf-idf matrix), ``reduced_features``
        (SVD-reduced matrix) and ``cosine_matrix`` (pairwise linear kernel of
        the reduced features).
        """

        self.logger.info('Learning tf-idf features...')
        all_overviews = [record['overview'] for record in self.data.values()]

        self.content_features = self.vectorizer.fit_transform(all_overviews)

        self.logger.info('Reducing features dimension to %d', self.num_features)

        self.reduced_features = self.svd.fit_transform(self.content_features)

        self.logger.info('Creating linear kernel matrix...')
        # Despite the attribute name this is a plain dot product; the reduced
        # vectors are not re-normalised here.
        self.cosine_matrix = pairwise.linear_kernel(self.reduced_features)

        # Sanity check: one row and one column per movie.
        assert self.cosine_matrix.shape == (len(self.data), len(self.data))
        self.logger.info('Done.')

    def process_data(self) -> None:
        """Re-index movies from 0 to len(data) - 1 and preprocess overviews.

        The new integer keys follow the iteration order of the input mapping
        and double as row indices into the feature matrices. Each record is
        shallow-copied before its overview is replaced, so the dictionaries
        passed in by the caller keep their original text.
        """

        self.logger.info('Processing data...')
        new_data = collections.OrderedDict()
        for index, value in enumerate(self.data.values()):
            record = dict(value)
            record['overview'] = self.preprocess_text(value['overview'])
            new_data[index] = record
        self.logger.info('Done processing data.')
        self.data = new_data

    def preprocess_text(self, texts: str) -> str:
        """Normalise one overview and mask out-of-vocabulary words.

        Steps: lowercase, strip ASCII punctuation, split on whitespace, drop
        English stop words, lemmatize, then replace every token that is not a
        key of ``self.vocab`` with the literal ``'unknown'``.

        Args:
            texts: raw overview text.

        Returns:
            The processed tokens joined by single spaces.
        """
        texts = texts.lower()
        # remove punctuation
        texts = texts.translate(str.maketrans('', '', string.punctuation))

        tokens = texts.split()
        # remove stopwords
        tokens = [x for x in tokens if x not in stop_words]
        # lemmatize words
        lemma = nltk.wordnet.WordNetLemmatizer()
        tokens = [lemma.lemmatize(x) for x in tokens]

        # Replace unknown words with 'unknown'. Membership is tested on the
        # dict itself (hash lookup) rather than on a freshly built key list.
        # NOTE(review): make_vocab in this file registers '_unknown_' as the
        # placeholder term, which does not match the 'unknown' literal used
        # here - confirm which one is intended.
        tokens = [x if x in self.vocab else 'unknown' for x in tokens]
        return ' '.join(tokens)

    def recommend(self, watched: List[str], num_recommend: int) -> Dict[str, float]:
        """Recommend movies similar to what the user has watched.

        Only the first title in ``watched`` that is known to the model is
        used as the query; unknown titles are reported with a warning.

        Args:
            watched: titles the user has watched.
            num_recommend: number of movies to recommend in addition to the
                query movie.

        Returns:
            A dict mapping title to similarity score, ordered from most to
            least similar. It holds up to ``num_recommend + 1`` entries
            because the top-scoring slice normally contains the query movie
            itself; titles shared by several movies collapse into one entry.
            An empty dict is returned when none of the watched titles is
            known.

        Raises:
            ValueError: if ``num_recommend`` is negative.
            RuntimeError: if a known title is given before
                ``learn_features()`` has been called.
        """
        if num_recommend < 0:
            raise ValueError('num_recommend must be non-negative')

        names = []
        for movie in watched:
            if movie in self.movies_name:
                names.append(movie)
            else:
                # Use the instance logger; a module-level `logger` is not
                # guaranteed to exist where this class is used.
                self.logger.warning('movie %s not founded', movie)

        if not names:
            return {}

        if self.cosine_matrix is None:
            raise RuntimeError(
                'learn_features() must be called before recommend()')

        movie_id = self.movies_name[names[0]]
        scores = self.cosine_matrix[movie_id, :]

        # argsort is ascending, so the best matches sit at the end.
        highest_cosine = np.argsort(scores)[-(num_recommend + 1):]

        return {self.data[x]['name']: scores[x]
                for x in reversed(highest_cosine)}
    

In [None]:
def make_vocab(texts, num_vocab):
    """Build a {term: index} vocabulary from one large text.

    The text is lowercased, stripped of ASCII punctuation, split on
    whitespace, filtered against the English stop words and lemmatized.
    The ``num_vocab`` most frequent resulting terms receive indices starting
    at 1 in order of decreasing frequency; index 0 is reserved for the
    placeholder term ``'_unknown_'``.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    tokens = texts.lower().translate(punctuation_table).split()

    # Stop words are dropped before lemmatization.
    lemmatizer = nltk.wordnet.WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token)
              for token in tokens if token not in stop_words]

    frequencies = collections.Counter(lemmas)

    vocab = {}
    for position, (word, _count) in enumerate(
            frequencies.most_common(num_vocab), start=1):
        vocab[word] = position
    vocab['_unknown_'] = 0

    return vocab

In [None]:
import zipfile

# Unpack the zipped movie metadata into the current working directory.
with zipfile.ZipFile('/content/movies_metadata.csv.zip', mode='r') as archive:
    archive.extractall()

In [None]:
# Load the metadata, blank out missing values and keep a single row per
# movie id.
df = pd.read_csv('movies_metadata.csv').fillna('').drop_duplicates('id')

# Mapping {id: {'name': title, 'overview': overview text}}.
overviews = {}
for _, row in df.iterrows():
    overviews[row.id] = {'name': row['original_title'],
                         'overview': row['overview']}

# Concatenate every overview into one text, used to build the vocabulary.
texts = ' '.join(entry['overview'] for entry in overviews.values())

# The first 1000 movies, as a smaller data set.
test_data = dict(list(overviews.items())[:1000])


  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Build the vocabulary from the 10000 most frequent terms across all
# overviews (make_vocab also adds the '_unknown_' placeholder at index 0).
vocab = make_vocab(texts, 10000)

In [None]:
# Build the recommender over every movie, reducing the tf-idf features to
# 5000 SVD components, then fit the features and the similarity matrix.
contentrs = ContentBasedRS(overviews, vocab, logger, 5000 )
contentrs.learn_features()

2021-11-09 07:11:46,924 - Processing data...
2021-11-09 07:14:15,589 - Done processing data.
2021-11-09 07:14:15,603 - Learning tf-idf features...
2021-11-09 07:14:16,604 - Reducing features dimension to 5000
2021-11-09 07:18:53,033 - Creating linear kernel matrix...
2021-11-09 07:19:38,388 - Done.


In [None]:
# The first entry of the result is the query movie itself, because a movie is
# normally its own best match; recommend() therefore returns
# num_recommend + 1 entries. The scores are dot products (linear kernel) of
# the SVD-reduced vectors, not cosine similarities: the reduced vectors are
# not re-normalised, which is why the self-similarity below is not 1.
# NOTE(review): because the vectors are not unit length, the ranking is not
# guaranteed to match a true cosine ranking.
contentrs.recommend(['Father of the Bride Part II'], 10)

{'Father of the Bride Part II': 0.8882881402502341,
 'Lambchops': 0.3178237878002115,
 'Kuffs': 0.3033961856209353,
 'I Start Counting': 0.303208817367963,
 'George of the Jungle 2': 0.3005755427012447,
 'Babbitt': 0.29545278528812396,
 'Father of the Bride': 0.29054327272958175,
 'North to Alaska': 0.28457429391862404,
 'La magie Méliès': 0.2714272744391081,
 'Wendigo': 0.2696416684060606,
 "You're Killing Me": 0.26963828237132154}

In [None]:
# Ask for 10 recommendations for a user who watched "Bye Bye Love"; the
# result also lists the query movie itself.
contentrs.recommend(["Bye Bye Love"], 10)

{'Bye Bye Love': 0.9240431311580348,
 'The Good Mother': 0.2056164594372752,
 'Girl Most Likely': 0.2024201057449036,
 'Hände weg von Mississippi': 0.19555476169292565,
 "Murphy's Romance": 0.19232680278811984,
 'Knoflíkáři': 0.18225044838084448,
 'Сатисфакция': 0.17981392734365112,
 'Патриотическая комедия': 0.17981392734365112,
 'На семи ветрах': 0.17981392734365112,
 'Бабло': 0.17981392734365112,
 'Ι-4: Λούφα Και Απαλλαγή': 0.17981392734365112}

In [None]:
# The recommender does surface other Batman movies for a user who has
# watched "Batman Forever".
contentrs.recommend(["Batman Forever"], 10)

{'Batman Forever': 0.8956475673460599,
 'Batman: Bad Blood': 0.4100199709230794,
 'The Dark Knight Rises': 0.373313457873253,
 'Batman: The Dark Knight Returns, Part 1': 0.3549495577711644,
 'Batman: Mask of the Phantasm': 0.3150455548164655,
 'Batman Beyond: The Movie': 0.3086748556815871,
 'Batman Returns': 0.29511565192715483,
 'Бабло': 0.2833546465644097,
 'Tupla-Uuno': 0.2833546465644097,
 'Täällä Pohjantähden alla': 0.2833546465644097,
 'Бой с Тенью': 0.2833546465644097}