In [1]:
import sys
sys.executable

import pandas as pd
import spacy

In [2]:
DATAPATH = 'data/movies_metadata.csv'

df = pd.read_csv(DATAPATH)
df = df[~df.overview.isna()]
df.rename(columns={'overview': 'sentence'}, inplace=True)
df = df.iloc[:20000]

  df = pd.read_csv(DATAPATH)


In [3]:
##TF-IDF Modeling

# Embedded Matrices - 100 movie description sentences and embed vector size = 300
# then the matrix size is [100, 300]

# When user inputs a sentence, we embed its query sentence into a 300-dim vector
# with the same model and compute cos distance between the query vector and the 100 rows

In [4]:
# We first need to clean the sentence of un-needed words, numbers, dirty and alphanumeric
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

STOPWORDS = set(stopwords.words('english'))
MIN_WORDS = 4
MAX_WORDS = 200

PATTERN_S = re.compile("\'s")  # matches `'s` from text  
PATTERN_RN = re.compile("\\r\\n") #matches `\r` and `\n`
PATTERN_PUNC = re.compile(r"[^\w\s]") # matches all non 0-9 A-z whitespace 
PATTERN_NUM = re.compile(r'[0-9]')

def clean_text(text):
    """
    Series of cleaning numbers, lower cases and removing stopwords
    return text (modified string)
    """
    if text: # ensures we have text
        text = re.sub(PATTERN_NUM, '', text)
        text = text.lower()
        text = re.sub(PATTERN_S, ' ', text)
        text = re.sub(PATTERN_RN, ' ', text)
        text = re.sub(PATTERN_PUNC, ' ', text)
        return text

def tokenizer(sentence, min_words=MIN_WORDS, max_words=MAX_WORDS, stopwords=STOPWORDS, lemmatize=True):
    """
    Lemmatize, tokenize, crop and remove stop words
    """
    if lemmatize:
        stemmer = WordNetLemmatizer()
        tokens = [stemmer.lemmatize(w) for w in word_tokenize(sentence)]
    else:
        tokens = [w for w in word_tokenize(sentence)]
    tokens = [w for w in tokens if (len(w) > min_words and len(w) < max_words
                                   and w not in stopwords)]
    return tokens
    
def clean_sentences(df):
    """
    Remove irrelavant chars in new column clean_sentence
    Lemmatize, tokenize words into list of words in column tok_lem_sentence
    """
    print("Cleaning sentences...")
    df['clean_sentence'] = df['sentence'].apply(clean_text)
    df['tok_lem_sentence'] = df['clean_sentence'].apply(
        lambda x: tokenizer(x, min_words=MIN_WORDS, max_words=MAX_WORDS, stopwords=STOPWORDS, lemmatize=True))
    return df

df = clean_sentences(df)

Cleaning sentences...


In [5]:
print(len(df))
df[['sentence', 'clean_sentence', 'tok_lem_sentence']]

20000


Unnamed: 0,sentence,clean_sentence,tok_lem_sentence
0,"Led by Woody, Andy's toys live happily in his ...",led by woody andy toys live happily in his r...,"[woody, happily, birthday, brings, lightyear, ..."
1,When siblings Judy and Peter discover an encha...,when siblings judy and peter discover an encha...,"[sibling, peter, discover, enchanted, board, m..."
2,A family wedding reignites the ancient feud be...,a family wedding reignites the ancient feud be...,"[family, wedding, reignites, ancient, neighbor..."
3,"Cheated on, mistreated and stepped on, the wom...",cheated on mistreated and stepped on the wom...,"[cheated, mistreated, stepped, woman, holding,..."
4,Just when George Banks has recovered from his ...,just when george banks has recovered from his ...,"[george, recovered, daughter, wedding, receive..."
...,...,...,...
20131,"After a lifetime of hiding, Chely Wright becom...",after a lifetime of hiding chely wright becom...,"[lifetime, hiding, chely, wright, becomes, fir..."
20132,"In 1989, five black and Latino teenagers from ...",in five black and latino teenagers from harl...,"[black, latino, teenager, harlem, arrested, la..."
20133,Arkin escapes with his life from the vicious g...,arkin escapes with his life from the vicious g...,"[arkin, escape, vicious, collector, entrapment..."
20134,"Remake of a hit film from 1990, ""The Cherry Or...",remake of a hit film from the cherry orchar...,"[remake, cherry, orchard, nakahara, directed, ..."


In [6]:
# query sentence to compare cos distances against
query_sentence = 'a crime story with a beautiful woman'
pd.options.display.max_colwidth = 500

In [7]:
# extract best indices using cos distance
def extract_best_indices(mat, topk, mask=None):
    """
    Use sum of the cos distance over all tokens.
    m (Numpy 2D array): cos distance matrix of shape (nb_in_tokens, nb_dict_tokens)
    topk (int): number of indices to return (from high to low in order)
    """
    print("--------Extract best indices:--------")
    print("mat.shape:")
    print(mat.shape)
    print("\n")
    
    #return the sum on all tokens of cosines for each sentence
    if len(mat.shape) > 1:
        cos_sim = np.mean(mat, axis=0)
    else:
        cos_sim = mat
        
    # sort indices from high to low
    index = np.argsort(cos_sim)[::-1]
    if mask is not None:
        assert mask.shape == mat.shape
        mask = mask[index]
    else:
        mask = np.ones(len(cos_sim))
        
    # rm all 0 cosine distance
    mask = np.logical_or(cos_sim[index] != 0, mask) # elim 0 cos dist
    best_index = index[mask][:topk]
    return best_index
        

In [8]:
# Trainmasking the TF-IDF Model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# adapt stop words
token_stop = tokenizer(' '.join(STOPWORDS), lemmatize=False)

# Fit the TFIDF - fit_transform() calculates the IDF and return doc_term matrix
vectorizer = TfidfVectorizer(stop_words=token_stop, tokenizer=tokenizer)
tfidf_mat = vectorizer.fit_transform(df['sentence'].values) # -> (num_sentences, num_vocabulary)
tfidf_mat.shape

(20000, 44327)

In [9]:
# run predictions on the query sentence
def get_recommendations_tfidf(sentence, tfidf_mat, topn):
    tokens = [str(tok) for tok in tokenizer(sentence)]
    print("Tokens size and value:")
    print(len(tokens))
    print(tokens)
    print("\n")
    vec = vectorizer.transform(tokens)
    print("Vectorizer transform from input sentence:")
    print(vec.shape)
    print(vec)
    print("\n")
    
    # create similarity list between the query and the dataset
    mat = cosine_similarity(vec, tfidf_mat)
    print("Similarity matrix:")
    print(mat.shape)
    print(mat)
    print("\n")
    
    # extract the top 3 indices (ie rows giving the best cosine dist)
    best_idx = extract_best_indices(mat, topk=topn)
    return best_idx

# get the topn = 3 recommendations 
best_index = get_recommendations_tfidf(query_sentence, tfidf_mat, 3)
print("best_index:")
print(best_index)
print("\n")

# get the values from the DF for those indices
display(df[['original_title', 'genres', 'sentence']].iloc[best_index])

Tokens size and value:
4
['crime', 'story', 'beautiful', 'woman']


Vectorizer transform from input sentence:
(4, 44327)
  (0, 9461)	1.0
  (1, 37981)	1.0
  (2, 4187)	1.0
  (3, 43611)	1.0


Similarity matrix:
(4, 20000)
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.06682396 0.        ]
 [0.         0.         0.         ... 0.08708068 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


--------Extract best indices:--------
mat.shape:
(4, 20000)


best_index:
[14349  8977 10463]




Unnamed: 0,original_title,genres,sentence
14402,Michael Jackson: Life of a Superstar,"[{'id': 99, 'name': 'Documentary'}, {'id': 10770, 'name': 'TV Movie'}]",The Story of the King of Pop
9003,Innocent Blood,"[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'name': 'Horror'}, {'id': 53, 'name': 'Thriller'}, {'id': 80, 'name': 'Crime'}]",A beautiful vampire turns a crime lord into a creature of the night.
10493,Requiem pour un vampire,"[{'id': 27, 'name': 'Horror'}]",A vampire lures beautiful young women to his castle in Europe.


In [10]:
'''Learning SPACY Modeling'''

'Learning SPACY Modeling'

In [11]:
import spacy

# load the pretrained en_core_web_lg model
nlp = spacy.load("en_core_web_lg")

# apply the model to the input sentence
df["spacy_sentence"] = df['sentence'].apply(lambda x: nlp(x))

In [13]:
print(df["spacy_sentence"])

0                                                                                                                                       (Led, by, Woody, ,, Andy, 's, toys, live, happily, in, his, room, until, Andy, 's, birthday, brings, Buzz, Lightyear, onto, the, scene, ., Afraid, of, losing, his, place, in, Andy, 's, heart, ,, Woody, plots, against, Buzz, ., But, when, circumstances, separate, Buzz, and, Woody, from, their, owner, ,, the, duo, eventually, learns, to, put, aside, their, differences, .)
1                              (When, siblings, Judy, and, Peter, discover, an, enchanted, board, game, that, opens, the, door, to, a, magical, world, ,, they, unwittingly, invite, Alan, --, an, adult, who, 's, been, trapped, inside, the, game, for, 26, years, --, into, their, living, room, ., Alan, 's, only, hope, for, freedom, is, to, finish, the, game, ,, which, proves, risky, as, all, three, find, themselves, running, from, giant, rhinoceroses, ,, evil, monkeys, and, other, terrifyi

In [12]:
# Question: How does spacy compare against the TF-IDF model??
# Answer 1: Takes like 10 times as long!
# check python processes
import psutil

def check_process_status(process_name):
    """
    Return status of process based on process name.
    """
    process_status = [ proc for proc in psutil.process_iter() if proc.name() == process_name ]
    if process_status:
        for current_process in process_status:
            print("Process id is %s, name is %s, staus is %s"%(current_process.pid, current_process.name(), current_process.status()))
    else:
        print("Process name not valid", process_name)

In [14]:
# retrieve embedded vectors as a matrix
embed_mat = df['spacy_sentence'].values
print(embed_mat.shape)
print(embed_mat[10])

(20000,)
Widowed U.S. president Andrew Shepherd, one of the world's most powerful men, can have anything he wants -- and what he covets most is Sydney Ellen Wade, a Washington lobbyist. But Shepherd's attempts at courting her spark wild rumors and decimate his approval ratings.


In [16]:
# SPACY prediction on given input sentence
def predict_spacy(model, query_sentence, embed_mat, topk):
    """
    Predict the topk sentences using the SPACY model
    """
    # this creates a Spacy.DOC object
    query_embed = model(query_sentence)
    
    # this calculates the similarity of the input query_sentence
    # with very row of the embedded mat
    # OUTPUT - this creates a list of scores for the lines
    mat = np.array([query_embed.similarity(line) for line in embed_mat])
    
    # keep if vector has a norm
    mat_mask = np.array([True if line.vector_norm else False for line in embed_mat])
    best_index = extract_best_indices(mat, topk=topk, mask=mat_mask)
    return best_index

# run predictions on the model
print("query sentence = " + query_sentence)
best_index = predict_spacy(nlp, query_sentence, embed_mat, 3)
display(df[['original_title', 'genres', 'sentence']].iloc[best_index])

query sentence = a crime story with a beautiful woman
--------Extract best indices:--------
mat.shape:
(20000,)




  mat = np.array([query_embed.similarity(line) for line in embed_mat])


Unnamed: 0,original_title,genres,sentence
15538,In a Day,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]",A young woman spends a curiously unpredictable day with a stranger.
6120,Les dames du Bois de Boulogne,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]",A society lady engineers a marriage between her lover and a cabaret dancer who is essentially a prostitute.
8474,醜聞,"[{'id': 18, 'name': 'Drama'}]",Akira Kurosawa directed this drama about a paparazzi photo that a tabloid magazine spins into a scandalous story and soon sparks a court case.


In [26]:
"""
Word2Vec Module from the GenSim package
"""

'\nWord2Vec Module from the GenSim package\n'

In [27]:
from gensim.models.word2vec import Word2Vec

# create model
word2vec_model = Word2Vec(min_count=0, workers=8, vector_size=300)

# prepare the vocab
word2vec_model.build_vocab(df.tok_lem_sentence.values)

# train the model
word2vec_model.train(df.tok_lem_sentence.values, total_examples=word2vec_model.corpus_count, epochs=30)

(12936325, 13110000)

In [32]:
# word2vec prediction
def is_word_in_model(word, model):
    """
    Check on individual words '''word''' that exists in '''model'''
    """
    assert type(model).__name__ == 'KeyedVectors'
    is_in_vocab = word in model.key_to_index.keys()
    return is_in_vocab

def predict_w2v(query_sentence, dataset, model, topk=3):
    """
    Predict word to vector using dataset, sentence and model
    """
    query_sentence = query_sentence.split()
    in_vocab_list, best_index = [], [0]*topk
    for w in query_sentence:
        # remove unseen words from query sentence
        if is_word_in_model(w, model.wv):
            in_vocab_list.append(w)
            
    # retrieve similarity between two words as a distance
    if len(in_vocab_list) > 0:
        sim_mat = np.zeros(len(dataset)) 
        for i, data_sentence in enumerate(dataset):
            if data_sentence:
                sim_sentence = model.wv.n_similarity(
                    in_vocab_list, data_sentence)
            else:
                sim_sentence = 0
            sim_mat[i] = np.array(sim_sentence)
            
        # take the 5 highest norms
        best_index = np.argsort(sim_mat)[::-1][:topk]
        
    return best_index

# run the prediction on query sentence
best_index = predict_w2v(query_sentence, df['tok_lem_sentence'].values, word2vec_model)
display(df[['original_title', 'genres', 'sentence']].iloc[best_index])

Unnamed: 0,original_title,genres,sentence
11048,Marked Woman,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}, {'id': 53, 'name': 'Thriller'}, {'id': 80, 'name': 'Crime'}]","Set in the underworld of Manhattan, Marked Woman tells the story of a woman who dares to stand up to one of the city's most powerful gangsters. The women of the story are ""hostesses"". What is implied, but not stated clearly is that they are prostitutes, who work in a gambling den in the city."
9232,苏州河,"[{'id': 18, 'name': 'Drama'}, {'id': 10769, 'name': 'Foreign'}, {'id': 10749, 'name': 'Romance'}]",A tragic love story set in contemporary Shanghai. The film stars Zhou Xun in a dual role as two different women and Jia Hongsheng as a man obsessed with finding a woman from his past.
6916,L'Année dernière à Marienbad,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]","Takes place in a chateau, an ambiguous story of a man and a woman who may or may not have met last year at Marienbad."


In [33]:
"""
Sentence Transformers using phrases
"""
from sentence_transformers import SentenceTransformer, util

# load pretrained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

Downloading (…)001fa/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)3bbb8001fa/README.md:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading (…)bb8001fa/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)001fa/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)3bbb8001fa/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)b8001fa/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [34]:
corpus_embeddings = model.encode(df.sentence.values, convert_to_tensor=True)
query_embedding = model.encode(query_sentence, convert_to_tensor=True)

In [36]:
# run predictions using the pretrained model and encoding
import torch

# use cosine similarity and torch to find best scores
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
top_results = torch.topk(cos_scores, k=3)

print("\n\n======================\n\n")
print("Query:", query_sentence)
print("\nTop 5 most similar sentences in corpus:")

for score, idx in zip(top_results[0], top_results[1]):
    score = score.cpu().data.numpy()
    idx = idx.cpu().data.numpy()
    display(df[['original_title', 'genres', 'sentence']].iloc[idx])





Query: a crime story with a beautiful woman

Top 5 most similar sentences in corpus:


original_title                                                                                                                         Miss Bala
genres                                                                               [{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}]
sentence          The story of a young woman clinging on to her dream to become a beauty contest queen in a Mexico dominated by organized crime.
Name: 18224, dtype: object

original_title                                                                                                                                                                                                                                                                                                                                 In the Cut
genres                                                                                                                                                                                                                                                                                  [{'id': 9648, 'name': 'Mystery'}, {'id': 53, 'name': 'Thriller'}]
sentence          Following the gruesome murder of a young woman in her neighborhood, a self-determined woman living in New York City--as if to test the limits of her own safety--propels herself into an impossibly risky sexual liaison. Soon she grows increasingly wary about the motives of every man with who

original_title                                                                                    The World of Suzie Wong
genres                                                    [{'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]
sentence          Story of the love between a struggling American artist and a beautiful Chinese prostitute in Hong Kong.
Name: 8018, dtype: object

In [None]:
"""
BERT Text Classification
"""
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel, pipeline
import torch
import numpy as np
from tqdm import tqdm

BERT_BATCH_SIZE = 4
MODEL_NAME = 'sentence-transformers/paraphrase-MiniLM-L6-v2'

class BertModel:
    def __init__(self, model_name, device=-1, small_memory=True, batch_size=BERT_BATCH_SIZE):
        self.model_name = model_name
        self._set_device(device)
        self.small_device = 'cpu' if small_memory else self.device
        self.batch_size = batch_size
        self.load_pretrained_model()
        
    def _set_device(self, device):
        if device == -1 or device == 'cpu':
            self.device = 'cpu'
        elif device == 'cuda' or device == 'gpu':
            self.device = 'cuda'
        elif isinstance(device, int) or isinstance(device, float):
            self.device = 'cuda'
        else:  # default
            self.device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
            
    def load_pretrained_model(self):
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
        device = -1 if self.device == 'cpu' else 0
        self.pipeline = pipeline('feature-extraction',
                                model=self.model, tokenizer=self.tokenizer, device=device)
        
    def embed(self, data):
        """
        Create embedded matrix from original sentences
        """
        nb_batches = 1 if (len(data) < self.batch_size) else len(data) // self.batch_size
        batches = np.array_split(data, nb_batches)
        mean_pooled = []
        for batch in tqdm(batches, total=len(batches), desc='Training...'):
            mean_pooled.append(self.transform(batch))
        mean_pooled_tensor = torch.tensor(len(data), dtype=float).to(self.small_device)
        mean_pooled = torch.cat(mean_pooled, out=mean_pooled_tensor)
        self.embed_mat = mean_pooled
        
    @staticmethod
    def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unqueeze(-1).expand(
            token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / 
            torch.clamp(input_mask_expanded.sum(1), min=1e-9)

