## Baseline Recommender: Cosine Similarity

## import

In [1]:
import pickle
import numpy as np
import pandas as pd
from datetime import date
import json
from tqdm.notebook import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
import gensim
from gensim.corpora.dictionary import Dictionary

import spacy
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# NLTK's English stopword list; applied further down as a second stopword
# pass over the tokens that already went through spaCy's `is_stop` filter.
sw = stopwords.words("english")

In [9]:
# Load the pickled table of shows (presumably a pandas DataFrame -- it is
# indexed by the 'description' and 'transcript' columns below).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
with open('../data/metascript_df_ws.pickle', 'rb') as file:
    metascripts = pickle.load(file)

## prepare the data

In [11]:
# Pull the two columns out as plain Python lists, kept in row order so that
# descriptions[i] always belongs to scripts[i].
descriptions = [*metascripts['description'].values]
scripts = [*metascripts['transcript'].values]

# description -> transcript lookup (a repeated description keeps its last
# transcript, as with any dict built from pairs).
scripts_dict = {
    description: transcript
    for description, transcript in zip(descriptions, scripts)
}

## calculate cosine similarities

In [15]:
def chunker(iterable, chunksize):
    """Yield consecutive slices of ``iterable``, each at most ``chunksize`` long.

    Args:
        iterable: A sized, sliceable sequence (list, tuple, str, ...). Each
            chunk is produced by slicing, so it has the same type as the input.
        chunksize: Maximum number of items per chunk; must be >= 1.

    Yields:
        Slices ``iterable[i:i + chunksize]`` in order. The final slice may be
        shorter than ``chunksize``. Nothing is yielded for an empty input.

    Raises:
        ValueError: If ``chunksize`` is smaller than 1.
    """
    if chunksize < 1:
        # A negative step would silently produce no chunks at all, and a zero
        # step makes range() fail with a less helpful message.
        raise ValueError(f"chunksize must be >= 1, got {chunksize!r}")
    # len() is enough here: the slicing below already requires a sequence, so
    # copying the input into a list just to measure it is wasted work.
    for start in range(0, len(iterable), chunksize):
        yield iterable[start:start + chunksize]

def token_filter(token):
    """Return True when ``token`` should be kept.

    A token is kept only if none of its ``is_punct``, ``is_space`` and
    ``is_stop`` flags is set.
    """
    # All three flags are read up front (list, not generator) before testing.
    rejected = any([token.is_punct, token.is_space, token.is_stop])
    return not rejected

# spaCy pipeline used to tokenize and lemmatize the transcripts.
nlp = spacy.load('en_core_web_md')

# Transcripts are handed to spaCy a few at a time; each chunk is one tick of
# the progress bar.
chunk_size = 2
# Integer ceiling division: tqdm expects an int total, whereas np.ceil returns
# a float and the bar then reads e.g. "0/155.0".
n_chunks = -(-len(scripts) // chunk_size)

# One list of lower-cased lemmas per transcript, in the same order as `scripts`,
# with punctuation, whitespace and spaCy stopwords removed by token_filter.
filtered_tokens = []
for scripts_subset in tqdm(chunker(scripts, chunk_size), total=n_chunks):
    for doc in nlp.pipe(scripts_subset):
        filtered_tokens.append(
            [token.lemma_.lower() for token in doc if token_filter(token)]
        )

  0%|          | 0/155.0 [00:00<?, ?it/s]

In [None]:
# Second stopword pass, this time against the NLTK list. The list is turned
# into a set once so each membership test is O(1) instead of a scan over
# every stopword for every token.
sw_lookup = frozenset(sw)
tokens_no_sw = [
    [token for token in tokenized_script if token not in sw_lookup]
    for tokenized_script in filtered_tokens
]

In [None]:
# Counter is not imported anywhere above, so bring it into scope here;
# without this the cell fails with a NameError.
from collections import Counter

# Bag-of-words per show: description -> {token: count}. Descriptions are dict
# keys, so a repeated description keeps only its last transcript's counts.
scripts_counters = {
    description: Counter(tokenized_script)
    for description, tokenized_script in zip(descriptions, tokens_no_sw)
}

# Rows are shows, columns are tokens; a token a show never uses comes out as
# NaN from from_dict and is filled with a count of 0.
scripts_df = pd.DataFrame.from_dict(scripts_counters, orient='index').fillna(0)

In [None]:
# Quick visual check of the first rows of the show-by-token count matrix.
scripts_df.head()

In [None]:
# Pairwise cosine similarity between every pair of rows (shows) of scripts_df.
cosims = cosine_similarity(scripts_df)

# Wide form first: a square table labelled by show on both axes.
_show_labels = scripts_df.index
_cosims_wide = pd.DataFrame(data=cosims, index=_show_labels, columns=_show_labels)

# Then unpivot to one row per (show, other_show) pair. Keeping the index
# through melt() is what carries the row label along; reset_index() turns it
# into a column named 'index', which is renamed to 'show'.
_cosims_long = _cosims_wide.melt(
    var_name='other_show',
    value_name='cosine_similarity',
    ignore_index=False,
)
cosims_df = _cosims_long.reset_index().rename(columns={'index': 'show'})

In [None]:
# seaborn is used below but never imported above, so import it here;
# without this the cell fails with a NameError on `sns`.
import seaborn as sns
from jupyterthemes import jtplot

# Apply the jupyterthemes plotting style to subsequent figures.
jtplot.style()

# Distribution of all pairwise similarities. The numeric column is named
# explicitly rather than passing the whole frame (which also holds the two
# string label columns), and no sort is needed: a histogram does not depend
# on row order. The trailing semicolon suppresses the Axes repr in the output.
sns.histplot(data=cosims_df, x='cosine_similarity');

In [None]:
# The 20 highest-scoring pairs of distinct shows.
# Dropping duplicate similarity values removes the mirrored (A, B) / (B, A)
# rows; note it also collapses unrelated pairs that share the exact same score.
(
    cosims_df
    .loc[lambda pairs: pairs['show'] != pairs['other_show']]
    .drop_duplicates(subset=['cosine_similarity'])
    .sort_values('cosine_similarity', ascending=False)
    .head(20)
)

In [None]:
# Similarities between one chosen show and every other show, sorted ascending
# (so the most similar shows are at the bottom of the output).
show = 'Jim Gaffigan: Comedy Monster (2021) | Transcript'

# Both masks are built from the full frame and combined with `&` before
# indexing once. Chaining two boolean selections applied the second mask to an
# already-filtered frame, which makes pandas warn that the key "will be
# reindexed to match DataFrame index".
is_selected_show = cosims_df['show'] == show
is_different_show = cosims_df['show'] != cosims_df['other_show']
cosims_df[is_selected_show & is_different_show].sort_values('cosine_similarity')

## pickle cosims_df for later use

In [None]:
# Persist the long-form similarity table so later work can load it directly
# instead of recomputing it.
with open('../data/cosims_df.pickle', 'wb') as pickle_out:
    pickle.dump(cosims_df, pickle_out)