# Content-based

later

In [1]:
# Core
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as scsp
from scipy.sparse import lil_matrix, csr_matrix, coo_matrix

# SKLearn
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# NLTK
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Display
import ipywidgets as w
from ipywidgets import widgets, HBox, VBox
from IPython.core.display import HTML
from movie_display import movie_display

app = w.Output()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeroen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## IMDb JSON

Import movies into a data frame and analyze the availables attributes.

9125 Movies with 18 attributes from IMDb.

Attributes:

- Actors
- ...
- Plot
- Title
- Writer
- ...
- imdbVotes

In [2]:
# Load movies into a dataframe
df = pd.read_json('./dataset/imdbdata.json', orient='columns')
df.head(5)

Unnamed: 0,Actors,Awards,Country,Director,Genre,Language,Plot,Poster,Production,Rated,Released,Runtime,Title,Writer,Year,imdbId,imdbRating,imdbVotes
0,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney",Nominated for 3 Oscars. Another 23 wins & 18 n...,USA,John Lasseter,"Animation, Adventure, Comedy",English,A cowboy doll is profoundly threatened and jea...,https://images-na.ssl-images-amazon.com/images...,Buena Vista,G,22 Nov 1995,81 min,Toy Story,"John Lasseter (original story by), Pete Docter...",1995,114709,8.3,666855
1,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",4 wins & 9 nominations.,USA,Joe Johnston,"Action, Adventure, Family","English, French",When two kids find and play a magical board ga...,https://images-na.ssl-images-amazon.com/images...,Sony Pictures Home Entertainment,PG,15 Dec 1995,104 min,Jumanji,"Jonathan Hensleigh (screenplay), Greg Taylor (...",1995,113497,6.9,223000
2,"Walter Matthau, Jack Lemmon, Sophia Loren, Ann...",2 wins & 2 nominations.,USA,Howard Deutch,"Comedy, Romance",English,John and Max resolve to save their beloved bai...,https://images-na.ssl-images-amazon.com/images...,Warner Home Video,PG-13,22 Dec 1995,101 min,Grumpier Old Men,"Mark Steven Johnson (characters), Mark Steven ...",1995,113228,6.6,20100
3,"Whitney Houston, Angela Bassett, Loretta Devin...",8 wins & 8 nominations.,USA,Forest Whitaker,"Comedy, Drama, Romance",English,"Based on Terry McMillan's novel, this film fol...",https://images-na.ssl-images-amazon.com/images...,Twentieth Century Fox Home Entertainment,R,22 Dec 1995,124 min,Waiting to Exhale,"Terry McMillan (novel), Terry McMillan (screen...",1995,114885,5.7,7769
4,"Steve Martin, Diane Keaton, Martin Short, Kimb...",Nominated for 1 Golden Globe. Another 1 win & ...,USA,Charles Shyer,"Comedy, Family, Romance",English,George Banks must deal not only with the pregn...,https://images-na.ssl-images-amazon.com/images...,Disney,PG,08 Dec 1995,106 min,Father of the Bride Part II,"Albert Hackett (screenplay), Frances Goodrich ...",1995,113041,5.9,27815


## Feature Extraction

Pipeline: https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn

Excluding stopwords and stemming words according to the NLTK English language dictionaries.

### Stopwords

Frequent words with no meaning. Like: 'as', 'you', 'the'.

### Stemming

Useful but also transforms correct words into incorrect ones. Like `james` to `jame` and `territory` to `terri`.

### Features

- Plot
- Title
- Genre
- Actors
- Rated (movies with a similiar MPAA rating)
- Year (movies in a similiar time frame)

In [3]:
features = df.copy()

# Get the stopwords for the English language
useless_words = stopwords.words('english')
# Create a stemmer for the English language
stemmer = SnowballStemmer("english", ignore_stopwords=True)

print(useless_words[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [4]:
mpaa_ratings = ['G', 'PG', 'PG-13', 'R', 'NC-17', 'NR']
mpaa_synonyms = [
    (['N/A', 'NOT RATED', 'UNRATED'], 'NR'),     # not rated
    (['APPROVED', 'PASSED', 'GP', 'TV-G'], 'G'), # general public
    (['TV-PG'], 'PG'),                           # parental guidance
    (['TV-14'], 'PG-13'),                        # parental guidance under 13
    (['M/PG', 'X'], 'R'),                        # restricted
    (['M', 'TV-MA'], 'NC-17')                    # mature
]

# all available year ranges
years_range = ['190','191','192','193','194','195','196','197','198','199','200','201']

plots = []
genres = []
titles = []
actors = []
rated = []
years = []
for row in df.itertuples():
    # Plot
    plots.append(' '.join([stemmer.stem(w) for w in row.Plot.split()]))
    # Title
    titles.append(' '.join([w for w in row.Title.split() if w.lower() not in useless_words]))
    # Genre
    genres.append(' '.join(row.Genre.split(', ')))
    # Actor
    actors.append(' '.join([actor.replace(' ', '') for actor in row.Actors.split(', ')]))
    
    # Rated
    mpaa = row.Rated
    for synonyms, replacement in mpaa_synonyms:
        if mpaa in synonyms:
            mpaa = replacement
            break
    rated.append(mpaa)
    
    # Year
    year = row.Year[:3]
    year_i = years_range.index(year)
    years.append(' '.join(years_range[max(0, year_i - 1):min(year_i + 2, len(years_range))]))

features['PlotStripped'] = plots
features['TitleStripped'] = titles
features['GenreStripped'] = genres
features['ActorsStripped'] = actors
features['RatedStripped'] = rated
features['YearsRange'] = years

In [5]:
years_range[max(0, 3 - 1):min(3 + 2, len(years_range))]

['192', '193', '194']

#### Count Vectorizer

Finding the frequencies of words in the plots using a bag of words approach and transforming the term count to term frequency.

We do not use this approach for the plot anymore because the `TfidfVectorizer` class does both steps in one. The data contains 17354 unique stemmed words in the plots (excluding stopwords).

In [6]:
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit(features.PlotStripped).transform(features.PlotStripped)

tfidf_transformer = TfidfTransformer()
tfidf_transformed = tfidf_transformer.fit_transform(bag_of_words)
print(tfidf_transformed.shape)

(9125, 17364)


One-hot encoding the genres using a count vectorizer. The data contains 26 unique genres.

In [7]:
vectorizer = CountVectorizer()
tf_genres = vectorizer.fit(features.GenreStripped).transform(features.GenreStripped)
print(tf_genres.shape)

(9125, 26)


In [8]:
vectorizer = CountVectorizer()
tf_title = vectorizer.fit(features.TitleStripped).transform(features.TitleStripped)
print(tf_title.shape)

(9125, 7219)


In [9]:
vectorizer = CountVectorizer()
tf_actors = vectorizer.fit(features.ActorsStripped).transform(features.ActorsStripped)
print(tf_actors.shape)

(9125, 16476)


In [10]:
vectorizer = CountVectorizer()
tf_rated = vectorizer.fit(features.RatedStripped).transform(features.RatedStripped)
print(tf_rated.shape)

(9125, 5)


In [11]:
vectorizer = CountVectorizer()
tf_years = vectorizer.fit(features.YearsRange).transform(features.YearsRange)
print(tf_years.shape)

(9125, 12)


#### Tf-Idf
Finding the frequencies of words in the plots directly using the Tf-Idf vectorizer.

In [12]:
vectorizer = TfidfVectorizer()
tfidf_plots = vectorizer.fit_transform(plots)

print("  (movie_index, word_index)\tfrequency\n")
print(tfidf_plots)

  (movie_index, word_index)	frequency

  (0, 3704)	0.27616316280141384
  (0, 4709)	0.2907707139036451
  (0, 8154)	0.08861243629796914
  (0, 12263)	0.30086980273614133
  (0, 15560)	0.21421167531111496
  (0, 847)	0.06306452640924701
  (0, 8277)	0.27231068792004265
  (0, 16931)	0.11951833651503
  (0, 10736)	0.13593026160796712
  (0, 14522)	0.3628212902264401
  (0, 5902)	0.26408966050379634
  (0, 15082)	0.3472138745077899
  (0, 7326)	0.13930848872336773
  (0, 1140)	0.11927678631495263
  (0, 15705)	0.23254723029401642
  (0, 15769)	0.28277813002689384
  (0, 7780)	0.07103009016059907
  (0, 2154)	0.16906094986235187
  (0, 13305)	0.246557212498607
  (1, 847)	0.12286954448202522
  (1, 16931)	0.11642966657320275
  (1, 7780)	0.06919448475609502
  (1, 16060)	0.13226283349211293
  (1, 8606)	0.21655311216101447
  (1, 5928)	0.12452827184532418
  :	:
  (9124, 7780)	0.06111515953479166
  (9124, 11021)	0.16030088801878004
  (9124, 15479)	0.10289466169388105
  (9124, 15483)	0.0913347775713955
  (9124, 156

## Movie Recommendations v1

In [13]:
class Recommender():

    def __init__(self, n_recommendations=5):
        self.nbrs = NearestNeighbors(n_neighbors=n_recommendations + 1, algorithm='auto', metric='cosine')
        self.n_recommendations = n_recommendations

    def fit(self, features):
        self.nbrs.fit(features)
        self.distances, self.graph = self.nbrs.kneighbors(features)

    def recommend(self, movie_index):
        # We don't want to include the same movie so we exclude it
        return self.graph[movie_index, 1:]

In [14]:
# Concatenate features
concat = scsp.hstack([tf_title, tf_genres, tf_actors, tfidf_plots])
recommender = Recommender(n_recommendations=9)
recommender.fit(concat)

In [15]:
movies = [14]

recommendation = recommender.recommend(movies)[0]

display(HTML('<h1>Now Viewing</h1>'))
display(HTML(movie_display.show([df.iloc[i] for i in movies])))
display(HTML('<h2>Similiar Movies</h2>'))
display(HTML(movie_display.show([df.iloc[i] for i in recommendation])))

### Let's try again

In [16]:
class Recommender_v2():
    SAMPLE_MULTIPLIER = 5

    def __init__(self, feature_zip, n_recommendations=10):
        # Check feature zip sizes
        if len(feature_zip) <= 0 or len(feature_zip[0]) != 2 or feature_zip[0][0].shape[0] <= 0:
            raise ValueError('Invalid feature_zip shape!')

        full_sample_size = feature_zip[0][0].shape[0]
        print('Finding similiarities between', full_sample_size, 'items...')

        # Sample size is 10 times the size of the number of recommendations that will be predicted
        sample_size = n_recommendations * self.SAMPLE_MULTIPLIER

        # Check sample size
        if full_sample_size < sample_size:
            raise ValueError('Too few samples!')

        # Create a knn to fit all features and get sample_size number of similiar items
        nbrs = NearestNeighbors(n_neighbors=sample_size, algorithm='auto', metric='cosine')
        self.graph = lil_matrix((full_sample_size, full_sample_size), dtype=np.float16)

        # For every feature
        for feature, weight in feature_zip:
            print('Analyzing feature with weight', weight, '...')
            # Get a graph of distances to the sample_size closest items
            distances, indices = nbrs.fit(feature).kneighbors(feature)
            temp = lil_matrix((full_sample_size, full_sample_size), dtype=np.float)
            for i, indices_for_item in enumerate(indices):
                # Weigh the distances
                temp[i, indices_for_item] = (1 - distances[i]) * weight
            self.graph += temp
        # Normalize by the sum of the weights
        print('Normalizing results...')
        self.graph /= sum([t[1] for t in feature_zip])
        print('Done!')

    def recommend(self, movie_index, n_recommendations=5):
        # We don't want to include the same movie so we exclude it
        return sorted(zip(self.graph.data[movie_index], self.graph.rows[movie_index]), key=lambda x: x[0], reverse=True)[1:n_recommendations + 1]

In [17]:
feature_zip = [
    (tfidf_plots, 1)
]

test_reco = Recommender_v2(feature_zip=feature_zip)

Finding similiarities between 9125 items...
Analyzing feature with weight 1 ...
Normalizing results...
Done!


In [18]:
# Lets get the N most similiar movies
movie = 3
best = sorted(zip(test_reco.graph.data[movie], test_reco.graph.rows[movie]), key=lambda x: x[0], reverse=True)
best[1:5]

[(0.24853515625, 4538),
 (0.24755859375, 6547),
 (0.2203369140625, 3091),
 (0.195068359375, 1524)]

In [19]:
# Testing with sparse matrices
a = lil_matrix((9125, 9125), dtype=np.float)
b = lil_matrix((9125, 9125), dtype=np.float)
b[[0], [0, 7556, 2506]] = 1 - np.array([.0, .2, .3])

a += b
a /= 2

print(a)

  (0, 0)	0.5
  (0, 2506)	0.35
  (0, 7556)	0.4


## Movie Recommendations v2

In [25]:
feature_zip = [
    (tf_title,    0.9),
    (tf_genres,   0.8),
    (tf_actors,   1.0),
    (tfidf_plots, 4.0),
    (tf_rated,    1.0)
]

recommender_v2 = Recommender_v2(feature_zip=feature_zip)

Finding similiarities between 9125 items...
Analyzing feature with weight 0.9 ...
Analyzing feature with weight 0.8 ...
Analyzing feature with weight 1.0 ...
Analyzing feature with weight 4.0 ...
Analyzing feature with weight 1.0 ...
Normalizing results...
Done!


In [26]:
movie = 0#14#1903#1000#2000#2801#1424#7842

recommendations = recommender_v2.recommend(movie, 12)
print(recommendations)
recommendations = [r[1] for r in recommendations] # get just the movie ids

display(HTML('<h1>Now Viewing</h1>'))
display(HTML(movie_display.show([df.iloc[movie]])))
display(HTML('<h2>Similiar Movies</h2>'))
display(HTML(movie_display.show([df.iloc[i] for i in recommendations])))

[(0.3449675324675325, 2506), (0.24984780844155843, 7556), (0.16030844155844157, 8437), (0.13811383928571427, 3829), (0.10387073863636363, 500), (0.10387073863636363, 1140), (0.10387073863636363, 1246), (0.10387073863636363, 1452), (0.10387073863636363, 1640), (0.10387073863636363, 2287), (0.10387073863636363, 2713), (0.10387073863636363, 3217)]


## App

In [None]:
# Recommender
recommender_v2 = Recommender_v2(feature_zip=[
    (tf_title,    1.8),
    (tf_genres,   2.0),
    (tf_actors,   2.0),
    (tfidf_plots, 4.0),
    (tf_rated,    1.0),
    (tf_years,    2.0)
])

Finding similiarities between 9125 items...
Analyzing feature with weight 1.8 ...
Analyzing feature with weight 2.0 ...
Analyzing feature with weight 2.0 ...


In [None]:
def on_movie_select(b):
    movie_preview.clear_output()
    with movie_preview:
        display(HTML(movie_display.show([df.iloc[dropdown_movie.value]])))

def recommend(b):
    movie_recommendations.clear_output()
    recommendations = recommender_v2.recommend(dropdown_movie.value, slider_reco_size.value)
    recommendations = [r[1] for r in recommendations] # get just the movie ids and not the scores
    with movie_recommendations:
        display(HTML(movie_display.show([df.iloc[i] for i in recommendations])))

def sort_movies(dropdown, tuples, alphabetically):
    dropdown.options = sorted(tuples, key=lambda x: x[0 if alphabetically else 1])

# Movie title and ids as data for the dropdown
movies_data = sorted(zip(df.Title, df.index))

# UI
dropdown_movie = w.Dropdown(description='Movie:')
dropdown_movie.options = movies_data

button_select = w.Button(description='Select')
button_select.on_click(on_movie_select)

slider_reco_size = w.IntSlider(min=0, max=20, value=10, description='Amount:')

button_recommend = w.Button(description='Recommend Movies')
button_recommend.on_click(recommend)

button_sort_dropdown_1 = w.Button(description="Sort Title")
button_sort_dropdown_1.disabled = True
button_sort_dropdown_1.on_click(lambda b: sort_movies(dropdown_movie, movies_data, True))
button_sort_dropdown_2 = w.Button(description="Sort Id")
button_sort_dropdown_2.disabled = True
button_sort_dropdown_2.on_click(lambda b: sort_movies(dropdown_movie, movies_data, False))

movie_preview = w.Output()
movie_recommendations = w.Output()

movie_selection = VBox([
    HBox([dropdown_movie, button_select, button_sort_dropdown_1, button_sort_dropdown_2]),
    HBox([slider_reco_size, button_recommend])
])

# Draw
app.clear_output()
with app:
    display(movie_selection)
    display(HTML('<h2>Now Viewing</h2>'))
    display(movie_preview)
    display(HTML('<h2>Watch List</h2>'))
    display(HTML('<h2>Recommendations</h2>'))
    display(movie_recommendations)

In [None]:
app