# Movie Recommendation

later

In [1]:
# Core
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as scsp
from scipy.sparse import lil_matrix, csr_matrix, coo_matrix

# SKLearn
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# NLTK
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Display
from IPython.core.display import HTML
from movie_display import movie_display

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeroen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## IMDb JSON

Import movies into a data frame and analyze the availables attributes.

9125 Movies with 18 attributes from IMDb.

Attributes:

- Actors
- ...
- Plot
- Title
- Writer
- ...
- imdbVotes

In [2]:
# Load movies into a dataframe
df = pd.read_json('./dataset/imdbdata.json', orient='columns')
df.head(5)

Unnamed: 0,Actors,Awards,Country,Director,Genre,Language,Plot,Poster,Production,Rated,Released,Runtime,Title,Writer,Year,imdbId,imdbRating,imdbVotes
0,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney",Nominated for 3 Oscars. Another 23 wins & 18 n...,USA,John Lasseter,"Animation, Adventure, Comedy",English,A cowboy doll is profoundly threatened and jea...,https://images-na.ssl-images-amazon.com/images...,Buena Vista,G,22 Nov 1995,81 min,Toy Story,"John Lasseter (original story by), Pete Docter...",1995,114709,8.3,666855
1,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",4 wins & 9 nominations.,USA,Joe Johnston,"Action, Adventure, Family","English, French",When two kids find and play a magical board ga...,https://images-na.ssl-images-amazon.com/images...,Sony Pictures Home Entertainment,PG,15 Dec 1995,104 min,Jumanji,"Jonathan Hensleigh (screenplay), Greg Taylor (...",1995,113497,6.9,223000
2,"Walter Matthau, Jack Lemmon, Sophia Loren, Ann...",2 wins & 2 nominations.,USA,Howard Deutch,"Comedy, Romance",English,John and Max resolve to save their beloved bai...,https://images-na.ssl-images-amazon.com/images...,Warner Home Video,PG-13,22 Dec 1995,101 min,Grumpier Old Men,"Mark Steven Johnson (characters), Mark Steven ...",1995,113228,6.6,20100
3,"Whitney Houston, Angela Bassett, Loretta Devin...",8 wins & 8 nominations.,USA,Forest Whitaker,"Comedy, Drama, Romance",English,"Based on Terry McMillan's novel, this film fol...",https://images-na.ssl-images-amazon.com/images...,Twentieth Century Fox Home Entertainment,R,22 Dec 1995,124 min,Waiting to Exhale,"Terry McMillan (novel), Terry McMillan (screen...",1995,114885,5.7,7769
4,"Steve Martin, Diane Keaton, Martin Short, Kimb...",Nominated for 1 Golden Globe. Another 1 win & ...,USA,Charles Shyer,"Comedy, Family, Romance",English,George Banks must deal not only with the pregn...,https://images-na.ssl-images-amazon.com/images...,Disney,PG,08 Dec 1995,106 min,Father of the Bride Part II,"Albert Hackett (screenplay), Frances Goodrich ...",1995,113041,5.9,27815


## Feature Extraction

Pipeline: https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn

Excluding stopwords and stemming words according to the NLTK English language dictionaries.

### Stopwords

Frequent words with no meaning. Like: 'as', 'you', 'the'.

### Stemming

Useful but also transforms correct words into incorrect ones. Like `james` to `jame` and `territory` to `terri`.

### Features

- Plot
- Title
- Genre
- Actors
- Rated

In [3]:
features = df.copy()

# Get the stopwords for the English language
useless_words = stopwords.words('english')
# Create a stemmer for the English language
stemmer = SnowballStemmer("english", ignore_stopwords=True)

print(useless_words[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [4]:
plots = []
genres = []
titles = []
actors = []
for row in df.itertuples():
    # Plot
    plots.append(' '.join([stemmer.stem(w) for w in row.Plot.split()]))
    
    # Title
    plots.append(' '.join([stemmer.stem(w) for w in row.Title.split()]))
    
    # Genre
    genres.append(' '.join(row.Genre.split(', ')))
    
    # Actor
    actors.append(' '.join([actor.replace(' ', '') for actor in row.Actors.split(', ')]))

features['PlotStripped'] = plots
features['TitleStripped'] = titles
features['GenreStripped'] = genres
features['ActorsStripped'] = actors

#### Count Vectorizer

Finding the frequencies of words in the plots using a bag of words approach and transforming the term count to term frequency.

We do not use this approach for the plot anymore because the `TfidfVectorizer` class does both steps in one. The data contains 17354 unique stemmed words in the plots (excluding stopwords).

In [5]:
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit(features.PlotStripped).transform(features.PlotStripped)

tfidf_transformer = TfidfTransformer()
tfidf_transformed = tfidf_transformer.fit_transform(bag_of_words)
print(tfidf_transformed.shape)

(9125, 17354)


One-hot encoding the genres using a count vectorizer. The data contains 26 unique genres.

In [6]:
vectorizer = CountVectorizer()
tf_genres = vectorizer.fit(features.GenreStripped).transform(features.GenreStripped)
print(tf_genres.shape)

(9125, 26)


In [7]:
vectorizer = CountVectorizer()
tf_title = vectorizer.fit(features.Title).transform(features.TitleStripped)
print(tf_title.shape)

(9125, 7291)


In [47]:
vectorizer = CountVectorizer()
tf_actors = vectorizer.fit(features.ActorsStripped).transform(features.ActorsStripped)
print(tf_actors.shape)

(9125, 16476)


In [48]:
vectorizer = CountVectorizer()
tf_rated = vectorizer.fit(features.Rated).transform(features.Rated)
print(tf_rated.shape)

(9125, 13)


#### Tf-Idf
Finding the frequencies of words in the plots directly using the Tf-Idf vectorizer.

In [9]:
vectorizer = TfidfVectorizer()
tfidf_plots = vectorizer.fit_transform(plots)

print("  (movie_index, word_index)\tfrequency\n")
print(tfidf_plots)

  (movie_index, word_index)	frequency

  (0, 3701)	0.28556953617885883
  (0, 4705)	0.30067463401543315
  (0, 12258)	0.3111177071772179
  (0, 15554)	0.22150792358457966
  (0, 8272)	0.2815858424310979
  (0, 10731)	0.14056017234989235
  (0, 14517)	0.375179319771497
  (0, 5898)	0.2730847991986979
  (0, 15076)	0.35904030100261597
  (0, 15699)	0.24046800457052322
  (0, 15763)	0.29240981532129123
  (0, 2152)	0.1748193225642691
  (0, 13300)	0.2549551797587505
  (1, 16924)	0.1639970766079029
  (1, 16054)	0.14545498193972273
  (1, 8601)	0.2381525345156728
  (1, 5924)	0.13694896029372178
  (1, 11872)	0.2114869352246071
  (1, 9476)	0.2414979882602379
  (1, 1995)	0.2612299602216739
  (1, 6393)	0.46912872186121635
  (1, 12853)	0.2512849236996872
  (1, 9562)	0.13931688865036346
  (1, 15833)	0.24708761079906302
  (1, 4116)	0.2845423608189828
  :	:
  (9123, 481)	0.1735077100896544
  (9123, 6406)	0.18484711756795755
  (9123, 1003)	0.1814252505152987
  (9123, 2971)	0.4184686576483837
  (9123, 5327)	0.209

## Movie Recommendations v1

In [10]:
class Recommender():

    def __init__(self, n_recommendations=5):
        self.nbrs = NearestNeighbors(n_neighbors=n_recommendations + 1, algorithm='auto', metric='cosine')
        self.n_recommendations = n_recommendations

    def fit(self, features):
        self.nbrs.fit(features)
        self.distances, self.graph = self.nbrs.kneighbors(features)

    def recommend(self, movie_index):
        # We don't want to include the same movie so we exclude it
        return self.graph[movie_index, 1:]

In [11]:
# Concatenate features
concat = scsp.hstack([tf_title, tf_genres, tf_actors, tfidf_plots])
recommender = Recommender(n_recommendations=9)
recommender.fit(concat)

In [12]:
movies = [14]

recommendation = recommender.recommend(movies)[0]

display(HTML('<h1>Now Viewing</h1>'))
display(HTML(movie_display.show([df.iloc[i] for i in movies])))
display(HTML('<h2>Similiar Movies</h2>'))
display(HTML(movie_display.show([df.iloc[i] for i in recommendation])))

### Let's try again

In [13]:
class Recommender_v2():
    SAMPLE_MULTIPLIER = 5

    def __init__(self, feature_zip, n_recommendations=10):
        # Check feature zip sizes
        if len(feature_zip) <= 0 or len(feature_zip[0]) != 2 or feature_zip[0][0].shape[0] <= 0:
            raise ValueError('Invalid feature_zip shape!')

        full_sample_size = feature_zip[0][0].shape[0]
        print('Finding similiarities between', full_sample_size, 'items...')

        # Sample size is 10 times the size of the number of recommendations that will be predicted
        sample_size = n_recommendations * self.SAMPLE_MULTIPLIER

        # Check sample size
        if full_sample_size < sample_size:
            raise ValueError('Too few samples!')

        # Create a knn to fit all features and get sample_size number of similiar items
        nbrs = NearestNeighbors(n_neighbors=sample_size, algorithm='auto', metric='cosine')
        self.graph = lil_matrix((full_sample_size, full_sample_size), dtype=np.float16)

        # For every feature
        for feature, weight in feature_zip:
            print('Analyzing feature with weight', weight, '...')
            # Get a graph of distances to the sample_size closest items
            distances, indices = nbrs.fit(feature).kneighbors(feature)
            temp = lil_matrix((full_sample_size, full_sample_size), dtype=np.float)
            for i, indices_for_item in enumerate(indices):
                # Weigh the distances
                temp[i, indices_for_item] = (1 - distances[i]) * weight
            self.graph += temp
        # Normalize by the sum of the weights
        print('Normalizing results...')
        self.graph /= sum([t[1] for t in feature_zip])
        print('Done!')

    def recommend(self, movie_index, n_recommendations=5):
        # We don't want to include the same movie so we exclude it
        return sorted(zip(self.graph.data[movie_index], self.graph.rows[movie_index]), key=lambda x: x[0], reverse=True)[1:n_recommendations + 1]

In [14]:
feature_zip = [
    (tfidf_plots, 1)
]

test_reco = Recommender_v2(feature_zip=feature_zip)

Finding similiarities between 9125 items...
Analyzing feature with weight 1 ...
Normalizing results...
Done!


In [15]:
# Lets get the N most similiar movies
movie = 3
best = sorted(zip(test_reco.graph.data[movie], test_reco.graph.rows[movie]), key=lambda x: x[0], reverse=True)
best[1:5]

[(0.2403564453125, 3091),
 (0.2061767578125, 1524),
 (0.1982421875, 4538),
 (0.1839599609375, 1244)]

In [16]:
# Testing with sparse matrices
a = lil_matrix((9125, 9125), dtype=np.float)
b = lil_matrix((9125, 9125), dtype=np.float)
b[[0], [0, 7556, 2506]] = 1 - np.array([.0, .2, .3])

a += b
a /= 2

print(a)

  (0, 0)	0.5
  (0, 2506)	0.35
  (0, 7556)	0.4


## Movie Recommendations v2

In [67]:
feature_zip = [
    (tf_title,    1.0),
    (tf_genres,   0.8),
    (tf_actors,   1.0),
    (tfidf_plots, 4.0),
    (tf_rated,    1.0)
]

recommender_v2 = Recommender_v2(feature_zip=feature_zip)

Finding similiarities between 9125 items...
Analyzing feature with weight 1.0 ...
Analyzing feature with weight 0.8 ...
Analyzing feature with weight 1.0 ...
Analyzing feature with weight 4.0 ...
Analyzing feature with weight 1.0 ...
Normalizing results...
Done!


In [75]:
movie = 0#1903#1000#2000#2801#1424#7842

recommendations = recommender_v2.recommend(movie, 15)
print(recommendations)
recommendations = [r[1] for r in recommendations] # get just the movie ids

display(HTML('<h1>Now Viewing</h1>'))
display(HTML(movie_display.show([df.iloc[movie]])))
display(HTML('<h2>Similiar Movies</h2>'))
display(HTML(movie_display.show([df.iloc[i] for i in recommendations])))

[(0.2949719551282051, 2506), (0.26742788461538464, 7556), (0.15474759615384615, 8437), (0.11355669070512821, 3829), (0.1025390625, 500), (0.1025390625, 1140), (0.1025390625, 1246), (0.1025390625, 1452), (0.1025390625, 1640), (0.1025390625, 2287), (0.1025390625, 2713), (0.1025390625, 3217), (0.1025390625, 3805), (0.1025390625, 3979), (0.1025390625, 4002)]
