# Movie Recommendation

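A content-based movie recommender: IMDb plot summaries are stripped of stop words, stemmed, and turned into TF-IDF vectors; the movies whose plot vectors are closest (cosine distance) to a given movie are returned as recommendations.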

In [52]:
# Core
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SKLearn
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# NLTK
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Display
from IPython.core.display import HTML
from movie_display import movie_display

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Import the movies into a data frame and analyze the available attributes.

## IMDb JSON

9125 Movies with 18 attributes from IMDb.

Attributes:

- Actors
- ...
- Plot
- Title
- Writer
- ...
- imdbVotes

In [3]:
# Read the IMDb export into a dataframe; orient='columns' tells pandas the
# JSON is keyed by column name.
imdb_json = './dataset/imdbdata.json'
df = pd.read_json(imdb_json, orient='columns')

# Preview the first two movies
df.head(n=2)

Unnamed: 0,Actors,Awards,Country,Director,Genre,Language,Plot,Poster,Production,Rated,Released,Runtime,Title,Writer,Year,imdbId,imdbRating,imdbVotes
0,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney",Nominated for 3 Oscars. Another 23 wins & 18 n...,USA,John Lasseter,"Animation, Adventure, Comedy",English,A cowboy doll is profoundly threatened and jea...,https://images-na.ssl-images-amazon.com/images...,Buena Vista,G,22 Nov 1995,81 min,Toy Story,"John Lasseter (original story by), Pete Docter...",1995,114709,8.3,666855
1,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",4 wins & 9 nominations.,USA,Joe Johnston,"Action, Adventure, Family","English, French",When two kids find and play a magical board ga...,https://images-na.ssl-images-amazon.com/images...,Sony Pictures Home Entertainment,PG,15 Dec 1995,104 min,Jumanji,"Jonathan Hensleigh (screenplay), Greg Taylor (...",1995,113497,6.9,223000


### Feature Extraction

Pipeline: https://stackoverflow.com/questions/36182502/add-stemming-support-to-countvectorizer-sklearn

Stop words are removed and the remaining words are stemmed, using NLTK's English stop-word list and its English Snowball stemmer.

#### Stopwords

#### Stemming

Stemming is useful, but it also transforms some correct words into incorrect ones, e.g. `james` to `jame` and `territory` to `terri`.

In [6]:
# Derived columns go on a copy so the loaded dataframe `df` is left untouched
features = df.copy(deep=True)

# English Snowball stemmer; ignore_stopwords=True is passed through to NLTK
stemmer = SnowballStemmer("english", ignore_stopwords=True)

# NLTK's English stop-word list (a plain Python list)
useless_words = stopwords.words('english')

# Show a small sample of the stop words
print(useless_words[:10])

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u"you're"]


In [15]:
# Normalise every plot: lower-case it, split on whitespace, drop stop words
# and stem what is left. The result is one space-joined string per movie, in
# the same order as the rows of `features`.

# `useless_words` is a list, so `w not in useless_words` is a linear scan for
# every token of every plot. A set gives the same answer with O(1) lookups.
stop_set = set(useless_words)

plots = [
    ' '.join(
        stemmer.stem(w)
        for w in sentence.lower().split()
        if w not in stop_set
    )
    for sentence in features.Plot
]

# Keep the normalised text next to the original attributes
features['PlotStripped'] = plots

  (0, 2150)	1
  (0, 3699)	1
  (0, 4702)	1
  (0, 5894)	1
  (0, 8268)	1
  (0, 10725)	1
  (0, 12250)	1
  (0, 13292)	1
  (0, 14508)	1
  (0, 15067)	1
  (0, 15544)	1
  (0, 15689)	1
  (0, 15753)	1
  (1, 1993)	1
  (1, 3989)	1
  (1, 4114)	1
  (1, 5920)	1
  (1, 5926)	1
  (1, 6389)	2
  (1, 7485)	1
  (1, 8596)	1
  (1, 9471)	1
  (1, 9557)	1
  (1, 11864)	1
  (1, 12845)	1
  :	:
  (9123, 11433)	1
  (9123, 12331)	1
  (9123, 13105)	1
  (9123, 14155)	1
  (9123, 15469)	1
  (9123, 16746)	1
  (9123, 17114)	1
  (9124, 151)	1
  (9124, 160)	1
  (9124, 296)	1
  (9124, 3081)	1
  (9124, 3139)	1
  (9124, 3266)	1
  (9124, 5645)	1
  (9124, 6717)	1
  (9124, 7312)	1
  (9124, 8051)	1
  (9124, 11074)	1
  (9124, 11646)	1
  (9124, 12012)	1
  (9124, 12707)	1
  (9124, 14708)	1
  (9124, 16002)	1
  (9124, 17083)	1
  (9124, 17221)	1


##### Count Vectorizer
Counting how often each word occurs in the plots using a bag-of-words approach (`CountVectorizer`), then transforming the raw term counts into TF-IDF weights (`TfidfTransformer`).

In [None]:
# Step 1 - bag of words: learn the vocabulary from the plots and count the
# occurrences of every term in every plot (sparse document-term matrix).
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(plots)

# Step 2 - re-weight the raw counts with tf-idf
tfidf_transformer = TfidfTransformer()
tfidf_transformed = tfidf_transformer.fit_transform(bag_of_words)

print(tfidf_transformed)

##### Tf-Idf
Computing the TF-IDF weights of the words in the plots directly with `TfidfVectorizer`, which combines the counting and re-weighting steps in a single estimator.

In [43]:
# Fit the tf-idf model on the normalised plots and vectorise them in one go.
# `tfidf` is the sparse matrix the recommender searches in.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(plots)

# Column legend for the sparse-matrix dump printed below
legend = "  (movie_index, word_index)\tfrequency\n"
print(legend)
print(tfidf)

  (movie_index, word_index)	frequency

  (0, 3699)	0.285569536179
  (0, 4702)	0.300674634015
  (0, 12250)	0.311117707177
  (0, 15544)	0.221507923585
  (0, 8268)	0.281585842431
  (0, 10725)	0.14056017235
  (0, 14508)	0.375179319771
  (0, 5894)	0.273084799199
  (0, 15067)	0.359040301003
  (0, 15689)	0.240468004571
  (0, 15753)	0.292409815321
  (0, 2150)	0.174819322564
  (0, 13292)	0.254955179759
  (1, 16044)	0.147451351623
  (1, 8596)	0.241421178144
  (1, 5920)	0.138828584827
  (1, 11864)	0.214389593492
  (1, 9471)	0.244812548242
  (1, 1993)	0.264815341526
  (1, 6389)	0.47556751375
  (1, 12845)	0.254733809374
  (1, 9557)	0.141229013001
  (1, 15823)	0.250478888353
  (1, 4114)	0.288447704831
  (1, 7485)	0.267767436411
  :	:
  (9123, 481)	0.17350771009
  (9123, 6402)	0.184847117568
  (9123, 1002)	0.181425250515
  (9123, 2969)	0.418468657648
  (9123, 5323)	0.209234328824
  (9123, 2172)	0.418468657648
  (9123, 483)	0.209234328824
  (9124, 16002)	0.157795382911
  (9124, 17083)	0.184077704398
 

## Feature vectors

In [113]:
def recommend(movie_index, topN):
    """Return the rows of the `topN` movies whose plots are closest to a movie.

    Closeness is the cosine distance between rows of the module-level
    `tfidf` matrix.

    Parameters
    ----------
    movie_index : int
        Row position of the query movie in `tfidf`. Negative values count
        from the end, as with normal Python indexing.
    topN : int
        Number of recommendations wanted.

    Returns
    -------
    numpy.ndarray
        Row positions of the recommended movies, nearest first, never
        containing the query movie itself. Holds fewer than `topN` entries
        only when the corpus has fewer than `topN` other movies.

    Raises
    ------
    IndexError
        If `movie_index` is outside the rows of `tfidf`.
    """
    n_movies = tfidf.shape[0]
    if not -n_movies <= movie_index < n_movies:
        raise IndexError(
            "movie_index %r is out of range for %d movies" % (movie_index, n_movies)
        )
    # Normalise negative positions so the self-exclusion below compares like with like
    row = movie_index % n_movies

    # The query movie is part of the fitted data, so ask for one extra
    # neighbour; never ask for more neighbours than there are movies.
    n_neighbors = min(topN + 1, n_movies)
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto', metric='cosine').fit(tfidf)

    # Query only the requested movie. The previous version queried every
    # movie and then always returned the neighbours of row 0.
    # `[[row]]` keeps the query two-dimensional (1 x n_features).
    distances, indices = nbrs.kneighbors(tfidf[[row]])
    neighbours = indices[0]

    # Drop the query movie by index rather than by position: with duplicate
    # plots (distance 0) it is not guaranteed to be the first neighbour.
    neighbours = neighbours[neighbours != row]
    return neighbours[:topN]

## Recommend movies

In [114]:
# Ask for the five nearest plots to the movie stored in row 0
recommendation = recommend(movie_index=0, topN=5)

In [115]:
# Look up the full row of every recommended movie, then render them as HTML
recommended_movies = [df.iloc[i] for i in recommendation]
HTML(movie_display.show(recommended_movies))