In [1]:
import numpy as np
import pandas as pd
import sys
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Load the four gzip-compressed, newline-delimited JSON datasets
# (lines=True: one JSON object per line).
# 'records' is the orient name documented by pandas.read_json; the shortened
# spelling 'record' is not one of the documented values.
gdata = pd.read_json('data/genres.json.gz', orient='records', lines=True)
odata = pd.read_json('data/omdb-data.json.gz', orient='records', lines=True)
rdata = pd.read_json('data/rotten-tomatoes.json.gz', orient='records', lines=True)
wdata = pd.read_json('data/wikidata-movies.json.gz', orient='records', lines=True)

In [3]:
# Attach the OMDb columns (we need 'omdb_plot') to the Wikidata rows.
# odata is re-indexed by 'imdb_id' so that join() can match its index against
# the 'imdb_id' column of wdata.
omdb_by_imdb_id = odata.set_index('imdb_id')
data = wdata.join(omdb_by_imdb_id, on='imdb_id')

In [4]:
# rdata's 'rotten_tomatoes_id' would clash with the column of the same name
# that `data` already carries after the previous join, so remove it first.
# errors='ignore' keeps this cell safe to re-run once the column is gone.
rdata = rdata.drop(columns=['rotten_tomatoes_id'], errors='ignore')

In [5]:
# Bring in the Rotten Tomatoes columns the same way as the OMDb join;
# 'audience_average' and 'critic_average' are the ones needed from rdata.
ratings_by_imdb_id = rdata.set_index('imdb_id')
data = data.join(ratings_by_imdb_id, on='imdb_id')

In [6]:
# Keep only the columns needed: the plot text is the model input and
# 'audience_average' becomes the class label.
# .copy() makes the selection an independent frame, so a subsequent column
# assignment on `data` does not raise SettingWithCopyWarning.
data = data[['imdb_id', 'enwiki_title', 'omdb_plot', 'audience_average', 'critic_average']].copy()

In [7]:
# Total number of elements (rows x columns) before any rows are dropped
data.size

202150

In [8]:
# Drop empty plots, step 1: flag the rows that have no usable plot text.
# A missing plot shows up as NaN (a float) while a real plot is a str, so
# anything that is not a str is flagged; this also covers None.
# assign() returns a new frame instead of writing into a slice of the joined
# data, which avoids SettingWithCopyWarning.
data = data.assign(
    is_empty_plot=data['omdb_plot'].map(lambda plot: not isinstance(plot, str))
)

In [9]:
# Drop empty plots, step 2: keep only the rows that were not flagged
has_plot = ~data['is_empty_plot']
data = data[has_plot]

In [10]:
# Keep only the rows that have an audience average (drop NaN values).
# Series.notna() is the pandas-native null check and replaces the
# `np.isnan(...) == False` comparison.
data = data[data['audience_average'].notna()]

In [11]:
# Keep only the rows that have a critic average (drop NaN values).
# Series.notna() is the pandas-native null check and replaces the
# `np.isnan(...) == False` comparison.
data = data[data['critic_average'].notna()]

In [12]:
# Total number of elements (rows x columns) after dropping rows with a
# missing plot or a missing review average
data.size

39582

In [13]:
# Round the audience average to a whole number so it can be used as a class
# label (the conversion to str happens in a separate step).
# NOTE: Series.round() rounds exact halves to the nearest even value.
# assign() builds a new frame rather than writing into the filtered slice,
# which avoids SettingWithCopyWarning.
data = data.assign(audience_average=data['audience_average'].round())

In [14]:
# Rounding cont'd: cast to int first so the decimals are trimmed, then to str
# so the values act as class labels.
# assign() builds a new frame rather than writing into the filtered slice,
# which avoids SettingWithCopyWarning.
data = data.assign(
    audience_average=data['audience_average'].astype('int').astype('str')
)

In [15]:
# Model input: the plot text of every remaining movie, as an array
plot_column = data['omdb_plot']
X = plot_column.values

In [16]:
# Class labels: the 'audience_average' values, in the same row order as X
label_column = data['audience_average']
y = label_column.values

In [17]:
# Hold out part of the data for validation (train_test_split's default sizes).
# random_state pins the shuffle so that the split, and therefore the scores
# computed from it, are the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42)

In [18]:
# Text classifier: word counts -> TF-IDF weighting -> multinomial naive Bayes
count_step = CountVectorizer()
tfidf_step = TfidfTransformer(use_idf=True)
classifier_step = MultinomialNB()
bayes_model = make_pipeline(count_step, tfidf_step, classifier_step)

In [19]:
# Fit every pipeline step on the training plots and their labels
bayes_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidftransformer',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('multinomialnb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [20]:
# Score on the training data. An earlier run gave about 0.876; the exact
# value depends on how the rows were split.
bayes_model.score(X_train, y_train)

0.868000808570851

In [21]:
# Score on the held-out validation data. An earlier run gave about 0.598,
# which is too low; the exact value depends on how the rows were split.
bayes_model.score(X_valid, y_valid)

0.5903030303030303

In [22]:
# SVM to be implemented below