In [2]:
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split

X_file = r'dataset/X.csv'
X = pd.read_csv(X_file)

Y_file = r'dataset/Y.csv'
Y = pd.read_csv(Y_file)

pd.set_option('display.max_columns', len(X.columns))
X.shape, Y.shape

((3874, 25), (3874, 1))

In [71]:
from sklearn_pandas import DataFrameMapper
from sklearn.feature_extraction.text import HashingVectorizer
N = 20

mapper = DataFrameMapper([
    ('adult', None),
    ('belongs_to_collection', HashingVectorizer(n_features=N)), 
    ('budget', None),
    ('genres', HashingVectorizer(n_features=N)),
    ('homepage', None),
    ('overview', HashingVectorizer(n_features=N)),
    ('popularity', None),
    ('production_companies', HashingVectorizer(n_features=N)),
    ('production_countries', HashingVectorizer(n_features=N)),
#    ('release_date', None),
#     ('revenue', None), 
    ('runtime', None),
    ('spoken_languages', None),
    ('tagline', HashingVectorizer(n_features=N)), 
    ('title', HashingVectorizer(n_features=N)),
    ('vote_average', None),
    ('vote_count', None),
    ('cast', HashingVectorizer(n_features=N)),
    ('keywords', HashingVectorizer(n_features=N)),
    ('cast_size', None),
    ('crew_size', None),
    ('director', HashingVectorizer(n_features=N)),
    ('producers', HashingVectorizer(n_features=N)),
    ('executive_producers', HashingVectorizer(n_features=N)),
])

X.fillna('', inplace=True) # can't have nan in any of the columns

features = mapper.fit_transform(X)
features.shape

(3874, 250)

In [72]:
X_tr, X_ts, Y_tr, Y_ts = train_test_split(features, Y, train_size = 0.7)
X_tr.shape, X_ts.shape, Y_tr.shape, Y_ts.shape

((2711, 250), (1163, 250), (2711, 1), (1163, 1))

In [81]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_tr)

X_tr_sc = scaler.transform(X_tr)
X_ts_sc = scaler.transform(X_ts)

In [103]:
from sklearn.neural_network import MLPClassifier

num_neurons = features.shape[1]
num_iterations = 5000

# 3 layers for now
mlp = MLPClassifier(hidden_layer_sizes=(num_neurons, num_neurons, num_neurons), max_iter=num_iterations)
mlp.fit(X_tr_sc, Y_tr)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(250, 250, 250), learning_rate='constant',
       learning_rate_init=0.001, max_iter=5000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='sgd', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [104]:
predictions = mlp.predict(X_ts_sc)

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(Y_ts, predictions))

print(classification_report(Y_ts, predictions))

[[144 166]
 [157 696]]
             precision    recall  f1-score   support

      False       0.48      0.46      0.47       310
       True       0.81      0.82      0.81       853

avg / total       0.72      0.72      0.72      1163



In [105]:
from sklearn.metrics import accuracy_score


precision = accuracy_score(predictions, Y_ts) * 100
print("Accuracy: {0:.2f}%".format(precision))

Accuracy: 72.23%
