In [1]:
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split

X_file = r'dataset/X.csv'
X = pd.read_csv(X_file)

Y_file = r'dataset/Y.csv'
Y = pd.read_csv(Y_file)
Y = np.reshape(Y.values, [Y.shape[0],])

pd.set_option('display.max_columns', len(X.columns))
X.shape, Y.shape

((3874, 25), (3874,))

In [2]:
from sklearn_pandas import DataFrameMapper
from sklearn.feature_extraction.text import HashingVectorizer
N = 20

mapper = DataFrameMapper([
    ('adult', None),
    ('belongs_to_collection', HashingVectorizer(n_features=N)), 
    ('budget', None),
    ('genres', HashingVectorizer(n_features=N)),
    ('homepage', None),
    ('overview', HashingVectorizer(n_features=N)),
    ('popularity', None),
    ('production_companies', HashingVectorizer(n_features=N)),
    ('production_countries', HashingVectorizer(n_features=N)),
#    ('release_date', None),
#     ('revenue', None), 
    ('runtime', None),
    ('spoken_languages', None),
    ('tagline', HashingVectorizer(n_features=N)), 
    ('title', HashingVectorizer(n_features=N)),
    ('vote_average', None),
    ('vote_count', None),
    ('cast', HashingVectorizer(n_features=N)),
    ('keywords', HashingVectorizer(n_features=N)),
    ('cast_size', None),
    ('crew_size', None),
    ('director', HashingVectorizer(n_features=N)),
    ('producers', HashingVectorizer(n_features=N)),
    ('executive_producers', HashingVectorizer(n_features=N)),
])

X.fillna('', inplace=True) # can't have nan in any of the columns

features = mapper.fit_transform(X)
features.shape

(3874, 250)

In [10]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(features)

X = scaler.transform(features)

parameters = {
#     learning_rate only used when solver='sgd'
    'learning_rate': ["constant", "invscaling", "adaptive"],
#     'hidden_layer_sizes': [(100,1), (100,2), (100,3)],
#     'alpha': [10.0 ** -np.arange(1, 7)],
    'activation': ['identity', "logistic", "relu", "tanh"]
}

clf = RandomizedSearchCV(
    estimator=MLPClassifier(solver='sgd'),
    param_distributions=parameters,
    n_jobs=3,
    cv=StratifiedKFold(y=Y, n_folds=5)
)

In [11]:
clf.fit(X, Y)

print("------ RESULTS --------")
print("best score: {0}".format(clf.best_score_))
print("best_estimator: {0}".format(clf.best_estimator_.C))



------ RESULTS --------
best score: 0.728446050594




AttributeError: 'MLPClassifier' object has no attribute 'C'

In [None]:
# ignore below until we train up to 90-95% accuracy

In [166]:
X_tr, X_ts, Y_tr, Y_ts = train_test_split(features, Y, train_size = 0.7)
X_tr.shape, X_ts.shape, Y_tr.shape, Y_ts.shape

((2711, 8), (1163, 8), (2711, 1), (1163, 1))

In [168]:
# from sklearn.neural_network import MLPClassifier

num_neurons = features.shape[1]
num_iterations = 500

# 3 layers for now
mlp = MLPClassifier(hidden_layer_sizes=(num_neurons, num_neurons, num_neurons), max_iter=num_iterations)
mlp.fit(X_tr_sc, Y_tr)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(8, 8, 8), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [169]:
predictions = mlp.predict(X_ts_sc)

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(Y_ts, predictions))

print(classification_report(Y_ts, predictions))

[[169 171]
 [ 89 734]]
             precision    recall  f1-score   support

      False       0.66      0.50      0.57       340
       True       0.81      0.89      0.85       823

avg / total       0.77      0.78      0.77      1163



In [170]:
from sklearn.metrics import accuracy_score


precision = accuracy_score(predictions, Y_ts) * 100
print("Accuracy: {0:.2f}%".format(precision))

Accuracy: 77.64%
