In [178]:
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split

X_file = r'dataset/X.csv'
X = pd.read_csv(X_file)

Y_file = r'dataset/Y.csv'
Y = pd.read_csv(Y_file)

pd.set_option('display.max_columns', len(X.columns))

In [179]:
X.shape, Y.shape

((3874, 25), (3874, 1))

In [180]:
from sklearn_pandas import DataFrameMapper
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

N = 25

mapper = DataFrameMapper([
    ('adult', None),
    ('belongs_to_collection', HashingVectorizer(n_features=N)), 
#    ('budget', None),
    ('genres', HashingVectorizer(n_features=N)),
    ('homepage', None),
    ('overview', HashingVectorizer(n_features=N)),
    ('popularity', None),
    ('production_companies', HashingVectorizer(n_features=N)),
    ('production_countries', HashingVectorizer(n_features=N)),
#    ('release_date', None),
#    ('revenue', None), 
    ('runtime', None),
    ('spoken_languages', None),
    ('tagline', HashingVectorizer(n_features=N)), 
    ('title', HashingVectorizer(n_features=N)),
    ('vote_average', None),
    ('vote_count', None),
    ('cast', HashingVectorizer(n_features=N)),
    ('keywords', HashingVectorizer(n_features=N)),
    ('cast_size', None),
    ('crew_size', None),
    ('director', HashingVectorizer(n_features=N)),
    ('producers', HashingVectorizer(n_features=N)),
    ('executive_producers', HashingVectorizer(n_features=N)),
], input_df=True)

X.fillna('', inplace=True) # can't have nan in any of the columns

features = mapper.fit_transform(X)

In [181]:
X_tr, X_ts, Y_tr, Y_ts = train_test_split(features, Y, train_size = 0.7)

In [182]:
X_tr.shape, X_ts.shape, Y_tr.shape, Y_ts.shape

((2711, 309), (1163, 309), (2711, 1), (1163, 1))

In [183]:
from sklearn.decomposition import PCA
pca = PCA()

X_tr = pca.fit_transform(X_tr)
X_ts = pca.fit_transform(X_ts)

In [184]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

gbt = GradientBoostingClassifier(max_features="log2")
gbt.fit(X_tr, Y_tr)

p = gbt.predict(X_ts)

precision = accuracy_score(p, Y_ts) * 100
print("Accuracy using GB: {0:.2f}%".format(precision))

Accuracy using GB: 71.45%


In [185]:
from sklearn.ensemble import RandomForestClassifier

rft = RandomForestClassifier()
rft.fit(X_tr, Y_tr)

p = rft.predict(X_ts)

precision = accuracy_score(p, Y_ts) * 100
print("Accuracy using RF: {0:.2f}%".format(precision))

  after removing the cwd from sys.path.


Accuracy using RF: 68.36%


In [186]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

def svc_param_selection(X, y, n_folds, n_jobs):
    params = {'C': stats.uniform(0, 10),
          'gamma': stats.uniform(0, 1)}
    rand_search = RandomizedSearchCV(SVC(),
                                     param_distributions=params,
                                     cv=n_folds,
                                     n_jobs=jobs,
                                     random_state=2017)
    rand_search.fit(X, y)
    print(rand_search.best_params_)
    return rand_search.best_params_

best_params = svc_param_selection(features, Y, 3, 4)

svc = SVC(C=best_params['C'], gamma=best_params['gamma'])
svc.fit(X_tr, Y_tr)

p = svc.predict(X_ts)

precision = accuracy_score(p, Y_ts) * 100
print("Accuracy using SVC: {0:.2f}%".format(precision))

{'C': 0.20960225406117416, 'gamma': 0.7670701646824878}
Accuracy using SVC: 72.14%
