In [1]:
from pandas import DataFrame, read_csv
from helpers.preprocess import preprocess_data
from sklearn.metrics import accuracy_score
from helpers.training import TrainingData
from helpers.optimize import optimize_for_clf

import pandas as pd 
import numpy as np

In [9]:
from sklearn.ensemble import GradientBoostingClassifier

def test_gbc(x_tr, x_ts, y_tr, y_ts):
    y_tr = y_tr.values.ravel()
    y_ts = y_ts.values.ravel()
    gbt = GradientBoostingClassifier(max_features="log2")
    gbt.fit(x_tr, y_tr)

    p = gbt.predict(x_ts)

    accuracy = accuracy_score(p, y_ts) * 100
    return accuracy
    

In [6]:
from sklearn.ensemble import RandomForestClassifier

def test_rfc(x_tr, x_ts, y_tr, y_ts):
    y_tr = y_tr.values.ravel()
    y_ts = y_ts.values.ravel()
    rft = RandomForestClassifier()
    rft.fit(x_tr, y_tr)

    p = rft.predict(x_ts)

    accuracy = accuracy_score(p, y_ts) * 100
    return accuracy

In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

def test_svm(x_tr, x_ts, y_tr, y_ts):
    def svc_param_selection(X, y, jobs):
        params = {'C': stats.uniform(0, 10),
              'gamma': stats.uniform(0, 1)}
        rand_search = RandomizedSearchCV(SVC(),
                                         param_distributions=params,
                                         n_jobs=jobs,
                                         random_state=2017)
        rand_search.fit(X, y)
        print(rand_search.best_params_)
        return rand_search.best_params_

    best_params = svc_param_selection(x_tr, y_tr, -1)

    svc = SVC(C=best_params['C'], gamma=best_params['gamma'])
    svc.fit(x_tr, y_tr)

    p = svc.predict(x_ts)

    accuracy = accuracy_score(p, y_ts) * 100
#     print("Accuracy using SVC: {0:.2f}%".format(accuracy))
    return accuracy

In [3]:
# Movie metadata CSV, read once; the resulting frame is the input handed to
# every optimize_for_clf run further down.
movies_md = r'dataset/movies_tmdbMeta.csv'
og_movies_md_df = read_csv(movies_md)

In [2]:
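# Running PCA currently makes accuracy worse. Likely cause: the first draft
# called fit_transform on the test set too, which learns a second, unrelated
# projection. The test set must be projected with the PCA fitted on the
# training set (transform only), as below; re-measure before enabling this.
# from sklearn.decomposition import PCA

# # figure out n_components later
# pca = PCA(n_components=20)

# X_tr = pca.fit_transform(training_data.X_tr)
# X_ts = pca.transform(training_data.X_ts)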

In [10]:
# Hand the raw metadata frame and the gradient-boosting scorer to
# optimize_for_clf and keep whatever it returns as gbc_best.
# NOTE(review): optimize_for_clf comes from helpers.optimize and is not
# visible here; presumably it sweeps preprocessing settings and calls
# test_gbc once per candidate - confirm against the helper.
gbc_best = optimize_for_clf(og_movies_md_df, test_gbc)
print('Best performance with gbc: {0}'.format(gbc_best))

New best: {'num_vectorizer_features': 5, 'min_vote_count': 100, 'backfill_method': 'mean', 'accuracy': 60.193433443624336}




KeyboardInterrupt: 

In [None]:
# Same optimisation run as for gradient boosting, but scored with the
# random-forest helper; the result is kept as rfc_best.
# NOTE(review): optimize_for_clf comes from helpers.optimize and is not
# visible here; presumably it calls test_rfc once per candidate
# configuration - confirm against the helper.
rfc_best = optimize_for_clf(og_movies_md_df, test_rfc)
print('Best performance with rfc: {0}'.format(rfc_best))

In [26]:
# Same optimisation run again, scored with the SVM helper; the result is kept
# as svm_best. test_svm prints the best C/gamma it found on every call, so
# this cell emits one parameter dict per scorer invocation.
# NOTE(review): optimize_for_clf comes from helpers.optimize and is not
# visible here; how many times it invokes the scorer is an assumption to
# confirm against the helper.
svm_best = optimize_for_clf(og_movies_md_df, test_svm)
print('Best performance with svm: {0}'.format(svm_best))

{'C': 4.4791979980060219, 'gamma': 0.12054161556730447}
New best: {'num_vectorizer_features': 5, 'min_vote_count': 100, 'backfill_method': 'mean', 'accuracy': 43.573428353270558}


{'C': 4.4791979980060219, 'gamma': 0.12054161556730447}
{'C': 4.4791979980060219, 'gamma': 0.12054161556730447}
{'C': 4.4791979980060219, 'gamma': 0.12054161556730447}
{'C': 4.4791979980060219, 'gamma': 0.12054161556730447}
{'C': 4.4791979980060219, 'gamma': 0.12054161556730447}
{'C': 4.4791979980060219, 'gamma': 0.12054161556730447}
{'C': 4.4791979980060219, 'gamma': 0.12054161556730447}
{'C': 4.4791979980060219, 'gamma': 0.12054161556730447}
{'C': 4.4791979980060219, 'gamma': 0.12054161556730447}
{'C': 4.4791979980060219, 'gamma': 0.12054161556730447}
{'C': 4.4791979980060219, 'gamma': 0.12054161556730447}
{'C': 4.4791979980060219, 'gamma': 0.12054161556730447}
{'C': 4.4791979980060219, 'gamma': 0.12054161556730447}
{'C': 4.4791979980060219, 'gamma': 0.12054161556730447}
{'C': 4.4791979980060219, 'gamma': 

KeyboardInterrupt: 