In [11]:
from pandas import DataFrame, read_csv
from helpers.preprocess import preprocess_data
from sklearn.metrics import accuracy_score
from helpers.training import TrainingData

import pandas as pd 
import numpy as np

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

def test_gbc(x_tr, x_ts, y_tr, y_ts):
    """Fit a gradient-boosting classifier and return its test accuracy in percent.

    Args:
        x_tr: Training feature matrix.
        x_ts: Test feature matrix.
        y_tr: Training labels; a single-column DataFrame, a Series, or any
            array-like that flattens to one label per training row.
        y_ts: Test labels, in the same accepted forms as ``y_tr``.

    Returns:
        Accuracy on the test split, scaled to the 0-100 range.
    """
    # Flatten the labels to 1-D (test_rfc does the same): labels may arrive as
    # a single-column DataFrame, and scikit-learn warns on column-vector y.
    y_tr = np.asarray(y_tr).ravel()
    y_ts = np.asarray(y_ts).ravel()

    gbt = GradientBoostingClassifier(max_features="log2")
    gbt.fit(x_tr, y_tr)

    predictions = gbt.predict(x_ts)

    # accuracy_score is documented as (y_true, y_pred); keep that order.
    return accuracy_score(y_ts, predictions) * 100
    

In [17]:
from sklearn.ensemble import RandomForestClassifier

def test_rfc(x_tr, x_ts, y_tr, y_ts):
    """Fit a random-forest classifier and return its test accuracy in percent.

    Args:
        x_tr: Training feature matrix.
        x_ts: Test feature matrix.
        y_tr: Training labels; a single-column DataFrame, a Series, or any
            array-like that flattens to one label per training row.
        y_ts: Test labels, in the same accepted forms as ``y_tr``.

    Returns:
        Accuracy on the test split, scaled to the 0-100 range.
    """
    # np.asarray(...).ravel() matches .values.ravel() for pandas objects but
    # also accepts plain arrays and lists, which have no .values attribute.
    y_tr = np.asarray(y_tr).ravel()
    y_ts = np.asarray(y_ts).ravel()

    rft = RandomForestClassifier()
    rft.fit(x_tr, y_tr)

    predictions = rft.predict(x_ts)

    # accuracy_score is documented as (y_true, y_pred); keep that order.
    return accuracy_score(y_ts, predictions) * 100

In [14]:
# Relative path to the movie metadata CSV; resolved against the kernel's
# current working directory.
movies_md = r'dataset/movies_tmdbMeta.csv'
# Unprocessed frame as read from disk. optimize_for_clf reads this module-level
# name and hands it to preprocess_data for every configuration it tries.
og_movies_md_df = pd.read_csv(movies_md)

In [2]:
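# Currently, running PCA makes the results worse - need to investigate why.
# NOTE(review): the draft below calls fit_transform on the test split as well,
# which refits PCA on the test data; projecting the test split with
# pca.transform(...) instead is a likely fix - confirm.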
# from sklearn.decomposition import PCA

# # figure out n_components later
# pca = PCA(n_components=20)

# X_tr = pca.fit_transform(training_data.X_tr)
# X_ts = pca.fit_transform(training_data.X_ts)

In [20]:
def optimize_for_clf(method):
    """Grid-search the preprocessing settings for one classifier evaluator.

    Every combination of minimum vote count, backfill method and number of
    vectorizer features is tried. For each one the module-level
    ``og_movies_md_df`` is preprocessed, wrapped in a ``TrainingData`` and
    scored by ``method``. Each time a new best score is found it is printed.

    Args:
        method: Callable invoked as ``method(X_tr, X_ts, Y_tr, Y_ts)`` that
            returns a score comparable with ``>`` (higher is better), e.g.
            ``test_gbc`` or ``test_rfc``.

    Returns:
        dict with the best ``'accuracy'`` seen and the ``'min_vote_count'``,
        ``'backfill_method'`` and ``'num_vectorizer_features'`` that produced
        it. If no configuration scores above 0, only ``{'accuracy': 0}`` is
        returned.
    """
    min_vote_count_range = [100, 500, 1000, 5000]
    backfill_method_options = ['mean', 'median', 'mode']
    num_vectorizer_features_range = [5, 10, 20]

    best = {
        'accuracy': 0
    }

    for min_vote_count in min_vote_count_range:
        for backfill_method in backfill_method_options:
            # Preprocessing does not depend on the vectorizer size, so it is
            # done once per (min_vote_count, backfill_method) pair.
            df = preprocess_data(
                og_movies_md_df,
                min_vote_count,
                backfill_method
            )

            y = df[['rating']]
            # Use the keyword form: passing axis positionally
            # (drop(labels, 1)) is rejected by pandas >= 2.0.
            x = df.drop(columns=['rating'])

            for num_vectorizer_features in num_vectorizer_features_range:
                training_data = TrainingData(
                    X_df=x,
                    Y_df=y,
                    num_vectorizer_features=num_vectorizer_features
                )

                accuracy = method(
                    training_data.X_tr,
                    training_data.X_ts,
                    training_data.Y_tr,
                    training_data.Y_ts
                )

                # Strict '>' keeps the first configuration on ties.
                if accuracy > best['accuracy']:
                    best['min_vote_count'] = min_vote_count
                    best['backfill_method'] = backfill_method
                    best['num_vectorizer_features'] = num_vectorizer_features
                    best['accuracy'] = accuracy
                    print('New best: {0}\n\n'.format(best))

    return best

In [19]:
# Run the preprocessing grid search with the gradient-boosting evaluator and
# report the best configuration found. The saved output below records a
# KeyboardInterrupt, i.e. this run was stopped before it finished.
gbc_best = optimize_for_clf(test_gbc)
print('Best performance with gbc: {0}'.format(gbc_best))

KeyboardInterrupt: 

In [None]:
# Run the same preprocessing grid search with the random-forest evaluator and
# report the best configuration found.
rfc_best = optimize_for_clf(test_rfc)
print('Best performance with rfc: {0}'.format(rfc_best))

New best: {'num_vectorizer_features': 5, 'min_vote_count': 100, 'backfill_method': 'mean', 'accuracy': 56.401119877831505}




In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

def svc_param_selection(X, y, jobs):
    """Pick SVC hyper-parameters with a randomized search and return them.

    ``C`` is sampled from ``stats.uniform(0, 10)`` and ``gamma`` from
    ``stats.uniform(0, 1)``; the search is seeded with ``random_state=2017``.
    The chosen parameters are printed before being returned.

    Args:
        X: Training features handed to the search's ``fit``.
        y: Training labels handed to the search's ``fit``.
        jobs: Forwarded to ``RandomizedSearchCV`` as ``n_jobs``.

    Returns:
        The fitted search's ``best_params_`` dict (keys ``'C'`` and
        ``'gamma'``).
    """
    search_space = {
        'C': stats.uniform(0, 10),
        'gamma': stats.uniform(0, 1),
    }
    search = RandomizedSearchCV(
        SVC(),
        param_distributions=search_space,
        n_jobs=jobs,
        random_state=2017,
    )
    search.fit(X, y)

    chosen = search.best_params_
    print(chosen)
    return chosen

# NOTE(review): in the visible cells `training_data` is only assigned as a
# local inside optimize_for_clf, so this cell presumably depends on a variable
# left over from an earlier kernel session and would raise NameError on a
# fresh Restart & Run All - confirm and define it explicitly.
# The -1 is forwarded to RandomizedSearchCV as n_jobs.
best_params = svc_param_selection(training_data.X_tr, training_data.Y_tr, -1)

# Refit a fresh SVC on the training split using the selected C and gamma.
svc = SVC(C=best_params['C'], gamma=best_params['gamma'])
svc.fit(training_data.X_tr, training_data.Y_tr)

# Predict labels for the held-out split.
p = svc.predict(training_data.X_ts)

# Despite its name, `precision` holds accuracy_score * 100, i.e. an accuracy
# percentage, which is what the message below reports.
precision = accuracy_score(p, training_data.Y_ts) * 100
print("Accuracy using SVC: {0:.2f}%".format(precision))

KeyboardInterrupt: 