In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Positional indices of the CSV columns to load: seven hand-picked leading
# columns followed by the contiguous block 16..32.
cols = [0, 1, 5, 7, 11, 12, 13] + list(range(16, 33))
train_data = pd.read_csv('train.csv', header=0, usecols=cols)
# The test file is read without the last selected position (32) --
# presumably the label column, which the test set lacks; TODO confirm.
test_data = pd.read_csv('test.csv', header=0, usecols=cols[:-1])

In [3]:
# Oversample label 1 by adding 9 extra copies of every label-1 row (the
# original note says that label has only a single entry).
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same frames
# in the same order and works on older pandas versions as well.
# NOTE: this rebinds train_data, so re-running the cell duplicates the
# label-1 rows again -- reload the CSV before re-executing.
# NOTE(review): the copies are added before any train/test split, so identical
# rows can land on both sides of a later random split.
label_one_rows = train_data[train_data.label == 1]
train_data = pd.concat([train_data] + [label_one_rows] * 9, ignore_index=True)

In [4]:
def find_best_gbc_model(X, y, test_size=0.2, random_state=42):
    """Randomized hyper-parameter search for a GradientBoostingClassifier.

    Selects the fixed feature columns from ``X``, holds out ``test_size`` of
    the rows, runs a 200-candidate ``RandomizedSearchCV`` (3-fold) on the
    remaining rows and prints the best parameters together with the accuracy
    on the training and held-out portions.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing at least the feature columns listed below.
    y : array-like
        Target labels aligned with the rows of ``X``.
    test_size : float, default 0.2
        Fraction of rows held out for the final accuracy report.
    random_state : int or None, default 42
        Seed used for the hold-out split, the base estimator and the
        parameter sampling, so that repeated runs report the same numbers.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object (``best_params_``, ``best_estimator_``, ...),
        so the winning configuration can be reused without retyping it.
    """
    feature_columns = [
        'num-comments', 'feedback-karma', 'ratings-given', 'ratings-received',
        'num-authors', 'prev-games', 'fun-average', 'innovation-average', 'theme-average',
        'graphics-average', 'audio-average', 'humor-average', 'mood-average', 'fun-rank',
        'innovation-rank', 'theme-rank', 'graphics-rank', 'audio-rank', 'humor-rank', 'mood-rank',
    ]
    X = X[feature_columns]
    # Seed the split: without it every run reports different accuracies.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    random_grid = {
        'n_estimators': [300, 400, 500, 600],
        'learning_rate': [0.25, 0.1, 0.05],
        'max_depth': [4, 5, 6, 7, 8],
        # Floats here are fractions of the training samples, not counts.
        'min_samples_split': [0.2, 0.3, 0.4, 0.5],
        'min_samples_leaf': [0.1, 0.2, 0.3],
        'max_features': ['sqrt']
    }
    # The estimator subsamples features (max_features='sqrt'), so it is seeded
    # too; otherwise the same candidate can score differently between runs.
    gbc_random = RandomizedSearchCV(
        estimator=GradientBoostingClassifier(random_state=random_state),
        param_distributions=random_grid, n_iter=200,
        cv=3, verbose=2, random_state=random_state, n_jobs=-1,
    ).fit(X_train, y_train)
    print(gbc_random.best_params_)
    print('training accuracy:', gbc_random.score(X_train, y_train))
    print('test accuracy:', gbc_random.score(X_test, y_test))
    return gbc_random

In [5]:
# Run the randomized search on the training frame. This is expensive: the
# recorded run below performed 600 fits and took about 103 minutes on 4 workers.
find_best_gbc_model(train_data, train_data['label'])

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 24.1min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 58.8min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 102.7min finished


{'n_estimators': 400, 'min_samples_split': 0.3, 'min_samples_leaf': 0.1, 'max_features': 'sqrt', 'max_depth': 6, 'learning_rate': 0.05}
training accuracy: 0.957756902931967
test accuracy: 0.9439890710382514


In [7]:
def fit_best_gbc_model(X, y, params, cv=False, random_state=42):
    """Fit a GradientBoostingClassifier with the given hyper-parameters.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing at least the feature columns listed below.
    y : array-like
        Target labels aligned with the rows of ``X``.
    params : dict
        Keyword arguments forwarded to ``GradientBoostingClassifier``.
    cv : bool, default False
        Despite the name this is not cross-validation: when true, a single
        67/33 hold-out split is made, the model is fitted on the 67% part and
        accuracy is printed for both parts. When false, the model is fitted on
        all rows and only the training accuracy is printed.
    random_state : int or None, default 42
        Seed for the hold-out split and for the estimator, so repeated runs
        give the same model. A ``'random_state'`` entry in ``params`` takes
        precedence for the estimator.

    Returns
    -------
    GradientBoostingClassifier
        The fitted model.
    """
    feature_columns = [
        'num-comments', 'feedback-karma', 'ratings-given', 'ratings-received',
        'num-authors', 'prev-games', 'fun-average', 'innovation-average', 'theme-average',
        'graphics-average', 'audio-average', 'humor-average', 'mood-average', 'fun-rank',
        'innovation-rank', 'theme-rank', 'graphics-rank', 'audio-rank', 'humor-rank', 'mood-rank',
    ]
    X = X[feature_columns]
    # Build a fresh dict so the caller's params are never mutated; entries in
    # params override the default seed.
    model_params = {'random_state': random_state, **params}
    if cv:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=random_state)
        gbc = GradientBoostingClassifier(**model_params).fit(X_train, y_train)
        print('training accuracy:', gbc.score(X_train, y_train))
        print('testing accuracy:', gbc.score(X_test, y_test))
    else:
        gbc = GradientBoostingClassifier(**model_params).fit(X, y)
        print('training accuracy:', gbc.score(X, y))
    return gbc

In [33]:
# Hyper-parameters for the final model, fitted on the full training frame.
# NOTE(review): n_estimators and min_samples_split differ from the
# best_params_ printed by the search above (400 and 0.3) -- confirm that this
# hand-tuned variant is intentional.
params = {
    'n_estimators': 500,
    'min_samples_split': 0.2,
    'min_samples_leaf': 0.1,
    'max_features': 'sqrt',
    # Must be an int: recent scikit-learn releases reject a float max_depth.
    'max_depth': 6,
    'learning_rate': 0.05
}
model = fit_best_gbc_model(train_data, train_data['label'], params, cv=False)

training accuracy: 0.9582820968256137


In [29]:
# Feature matrix for the test set: the same twenty columns, in the same order,
# that the model-fitting functions select from the training frame.
X2 = test_data.loc[:, [
    # activity / author statistics
    'num-comments', 'feedback-karma', 'ratings-given', 'ratings-received',
    'num-authors', 'prev-games',
    # per-category averages
    'fun-average', 'innovation-average', 'theme-average', 'graphics-average',
    'audio-average', 'humor-average', 'mood-average',
    # per-category ranks
    'fun-rank', 'innovation-rank', 'theme-rank', 'graphics-rank',
    'audio-rank', 'humor-rank', 'mood-rank',
]]

In [30]:
# Predict a label for every row of the test-set feature matrix using the
# model fitted on the training data.
predictions = model.predict(X2)

In [31]:
# Submission table: one row per test id with its predicted label; both
# columns are rounded and stored as 32-bit integers.
result = (
    pd.DataFrame({'id': test_data.id, 'label': predictions})
    .round()
    .astype('int32')
)

In [32]:
# Write the submission file without the index column. The original used an
# f-string with no placeholders; a plain literal yields the same path.
result.to_csv('submission_25.csv', index=False)