In [1]:
import numpy as np
# Seed NumPy's legacy global RNG once, up front. The splits and estimators
# further down additionally pass random_state=27 explicitly.
np.random.seed(27)
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Read the feature array (X) and the label array (Y) from disk.
# NOTE(review): both files are named "*_test.npy" yet they are the only data
# loaded here and are re-split into train/val/test below — confirm intended.
X, Y = (
    np.load('../data/source/x_test.npy'),
    np.load('../data/source/y_test.npy'),
)

In [3]:
# Stage 1: hold out 90% of the data as the final test set; the remaining 10%
# is the pool that train and validation are drawn from.
x_pool, x_test, y_pool, y_test = train_test_split(
    X, Y, stratify=Y, test_size=0.9, random_state=27
)

# Stage 2: split the pool again. The larger part (70%) is bound to the
# validation names and the smaller part (30%) to the training names, which is
# what the recorded class counts show (21000 vs 9000 per class).
# NOTE(review): validation ends up larger than train — confirm this is intended.
x_val, x_train, y_val, y_train = train_test_split(
    x_pool, y_pool, stratify=y_pool, test_size=0.3, random_state=27
)

# Per-class sample counts for test, validation and train, in that order.
for split_labels in (y_test, y_val, y_train):
    print(np.bincount(split_labels.astype('int32')))

[270000 270000 270000 270000]
[21000 21000 21000 21000]
[9000 9000 9000 9000]


In [4]:
# Collapse every axis after the sample axis so each sample becomes a single
# flat feature vector (2-D input for the estimators below). The right-hand
# side is fully evaluated before the names are rebound.
x_train, x_val, x_test = [
    split.reshape(split.shape[0], -1) for split in (x_train, x_val, x_test)
]
print(x_train.shape, x_val.shape, x_test.shape)

(36000, 70) (84000, 70) (1080000, 70)


# SVM

In [20]:
# Search space for the SVM grid search in the following cells.
Cs = [1, 10, 100]            # values passed as SVC(C=...)
kernels = ['poly', 'rbf']    # values passed as SVC(kernel=...)
degrees = [5, 7, 9]          # only iterated when the kernel is 'poly'
gammas = ['scale']  # only 'scale' is searched: earlier runs (not shown here) found it to be the best

In [21]:
# Tracker for the best SVM configuration seen so far: every hyperparameter
# starts unset (None) and the score to beat starts at zero.
best = dict.fromkeys(("C", "kernel", "degree", "gamma"))
best_score = 0.0

In [22]:
# Grid search for the SVM, scored on the validation split.
#
# The search space is flattened up front. `degree` is only varied for the
# polynomial kernel; any other kernel contributes a single entry whose degree
# slot is None and is never forwarded to SVC.
svm_candidates = [
    (C, kernel, gamma, degree)
    for C in Cs
    for kernel in kernels
    for gamma in gammas
    for degree in (degrees if kernel == 'poly' else [None])
]

for C, kernel, gamma, degree in svm_candidates:
    uses_degree = kernel == 'poly'

    svc_kwargs = dict(C=C, kernel=kernel, gamma=gamma, cache_size=10000, random_state=27)
    if uses_degree:
        svc_kwargs['degree'] = degree

    # fit() returns the estimator itself, so the call can be chained.
    cls = SVC(**svc_kwargs).fit(x_train, y_train)
    score = cls.score(x_val, y_val)

    if uses_degree:
        print("%s, %s, %s, %s ==> score: %f" %(C, kernel, gamma, degree, score))
    else:
        print("%s, %s, %s ==> score: %f" %(C, kernel, gamma, score))

    # Strict '>' keeps the earliest configuration when scores tie.
    if score > best_score:
        best_score = score
        best.update(C=C, kernel=kernel, degree=degree, gamma=gamma)

1, poly, scale, 5 ==> score: 0.971786
1, poly, scale, 7 ==> score: 0.972012
1, poly, scale, 9 ==> score: 0.966190
1, rbf, scale ==> score: 0.952345
10, poly, scale, 5 ==> score: 0.973726
10, poly, scale, 7 ==> score: 0.968524
10, poly, scale, 9 ==> score: 0.963060
10, rbf, scale ==> score: 0.964964
100, poly, scale, 5 ==> score: 0.971429
100, poly, scale, 7 ==> score: 0.965476
100, poly, scale, 9 ==> score: 0.963107
100, rbf, scale ==> score: 0.974083


In [23]:
# Report the winning validation score, then the configuration that produced it.
print(best_score, best, sep='\n')

0.9740833333333333
{'C': 100, 'kernel': 'rbf', 'degree': None, 'gamma': 'scale'}


In [24]:
# Refit an SVM with the configuration the grid search reported as best and
# measure accuracy on the held-out test split.
# NOTE(review): these hyperparameters are copied by hand from the printed
# `best` dict; they must be updated manually if the search is re-run.
cls = SVC(C=100, kernel='rbf', gamma='scale', random_state=27).fit(x_train, y_train)
score = cls.score(x_test, y_test)
print(score)

0.9748046296296297


# Random Forest

In [5]:
# Search space for the random-forest grid search in the following cells.
# The nested loops there visit the full Cartesian product of these four lists
# (6 * 2 * 5 * 3 = 180 fits).
N_Estimators = [25, 50, 75, 100, 125, 150]  # values passed as n_estimators
criterions = ['gini', 'entropy']            # values passed as criterion
max_depths = [2, 3, 5, 10, None]            # values passed as max_depth
max_Features = ['sqrt', 'log2', None]       # values passed as max_features

In [6]:
# Tracker for the best random-forest configuration seen so far: every
# hyperparameter starts unset (None) and the score to beat starts at zero.
# Note that this rebinds the same `best` / `best_score` names the SVM section used.
best = dict.fromkeys(("n_estimators", "criterion", "max_depth", "max_features"))
best_score = 0.0

In [7]:
# Grid search for the random forest, scored on the validation split.
#
# The four hyperparameter lists are expanded into one flat list of
# combinations, in the same order nested loops would visit them.
rf_candidates = [
    (n_estimators, criterion, max_depth, max_features)
    for n_estimators in N_Estimators
    for criterion in criterions
    for max_depth in max_depths
    for max_features in max_Features
]

for n_estimators, criterion, max_depth, max_features in rf_candidates:
    # NOTE(review): with bootstrap=False each tree presumably sees the whole
    # training set, so for max_features=None the trees may be near-identical —
    # confirm against the scikit-learn documentation.
    cls = RandomForestClassifier(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        max_features=max_features,
        n_jobs=11,
        bootstrap=False,
        random_state=27,
    ).fit(x_train, y_train)  # fit() returns the estimator itself
    score = cls.score(x_val, y_val)
    print("%s, %s, %s, %s ==> score: %f" %(n_estimators, criterion, max_depth, max_features, score))

    # Strict '>' keeps the earliest configuration when scores tie.
    if score > best_score:
        best_score = score
        best.update(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            max_features=max_features,
        )

25, gini, 2, sqrt ==> score: 0.800643
25, gini, 2, log2 ==> score: 0.780464
25, gini, 2, None ==> score: 0.678536
25, gini, 3, sqrt ==> score: 0.888774
25, gini, 3, log2 ==> score: 0.886167
25, gini, 3, None ==> score: 0.810476
25, gini, 5, sqrt ==> score: 0.924857
25, gini, 5, log2 ==> score: 0.922619
25, gini, 5, None ==> score: 0.891643
25, gini, 10, sqrt ==> score: 0.960929
25, gini, 10, log2 ==> score: 0.959929
25, gini, 10, None ==> score: 0.937690
25, gini, None, sqrt ==> score: 0.972750
25, gini, None, log2 ==> score: 0.971750
25, gini, None, None ==> score: 0.946714
25, entropy, 2, sqrt ==> score: 0.807845
25, entropy, 2, log2 ==> score: 0.821250
25, entropy, 2, None ==> score: 0.782405
25, entropy, 3, sqrt ==> score: 0.888012
25, entropy, 3, log2 ==> score: 0.892095
25, entropy, 3, None ==> score: 0.848560
25, entropy, 5, sqrt ==> score: 0.925952
25, entropy, 5, log2 ==> score: 0.925345
25, entropy, 5, None ==> score: 0.895024
25, entropy, 10, sqrt ==> score: 0.964440
25, ent

In [8]:
# Report the winning validation score, then the configuration that produced it.
print(best_score, best, sep='\n')

0.9757619047619047
{'n_estimators': 150, 'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt'}


In [9]:
# Refit a random forest with the configuration the grid search reported as
# best and measure accuracy on the held-out test split.
# NOTE(review): these hyperparameters are copied by hand from the printed
# `best` dict; they must be updated manually if the search is re-run.
cls = RandomForestClassifier(
    n_estimators=150,
    criterion='entropy',
    max_depth=None,
    max_features='sqrt',
    n_jobs=11,
    bootstrap=False,
    random_state=27,
).fit(x_train, y_train)
score = cls.score(x_test, y_test)
print(score)

0.9753685185185185
