In [253]:
import pandas as pd
import numpy as np
from time import time
import os, sys
import glob
%matplotlib inline

from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix
from scipy.stats import randint as sp_randint
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

scoring = make_scorer(accuracy_score)

# making validation set

In [254]:
df_train  = pd.read_csv("train_ori.csv",header=None,names=["file",'class']).set_index("file")

In [255]:
np.random.seed(100)
class_files = {}
for i in range(1,11,1):
    indexes = df_train[(df_train['class']==i)].index
    class_files[i] = np.random.choice(indexes,int(0.15*len(indexes)) ,replace=False)

 # Ensemble Method

In [240]:
def feature_label(feature="top",dataset = "train"):
    print(feature)
    dfs=[]
    for key in sorted(select_features[feature].keys()):
        df_this = pd.read_csv(select_features[feature][key] % (dataset)).set_index("Id").sort_index() #.astype('category')
       
        columns = [col+"_%s" % (key) for col in df_this.columns] 
        df_this.columns = columns
        
        
        #columns = [col for col in columns  if int(col.split("_")[-2]) in [0,1,2,3,4]]
        
        dfs.append(df_this[columns]) 
    if dataset=='test':
        df_f_l = pd.concat(dfs,axis=1)
        df_f_l["class"] = -1
    else:
        dfs.append(df[dataset])
        df_f_l = pd.concat(dfs,axis=1)
        
    return df_f_l.iloc[:,:-1], df_f_l.iloc[:,[-1]]

In [241]:
select_features = {
                   "raw":{  "model1":"features_raw_fea_S_%s_1.csv",
                            "model2":"features_raw_fea_S_%s_2.csv",
                            "model3":"features_raw_fea_S_%s_3.csv",
                            "model7":"features_raw_fea_S_%s_7.csv",
                         }
                  }

df = {
    "train":pd.read_csv("train.csv",header=None,names=["Id",'class']).set_index("Id").sort_index(),
    "test":pd.read_csv("test.csv",header=None,names=['Id']).set_index("Id").sort_index(),
    "val": pd.DataFrame([[f,key] for key in class_files for f in class_files[key]],columns=['Id',"class"]).set_index("Id").sort_index(),
    "chaval":pd.read_csv("chaval.csv",header=None,names=["Id",'class']).set_index("Id").sort_index(),
}

In [None]:
from sklearn import datasets
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

from scipy.stats import uniform
from xgboost import XGBRegressor

import GPy
import GPyOpt

from GPyOpt.methods import BayesianOptimization

# Load the diabetes dataset (for regression)
X,y = feature_label(feature="raw",dataset = "val")

# Instantiate an XGBRegressor with default hyperparameter settings
clf = XGBClassifier()

bds = [{'name': 'learning_rate', 'type': 'continuous', 'domain': (0.0001, 0.09)},
       {'name': 'n_estimators', 'type': 'discrete', 'domain': (500, 1000)},
       {"name": "max_depth", 'type':'discrete', 'domain': (3, 10)},
        {'name': 'min_child_weight', 'type': 'discrete', 'domain': (1, 5)},
         {'name': 'reg_alpha', 'type': 'continuous', 'domain': (0, 0.05)},
         {'name': 'reg_lambda', 'type': 'continuous', 'domain': (0.1, 1)},
         {'name': 'subsample', 'type': 'continuous', 'domain': (0.1, 0.6)},
         {'name': 'colsample_bytree', 'type': 'continuous', 'domain': (0.1, 0.6)},
        ]

# Optimization objective 
def cv_score(parameters):
    parameters = parameters[0]
    score = cross_val_score(
                    XGBClassifier(learning_rate=parameters[0],\
                        n_estimators=int(parameters[1]),\
                        max_depth=int(parameters[2]),\
                        min_child_weight=int(parameters[3]),\
                        reg_alpha = parameters[4],\
                        reg_lambda = parameters[5],\
                        subsample = parameters[6],\
                        colsample_bytree = parameters[7],\
                        n_jobs=5,\
                        objective= "multi:softprob",\
                        colsample_bylevel = 1,\
                        booster="gbtree",\
                        scale_pos_weight = 1,\
                        gamma = 0,\
                       ), 
                X, y, scoring="accuracy", cv=StratifiedKFold(n_splits=5, shuffle=True,random_state=100),n_jobs=10).mean()
    score = np.array(score)
    return score

optimizer = BayesianOptimization(f=cv_score, 
                                 domain=bds,
                                 model_type='GP',
                                 acquisition_type ='EI',
                                 acquisition_jitter = 0.05,
                                 exact_feval=True, 
                                 maximize=True)

# Only 20 iterations because we have 5 initial random points
optimizer.run_optimization(max_iter=50)

raw


In [246]:
optimizer.Y[np.argsort(optimizer.Y,axis=0).reshape(-1)][:5]

array([[-0.80457988],
       [-0.79552019],
       [-0.79549461],
       [-0.79547003],
       [-0.79440016]])

In [247]:
optimizer.X[np.argsort(optimizer.Y,axis=0).reshape(-1)][:5]

array([[5.39930163e-02, 5.00000000e+02, 1.00000000e+01, 1.00000000e+00,
        3.33308086e-02, 2.42777678e-01, 2.63011921e-01, 1.50900610e-01],
       [7.22438374e-03, 1.00000000e+03, 3.00000000e+00, 1.00000000e+00,
        4.48052850e-02, 8.78036989e-01, 3.88257479e-01, 3.01249371e-01],
       [5.07876924e-02, 5.00000000e+02, 1.00000000e+01, 1.00000000e+00,
        4.40209711e-02, 1.94824491e-01, 2.24155843e-01, 1.35345156e-01],
       [7.52601074e-02, 5.00000000e+02, 1.00000000e+01, 1.00000000e+00,
        3.41071685e-02, 2.75208050e-01, 2.22768460e-01, 1.55850192e-01],
       [2.22540150e-02, 5.00000000e+02, 1.00000000e+01, 1.00000000e+00,
        1.32233022e-02, 2.69048929e-01, 2.49035228e-01, 1.66581821e-01]])

array([[5.39930163e-02, 5.00000000e+02, 1.00000000e+01, 1.00000000e+00,
        3.33308086e-02, 2.42777678e-01, 2.63011921e-01, 1.50900610e-01],
       [7.22438374e-03, 1.00000000e+03, 3.00000000e+00, 1.00000000e+00,
        4.48052850e-02, 8.78036989e-01, 3.88257479e-01, 3.01249371e-01],
       [5.07876924e-02, 5.00000000e+02, 1.00000000e+01, 1.00000000e+00,
        4.40209711e-02, 1.94824491e-01, 2.24155843e-01, 1.35345156e-01],
       [7.52601074e-02, 5.00000000e+02, 1.00000000e+01, 1.00000000e+00,
        3.41071685e-02, 2.75208050e-01, 2.22768460e-01, 1.55850192e-01],
       [2.22540150e-02, 5.00000000e+02, 1.00000000e+01, 1.00000000e+00,
        1.32233022e-02, 2.69048929e-01, 2.49035228e-01, 1.66581821e-01]])


        
array([[-0.80457988],
       [-0.79552019],
       [-0.79549461],
       [-0.79547003],
       [-0.79440016]])

# XGboost

In [29]:
from xgboost import XGBClassifier

X,y = feature_label(feature="raw",dataset = "val")

clf = XGBClassifier(learning_rate=0.05,\
                    n_estimators=700,\
                    min_child_weight=1,\
                    booster="gbtree",\
                    max_depth=5,\
                    scale_pos_weight = 1,\
                    reg_alpha = 0.001,\
                    reg_lambda = 0.3,\
                    subsample = 0.5,\
                    colsample_bytree = 0.3,\
                    n_jobs=5,\
                    objective= "multi:softmax",
                    colsample_bylevel = 1
                   )

param_dist = {  
                "reg_lambda":[0.01,0.1,0.2,0.3]

             }


xgb_search = GridSearchCV(clf, param_grid=param_dist,
                               n_jobs=5,cv=StratifiedKFold(n_splits=5, shuffle=True,random_state=100))

start = time()
xgb_search.fit(X, y.values.ravel(),eval_metric='auc')
print("GridSearchCV took %.2f seconds for %d candidates"
      "parameter settings." % ((time() - start), n_iter_search))

print(xgb_search.best_score_ )
print(xgb_search.best_params_)

raw
GridSearchCV took 36.40 seconds for 200 candidatesparameter settings.
0.803129074315515
{'reg_lambda': 0.3}


# Train model with tune-parameters

In [248]:
from xgboost import XGBClassifier
X1,y1 = feature_label(feature="raw",dataset = "val")
X2,y2 = feature_label(feature="raw",dataset = "train")
X = pd.concat([X1,X2])
y = pd.concat([y1,y2])
np.random.seed(100)
re_sample = {1:2, 2: 1.7, 3:1.7, 4:1.5, 5:1.5, 6:1.55, 7:1.5, 8:0.6, 9:2, 10:2}
indexes = []
for m_class in re_sample:
    index_m_class = y[y['class']==m_class].index
    replace = True
    if re_sample[m_class]<1:
        replace = False
    indexes.append(np.random.choice(index_m_class ,\
                                    int(len(index_m_class)*re_sample[m_class]), replace=replace))

select = np.concatenate(indexes)

X = X.loc[select,:]
y = y.loc[select,:]
'''
clf_xgb = XGBClassifier(learning_rate=0.05,\
                    n_estimators=700,\
                    min_child_weight=1,\
                    booster="gbtree",\
                    max_depth=5,\
                    scale_pos_weight = 1,\
                    reg_alpha = 0.001,\
                    reg_lambda = 0.3,\
                    subsample = 0.5,\
                    colsample_bytree = 0.3,\
                    n_jobs=5,\
                    objective= "multi:softmax",
                    colsample_bylevel = 1
                   )
'''
parameters = optimizer.X[np.argsort(optimizer.Y,axis=0).reshape(-1)][0]
clf_xgb = XGBClassifier(learning_rate=parameters[0],\
                        n_estimators=int(parameters[1]),\
                        max_depth=int(parameters[2]),\
                        min_child_weight=int(parameters[3]),\
                        reg_alpha = parameters[4],\
                        reg_lambda = parameters[5],\
                        subsample = parameters[6],\
                        colsample_bytree = parameters[7],\
                        n_jobs=5,\
                        objective= "multi:softprob",\
                        colsample_bylevel = 1,\
                        booster="dart",\
                        scale_pos_weight = 1,\
                        gamma = 0,\
                       )

clf_xgb.fit(X,y)


raw
raw


XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
       colsample_bytree=0.15090061000010793, gamma=0,
       learning_rate=0.05399301625109518, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=500, n_jobs=5,
       nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0.03333080862715069, reg_lambda=0.2427776778906675,
       scale_pos_weight=1, seed=None, silent=True,
       subsample=0.26301192055025346)

In [249]:
y_pred = clf_xgb.predict(X)
y_true = y.values.ravel()
print("confusion_matrix: ")
print(confusion_matrix(y_true, y_pred))
print("accuracy_score:",accuracy_score(y_true, y_pred))

confusion_matrix: 
[[  72    0    0    0    0    0    0    0    0    0]
 [   0  755    1    0    0    0    0    2    0    0]
 [   0    0  888    0    0    0    0    1    0    0]
 [   0    0    0  553    0    0    0    0    0    0]
 [   0    0    0    0  588    0    0    0    0    0]
 [   0    0    0    0    0  576    0    0    0    0]
 [   0    0    0    0    0    0  416    2    0    0]
 [   0    2    3    0    0    0    0 1546    0    0]
 [   0    0    0    0    0    0    0    0  124    0]
 [   0    0    0    0    0    0    0    0    0  186]]
('accuracy_score:', 0.9980752405949256)


# My validation data set < download from the internet>

In [250]:
X,y = feature_label(feature="raw",dataset = "chaval")
y_pred = clf_xgb.predict(X)
y_true = y.values.ravel()
print("confusion_matrix: ")
print(confusion_matrix(y_true, y_pred))
print("accuracy_score:",accuracy_score(y_true, y_pred))

raw
confusion_matrix: 
[[ 10   1  11   0   0   6   0  16   0   0]
 [  0  16   3   0   0   2   0   4   0   0]
 [  0   0   5   0   3   0   0  34   0   0]
 [  0   0   0  31   0   0   0   0   0   0]
 [  4   0   1   0  38   0   1   1   0   0]
 [  0   2   0   0   0  43   0   7   0   1]
 [  0   1   1   0   2   0  22  55   0   0]
 [  0   2  13   0   0   5   0 123   1   0]
 [  0   0   2   8   0   6   0  35 141   5]
 [  0   0   0   0   0   0   1   4   0  38]]
('accuracy_score:', 0.6624113475177305)


# TEST is to submit

In [251]:
X_test,_ = feature_label(feature="raw",dataset = "test")



raw


In [252]:
df_submit = pd.DataFrame({"Id":X_test.index.values,"Genre":clf_xgb.predict(X_test)}).set_index("Id").reset_index()
df_submit.to_csv("xgb_submission.csv",header=True,index=False)



In [191]:
from sklearn.externals import joblib

joblib.dump(clf_xgb, "xgb_search.sav")

['xgb_search.sav']

# GradientBoostingClassifier

In [201]:
from sklearn.ensemble import GradientBoostingClassifier

X,y = feature_label(feature="raw",dataset = "val")
'''
X2,y2 = feature_label(feature="raw",dataset = "train")
X = pd.concat([X1,X2])
y = pd.concat([y1,y2])
'''
clf = GradientBoostingClassifier(n_estimators=200,\
                                 min_samples_leaf=2,\
                                 learning_rate = 0.01,\
                                 max_features="auto",\
                                 max_depth=7,\
                                 subsample = 0.3)


param_dist = { 
              'n_estimators': [400,500,600,700],\
              'max_depth':[5,6,7,8,9,10,11,12]
              }

n_iter_search = 200 #n_iter=n_iter_search,
grab_search = GridSearchCV(clf, param_grid=param_dist,
                              n_jobs=5,cv=StratifiedKFold(n_splits=5, shuffle=True,random_state=7))

start = time()
grab_search.fit(X, y.values.ravel())
print("GridSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

raw
GridSearchCV took 2233.29 seconds for 200 candidates parameter settings.


In [215]:
print(grab_search.best_score_ )
print(grab_search.best_params_)



0.8005215123859192
{'n_estimators': 400, 'max_depth': 6}


# Train model with tune-parameters

In [203]:
from sklearn.ensemble import GradientBoostingClassifier

X1,y1 = feature_label(feature="raw",dataset = "val")
X2,y2 = feature_label(feature="raw",dataset = "train")
X = pd.concat([X1,X2])
y = pd.concat([y1,y2])
# down sampling at class 8
np.random.seed(100)
re_sample = {1:2, 2: 1.7, 3:1.7, 4:1.5, 5:1.5, 6:1.55, 7:1.5, 8:0.6, 9:2, 10:2}
indexes = []
for m_class in re_sample:
    index_m_class = y[y['class']==m_class].index
    replace = True
    if re_sample[m_class]<1:
        replace = False
    indexes.append(np.random.choice(index_m_class , int(len(index_m_class)*re_sample[m_class]), replace=replace))

select = np.concatenate(indexes)

X = X.loc[select,:]
y = y.loc[select,:]


clf_grab = GradientBoostingClassifier(n_estimators=400,\
                                 min_samples_leaf=2,\
                                 learning_rate = 0.01,\
                                 max_features="auto",\
                                 max_depth=6,\
                                 subsample = 0.3)


clf_grab.fit(X,y)


raw
raw


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=6,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=2, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=400,
              presort='auto', random_state=None, subsample=0.3, verbose=0,
              warm_start=False)

In [204]:
y_pred = clf_grab.predict(X)
y_true = y.values.ravel()
confusion_matrix(y_true, y_pred)



array([[  72,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,  750,    2,    0,    0,    0,    0,    6,    0,    0],
       [   0,    0,  883,    0,    0,    0,    0,    6,    0,    0],
       [   0,    0,    0,  553,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,  588,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,  576,    0,    0,    0,    0],
       [   0,    0,    0,    1,    0,    0,  414,    3,    0,    0],
       [   0,    5,    4,    0,    0,    0,    2, 1540,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,  124,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,  186]])

In [205]:
print(accuracy_score(y_true, y_pred))

0.994925634295713


# My validation data set < download from the internet>

In [206]:
X,y = feature_label(feature="raw",dataset = "chaval")
y_pred = clf_grab.predict(X)
y_true = y.values.ravel()
print("confusion_matrix: ")
print(confusion_matrix(y_true, y_pred))
print("accuracy_score:",accuracy_score(y_true, y_pred))

raw
confusion_matrix: 
[[ 10   0  12   0   0   4   1  17   0   0]
 [  0  16   4   1   0   0   1   3   0   0]
 [  0   0   6   0   3   0   0  33   0   0]
 [  0   0   0  31   0   0   0   0   0   0]
 [  1   2   1   0  35   0   2   4   0   0]
 [  0   1   0   0   0  43   0   9   0   0]
 [  0   3   2   0   1   0  25  50   0   0]
 [  0   1  15   0   0   2   1 125   0   0]
 [  0   3   4   9   0   4   3  34 137   3]
 [  0   0   0   0   0   0   1   4   0  38]]
('accuracy_score:', 0.6609929078014184)


# TEST is to submit

In [216]:
X_test,_ = feature_label(feature="raw",dataset = "test")


df_submit = pd.DataFrame({"Id":X_test.index.values,"Genre":clf_grab.predict(X_test)}).set_index("Id").reset_index()
df_submit.to_csv("grab_submission.csv",header=True,index=False)




raw


# SAVE mode

In [263]:
from sklearn.externals import joblib

joblib.dump(clf_grab, "grab_search.sav")
joblib.dump(clf_xgb, "xgb_search.sav")


['xgb_search.sav']

In [264]:
model = joblib.load("xgb_search.sav")

X_test,_ = feature_label(feature="raw",dataset = "test")

df_submit = pd.DataFrame({"Id":X_test.index.values,"Genre":model.predict(X_test)}).set_index("Id").reset_index()
df_submit.to_csv("submission.csv",header=True,index=False)


raw


In [227]:
model = joblib.load("xgb_search.sav")

In [228]:
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.2687558325056498, gamma=0,
       learning_rate=0.005966833485049597, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=nan, n_estimators=500, n_jobs=5,
       nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0.05, reg_lambda=0.29025943448521924, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.5949681248740692)