In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from vecstack import stacking
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
train_set = pd.read_csv("../train_cap2018.csv")
(nrow_default, ncol_default) = train_set.shape
features_list_default = train_set.columns.values[1:ncol_default-1]
features_list_default

array(['sentences', 'words', 'letters.all', 'syllables', 'punct',
       'avg.sentc.length', 'avg.word.length', 'avg.syll.word',
       'sntc.per.word', 'TTR', 'ARI', 'Bormuth', 'Coleman.C1',
       'Coleman.C2', 'Coleman.C3', 'Coleman.C4', 'Coleman.Liau',
       'Dale.Chall', 'Danielson.Bryan.DB1', 'Danielson.Bryan.DB2',
       'Dickes.Steiwer', 'DRP', 'ELF', 'Farr.Jenkins.Paterson', 'Flesch',
       'Flesch.Kincaid', 'FOG', 'FORCAST', 'Fucks', 'Linsear.Write',
       'LIX', 'nWS1', 'nWS2', 'nWS3', 'nWS4', 'RIX', 'SMOG', 'Spache',
       'Strain', 'Traenkle.Bailer.TB1', 'Traenkle.Bailer.TB2', 'TRI',
       'Tuldava', 'Wheeler.Smith', 'text', 'CTTR', 'HD-D (vocd-D)',
       "Herdan's C", 'Maas a', 'Maas lgV0', 'MATTR', 'MSTTR', 'MTLD',
       'Root TTR', 'Summer', 'TTR.1', 'Uber index', "Yule's K"],
      dtype=object)

## Drop string values

In [4]:
df = train_set

In [5]:
def prepare_data(df):
    df = df.drop('fulltext', axis = 1)
    df = df.drop('MATTR', axis=1)
    df = df.drop('MSTTR', axis=1)
    return df

def split_df_to_data_target(df):
    # Get list of features w/o feature containing string values
    (nrow, ncol) = df.shape
    features = df.columns.values[0:ncol-1]

    # Separating out the features
    x = df.loc[:, features].values

    # Separating out the target
    y = df.loc[:,['level1']].values
    return x,y

def standard_scale_x_data(x):
    return StandardScaler().fit_transform(x)

In [6]:
df = prepare_data(df)
x, y = split_df_to_data_target(df)

In [7]:
(nrow, ncol) = df.shape
ncol

57

## Dropping MATTR & MSTTR columns

In [8]:
# Get list of features w/o feature containing string values
(nrow, ncol) = df.shape
features = df.columns.values[0:ncol-1]


# Separating out the features
x = df.loc[:, features].values

# Separating out the target
y = df.loc[:,['level1']].values

In [9]:
df.describe()

Unnamed: 0,sentences,words,letters.all,syllables,punct,avg.sentc.length,avg.word.length,avg.syll.word,sntc.per.word,TTR,...,HD-D (vocd-D),Herdan's C,Maas a,Maas lgV0,MTLD,Root TTR,Summer,TTR.1,Uber index,Yule's K
count,27310.0,27310.0,27310.0,27310.0,27310.0,27310.0,27310.0,27310.0,27310.0,27310.0,...,27310.0,27310.0,27310.0,27310.0,27310.0,27310.0,27310.0,27310.0,27310.0,27310.0
mean,6.178103,69.591578,286.920871,94.919443,13.445405,13.808023,4.068831,1.342988,0.10253,0.723658,...,29.973272,0.919785,0.209477,4.36211,60.654317,5.621359,0.839242,0.723612,25.751004,175.234313
std,3.565665,41.665862,183.019658,60.276153,7.705855,11.104274,0.499349,0.145937,0.061869,0.101662,...,6.041291,0.02988,0.038736,0.985314,29.939369,1.308844,0.097361,0.101676,13.431118,96.680399
min,1.0,5.0,21.0,6.0,0.0,1.4375,2.230769,1.0,0.005435,0.294118,...,4.0,0.72,0.06,1.1,5.36,1.77,-2.51,0.29,4.0,11.34
25%,4.0,38.0,149.0,49.0,8.0,8.0,3.741098,1.24359,0.064516,0.652174,...,27.3825,0.9,0.19,3.73,39.675,4.62,0.81,0.65,18.75,110.19
50%,6.0,62.0,250.0,83.0,12.0,11.0,4.028037,1.333333,0.090909,0.714286,...,32.0,0.92,0.21,4.31,55.07,5.53,0.86,0.71,23.02,152.0
75%,8.0,92.0,380.0,125.0,17.0,15.5,4.34375,1.427251,0.125,0.791667,...,34.22,0.94,0.23,4.84,75.4,6.56,0.89,0.79,28.56,214.5
max,50.0,464.0,2050.0,654.0,122.0,184.0,9.913043,3.173913,0.695652,0.97619,...,41.0,0.99,0.5,14.23,493.92,11.93,3.51,0.98,251.78,1564.1


## Standardizing the features

In [10]:
x = StandardScaler().fit_transform(x)

In [11]:
df_x = df[df.columns[:ncol-1]]

In [12]:
df_x = StandardScaler().fit_transform(df_x)

In [13]:
df_x.shape

(27310, 56)

# Split x, y into train and test data

In [14]:
# from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
from mlxtend.evaluate import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer

In [15]:
# dividing X, y into train and test data
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state = 0)

In [16]:
# Cost matrix is given by http://cap2018.litislab.fr/competition_en.pdf
# Lower is better.
def compute_performance_from_confusion_matrix(confusion_matrix):
    cost_matrix = np.array([[0,1,2,3,4,6], [1,0,1,4,5,8],[3,2,0,3,5,8], [10,7,5,0,2,7], [20,16,12,4,0,8], [44,38,32,19,13,0]])
    n = confusion_matrix.sum()
    # Performance E = Dot product of cost matrix and confusion matrix / nb_sample
    performance_E = np.vdot(cost_matrix, confusion_matrix)/n
    return performance_E
def compute_performance_from_predictions(ground_truth, predictions):
    cm = confusion_matrix(np.ravel(ground_truth, order='C'), predictions)
    cost_matrix = np.array([[0,1,2,3,4,6], [1,0,1,4,5,8],[3,2,0,3,5,8], [10,7,5,0,2,7], [20,16,12,4,0,8], [44,38,32,19,13,0]])
    n = cm.sum()
    # Performance E = Dot product of cost matrix and confusion matrix / nb_sample
    performance_E = np.vdot(cost_matrix, cm)/n
    return performance_E

custom_loss = make_scorer(compute_performance_from_predictions, greater_is_better=False)

# SVM Base model

In [17]:
from sklearn.model_selection import cross_val_score
# from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.svm import SVC

In [18]:
C = 70.3639
svm_model = SVC(C = C, gamma=0.0097).fit(X_train,np.ravel(y_train,order='C'))

In [19]:
svm_predictions = svm_model.predict(X_test)
accuracy = svm_model.score(X_test, y_test)
print(accuracy)
# Result = 0.8255711775043937
compute_performance_from_predictions(y_test, svm_predictions)

0.8255711775043937


0.4094903339191564

# Find hyperparameters for classifier
**Random Forest, ExtraTreesClassifier, XGB Classifier, ... **

In [17]:
from hpsklearn import HyperoptEstimator, any_sparse_classifier, tfidf, any_classifier
from hyperopt import tpe
import array

WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


In [21]:
estim = HyperoptEstimator( classifier=any_classifier('clf'),  
                            algo=tpe.suggest, trial_timeout=300)

In [22]:
y_train
X_train.shape

(20482, 56)

In [23]:
estim.fit( X_train, y_train.ravel(), verbose=1 )

In [29]:
predictions = estim.predict(X_test)

  if diff:


In [30]:
predictions

array(['A2', 'B2', 'A1', ..., 'A1', 'A1', 'B2'], dtype=object)

In [35]:
print( estim.score( X_test, y_test.ravel()))
print(estim.best_model())
compute_performance_from_predictions(y_test, predictions)

0.7989162272993556
{'learner': XGBClassifier(base_score=0.5, booster='gbtree',
       colsample_bylevel=0.5714319639165071,
       colsample_bytree=0.8925838442029801, gamma=0.0034471649109639527,
       learning_rate=0.0007174472831284953, max_delta_step=0, max_depth=6,
       min_child_weight=9, missing=nan, n_estimators=200, n_jobs=1,
       nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0.003530449335628467, reg_lambda=1.1119234618341498,
       scale_pos_weight=1, seed=4, silent=True,
       subsample=0.6378778357947126), 'preprocs': (), 'ex_preprocs': ()}


  if diff:


0.4320445225541886

In [37]:
xgbc_clf_best = estim.best_model()['learner']
# xgbc_clf_best = XGBClassifier(base_score=0.5, booster='gbtree',
#        colsample_bylevel=0.5714319639165071,
#        colsample_bytree=0.8925838442029801, gamma=0.0034471649109639527,
#        learning_rate=0.0007174472831284953, max_delta_step=0, max_depth=6,
#        min_child_weight=9, missing=nan, n_estimators=200, n_jobs=1,
#        nthread=None, objective='multi:softprob', random_state=0,
#        reg_alpha=0.003530449335628467, reg_lambda=1.1119234618341498,
#        scale_pos_weight=1, seed=4, silent=True,
#        subsample=0.6378778357947126)

In [39]:
predict =  xgbc_clf_best.predict(X_test)

  if diff:


0.4320445225541886

## Find hyper parameter for Neural Net

In [18]:
from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform, conditional
from keras import optimizers
from keras.optimizers import RMSprop

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [19]:
def model(X_train, Y_train, X_test, Y_test):
    '''
    Model providing function:

    Create Keras model with double curly brackets dropped-in as needed.
    Return value has to be a valid python dictionary with two customary keys:
        - loss: Specify a numeric evaluation metric to be minimized
        - status: Just use STATUS_OK and see hyperopt documentation if not feasible
    The last one is optional, though recommended, namely:
        - model: specify the model just created so that we can later use it again.
    '''
    from keras.models import Sequential
    from keras.layers.core import Dense, Dropout, Activation
    from keras import optimizers
    from keras.optimizers import RMSprop

    model = Sequential()
    model.add(Dense(512, input_shape=(784,)))
    model.add(Activation('relu'))
    model.add(Dropout({{uniform(0, 1)}}))
    model.add(Dense({{choice([256, 512, 1024])}}))
    model.add(Activation({{choice(['relu', 'sigmoid'])}}))
    model.add(Dropout({{uniform(0, 1)}}))

    # If we choose 'four', add an additional fourth layer
    if conditional({{choice(['three', 'four'])}}) == 'four':
        model.add(Dense(100))
        # We can also choose between complete sets of layers
        model.add({{choice([Dropout(0.5), Activation('linear')])}})
        model.add(Activation('relu'))

    model.add(Dense(10))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer={{choice(['adam', 'sgd'])}})

    model.fit(X_train, Y_train,
              batch_size={{choice([32, 64, 128])}},
              nb_epoch=1,
              show_accuracy=True,
              verbose=2,
              validation_data=(X_test, Y_test))
    score, acc = model.evaluate(X_test, Y_test, show_accuracy=True, verbose=0)
    print('Test accuracy:', acc)
    return {'loss': -acc, 'status': STATUS_OK, 'model': model}

In [124]:
def model(X_train, Y_train, X_test, Y_test):
    from keras import optimizers
    '''
    Model providing function:

    Create Keras model with double curly brackets dropped-in as needed.
    Return value has to be a valid python dictionary with two customary keys:
        - loss: Specify a numeric evaluation metric to be minimized
        - status: Just use STATUS_OK and see hyperopt documentation if not feasible
    The last one is optional, though recommended, namely:
        - model: specify the model just created so that we can later use it again.
    '''
    model = Sequential()
    model.add(Dense(50, input_shape=(784,)))
    model.add(Activation('relu'))
    model.add(Dropout({{uniform(0, 1)}}))
    model.add(Dense({{choice([50, 45, 40])}}))
    model.add(Activation('relu'))
    model.add(Dropout({{uniform(0, 1)}}))
    model.add(Dense(10))
    model.add(Activation('softmax'))

#     rms = keras.optimizers.RMSprop()
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(X_train, Y_train,
              batch_size={{choice([64, 128])}},
              nb_epoch=1,
              verbose=2,
              validation_data=(X_test, Y_test))
    score, acc = model.evaluate(X_test, Y_test, verbose=0)
    print('Test accuracy:', acc)
    return {'loss': -acc, 'status': STATUS_OK, 'model': model}

In [21]:
X_train

array([[ 1.63279598,  0.99383952,  0.83642344, ..., -1.11741273,
        -0.45350769, -0.35182867],
       [ 0.51096503, -0.49421647, -0.56235902, ..., -0.33058453,
        -0.57710354,  1.80614567],
       [ 0.2305073 ,  0.94583771,  0.55775974, ..., -1.31411978,
        -0.58082631, -0.10616941],
       ...,
       [ 0.51096503, -0.01419841, -0.28369533, ..., -0.52729158,
        -0.48775714,  0.57722878],
       [ 0.2305073 ,  0.39381694,  0.18074416, ..., -1.31411978,
        -0.68953109,  0.98455768],
       [ 2.19371145,  1.85787203,  1.75437443, ..., -1.11741273,
        -0.27555945, -0.34779469]])

In [23]:
def datasets():
    return X_train, y_train.ravel(), X_test, y_test.ravel()
X_train, Y_train, X_test, Y_test = datasets()
trials = Trials()
# best_run, best_model = optim.minimize(model=model,
#                                           data=datasets,
#                                           algo=tpe.suggest,
#                                           max_evals=5,
#                                           trials=trials)
if __name__ == '__main__':
    best_run, best_model = optim.minimize(model=model,
                                      data=datasets,
                                      algo=tpe.suggest,
                                      max_evals=5,
                                      trials=Trials(),
                                      notebook_name='Untitled')

>>> Imports:
#coding=utf-8

try:
    import pandas as pd
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    import sklearn
except:
    pass

try:
    from sklearn.preprocessing import StandardScaler
except:
    pass

try:
    from sklearn.decomposition import PCA
except:
    pass

try:
    import matplotlib.pyplot as plt
except:
    pass

try:
    import seaborn as sns
except:
    pass

try:
    from sklearn.ensemble import ExtraTreesClassifier
except:
    pass

try:
    from sklearn.ensemble import RandomForestClassifier
except:
    pass

try:
    from xgboost import XGBClassifier
except:
    pass

try:
    from vecstack import stacking
except:
    pass

try:
    from sklearn.model_selection import train_test_split
except:
    pass

try:
    from sklearn.metrics import accuracy_score
except:
    pass

try:
    from mlxtend.plotting import plot_confusion_matrix
except:
    pass

try:
    from mlxtend.evaluate import confusion_matrix
except:
    pass

try:
    from

NameError: name 'X_train' is not defined

# Model stacking

In [41]:
models = [
    ExtraTreesClassifier(random_state=0, 
                         n_estimators=200),
        
    RandomForestClassifier(random_state=0, 
                           n_estimators=200),
        
    xgbc_clf_best,
    SVC(C = C, gamma=0.0097)
]

In [42]:
# from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
label_encoder = LabelEncoder()

In [43]:
y_train_encoded = label_encoder.fit_transform(y_train.flatten())
y_test_encoded = label_encoder.fit_transform(y_test.flatten())

In [44]:
S_train, S_test = stacking(models,                     # list of models
                           X_train, y_train_encoded.ravel(), X_test,   # data
                           regression=False,           # classification task (if you need 
                                                       #     regression - set to True)
                           mode='oof_pred_bag',        # mode: oof for train set, predict test 
                                                       #     set in each fold and vote
                           needs_proba=False,          # predict class labels (if you need 
                                                       #     probabilities - set to True) 
                           save_dir=None,              # do not save result and log (to save 
                                                       #     in current dir - set to '.')
                           metric=accuracy_score,      # metric: callable
                           n_folds=5,                  # number of folds
                           stratified=True,            # stratified split for folds
                           shuffle=True,               # shuffle the data
                           random_state=0,             # ensure reproducibility
                           verbose=2)                  # print all info

task:         [classification]
n_classes:    [6]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [4]

model  0:     [ExtraTreesClassifier]
    fold  0:  [0.78219512]
    fold  1:  [0.77568953]
    fold  2:  [0.79023199]
    fold  3:  [0.77289377]
    fold  4:  [0.77728938]
    ----
    MEAN:     [0.77965996] + [0.00608868]
    FULL:     [0.77966019]

model  1:     [RandomForestClassifier]
    fold  0:  [0.80487805]
    fold  1:  [0.79741274]
    fold  2:  [0.80610501]
    fold  3:  [0.79609280]
    fold  4:  [0.79902320]
    ----
    MEAN:     [0.80070236] + [0.00403768]
    FULL:     [0.80070306]

model  2:     [XGBClassifier]


  if diff:
  if diff:


    fold  0:  [0.78780488]


  if diff:
  if diff:


    fold  1:  [0.78911399]


  if diff:
  if diff:


    fold  2:  [0.79023199]


  if diff:
  if diff:


    fold  3:  [0.78217338]


  if diff:
  if diff:


    fold  4:  [0.78778999]
    ----
    MEAN:     [0.78742284] + [0.00277809]
    FULL:     [0.78742310]

model  3:     [SVC]
    fold  0:  [0.81463415]
    fold  1:  [0.80693190]
    fold  2:  [0.81489621]
    fold  3:  [0.79975580]
    fold  4:  [0.80439560]
    ----
    MEAN:     [0.80812273] + [0.00589226]
    FULL:     [0.80812421]



In [51]:
S_train[:5]

array([[2, 2, 2],
       [0, 0, 0],
       [2, 2, 2],
       [0, 0, 0],
       [1, 1, 1]])

## Stacked model : 2nd layer, naive (defauilt) model with XGBClassifier

In [45]:
# Initialize 2nd level model
model = XGBClassifier(random_state=0, learning_rate=0.01, 
                      n_estimators=100)
    
# Fit 2nd level model
model = model.fit(S_train, y_train_encoded)

# Predict
y_pred = model.predict(S_test)

# Final prediction score
print('Final prediction score: [%.8f]' % accuracy_score(y_test_encoded, y_pred))
# Final prediction score: [0.82776801]

Final prediction score: [0.82776801]


  if diff:


In [48]:
y_train_encoded

array([3, 0, 2, ..., 1, 1, 3], dtype=int64)

In [46]:
compute_performance_from_predictions(y_test_encoded, y_pred)
# 0.3961628588166374

0.3961628588166374

## Best stacked 2nd layer model

In [47]:
estim_stacked = HyperoptEstimator( classifier=any_classifier('clf'),  
                            algo=tpe.suggest, trial_timeout=500, verbose=True)

In [22]:
y_train
X_train.shape

(20482, 56)

In [50]:
estim_stacked.fit( S_train, y_train_encoded.ravel())

TERMINATING DUE TO TIMEOUT
Training learner ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='entropy',
           max_depth=None, max_features=0.16979995312295992,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=98, n_jobs=1, oob_score=False, random_state=0,
           verbose=False, warm_start=False) on X/EX of dimension (20482, 4)


In [73]:
stacked_2nd_layer = estim_stacked.best_model()['learner']
# stacked_2nd_layer_best = ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='entropy',
#            max_depth=None, max_features=0.16979995312295992,
#            max_leaf_nodes=None, min_impurity_decrease=0.0,
#            min_impurity_split=None, min_samples_leaf=1,
#            min_samples_split=2, min_weight_fraction_leaf=0.0,
#            n_estimators=98, n_jobs=1, oob_score=False, random_state=0,
#            verbose=False, warm_start=False)

In [69]:
predictions = stacked_2nd_layer.predict(S_test)

In [65]:
predictions

array([1, 3, 0, ..., 0, 0, 3], dtype=int64)

In [72]:
print( estim_stacked.score( S_test, predictions))
print(estim_stacked.best_model()['learner'])
print(compute_performance_from_predictions(y_test_encoded, predictions))
# 0.38927943760984185
print(accuracy_score(y_test_encoded, predictions))
# 0.8302577621558289

1.0
ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='entropy',
           max_depth=None, max_features=0.16979995312295992,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=98, n_jobs=1, oob_score=False, random_state=0,
           verbose=False, warm_start=False)
0.38927943760984185
0.8302577621558289


In [78]:
estim_stacked_xgboost = HyperoptEstimator( classifier=any_classifier('xgboost'),  
                            algo=tpe.suggest, trial_timeout=500, verbose=True)

In [79]:
estim_xgboost = estim_stacked_xgboost.fit( S_train, y_train_encoded.ravel())

Training learner GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.06765575510106916, loss='deviance',
              max_depth=None, max_features=0.9879187105689301,
              max_leaf_nodes=None, min_impurity_decrease=0.0,
              min_impurity_split=None, min_samples_leaf=7,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=19, presort='auto', random_state=1,
              subsample=0.6326695839978682, verbose=0, warm_start=False) on X/EX of dimension (20482, 4)


In [83]:
print(accuracy_score(y_train_encoded.ravel(), estim_stacked_xgboost.predict(S_train)))
prediction = estim_stacked_xgboost.predict(S_test)
print( estim_stacked_xgboost.score( S_test, prediction))
# print(estim_stacked.best_model()['learner'])
print(compute_performance_from_predictions(y_test_encoded, prediction))
# 0.3835676625659051
print(accuracy_score(y_test_encoded, prediction))
# 0.830697129466901

0.8161800605409628
1.0
0.3835676625659051
0.830697129466901
