# Final version of the classifiers and meta classifier script


### Parameter settings

First, parameters are set. The parameters are obtained from a crossvalidation performed for each seperate classifier. 

In [1]:
# Attributes from dataset that we use for classification

nAttributes = ['Age', 'popularity resquer id', 'Breed1', 'img_pixels', 'PhotoAmt', 'Sterilized', 'description length',
               'img_ave_contrast', 'Breed2', 'Quantity', 'Gender', 'img_metadata_sentiment2', 'beaut', 'MaturitySize',
               'State',  'Color3', 'vaccin', 'abandon', 'Vaccinated', 'Fee', 'indoor', 'cute', 'great']


max_depth = 8  #max depth of decision tree
n_estimators = 10  # number of trees in random forest
tol = 0.01  #tolerance in (gradient/line) search in Support Vector Machine classifier, Logistic Regression
nn = 15  # number of neighbors for K-Nearest Neighbor classifier
xgb_params = {  #parameters for XGBoost
    'eval_metric': 'rmse',
    'seed': 1337,
    'verbosity': 0,
}   

### Some additional pre-processing

Data is loaded and relevant attributes are obtained. A gaussian transform and normalisation transform is applied to the appropriate variables. Dataset is also converted with dummy variables for categorical attributes so that Logistic Regression can be performed. 

### Meta train/test set

The train data is divided into a meta train and meta test set, to train and test the meta classifier respectively. 10% of the train data is used for meta testing, which helps select the best model for meta classifying.  

In [2]:
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

from sklearn import model_selection
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import PowerTransformer
from sklearn import preprocessing



data = pd.read_csv('Data/preprocessedTrain3.csv') #import data
X = data.loc[:, data.columns != 'AdoptionSpeed'] #create X without labels
X = X.fillna(0)
X = X.drop('Description',axis=1) #drop non numerical values
X = X.drop('PetID',axis=1) #
X = X.drop('RescuerID',axis=1)
X = X.drop('Unnamed: 0',axis=1)
X = X.drop('Unnamed: 0.1',axis=1)
X = X.drop('img_metadata_label',axis=1)
X = X[nAttributes]
y = data['AdoptionSpeed'] #label vector

test = pd.read_csv('Data/preprocessedtest3.csv')


X_test = test.drop('Description',axis=1) #drop non numerical values
X_test = X_test.fillna(0)
id = X_test['PetID']
X_test = X_test.drop('PetID',axis=1) #
X_test = X_test.drop('RescuerID',axis=1)
X_test = X_test.drop('Unnamed: 0',axis=1)
X_test = X_test.drop('Unnamed: 0.1',axis=1)

X_test = X_test.drop('img_metadata_label',axis=1)
X_test = X_test[nAttributes]


non_zer0 = np.mean(X==0)==0
zero = non_zer0[non_zer0.values==False].index
non_zer0 = non_zer0[non_zer0.values==True].index

scaler = preprocessing.PowerTransformer(method='box-cox', standardize=True).fit(X[non_zer0])
X[non_zer0] = scaler.transform(X[non_zer0])
X_test[non_zer0] = scaler.transform(X_test[non_zer0])
scaler = preprocessing.StandardScaler().fit(X[zero])
X[zero] = scaler.transform(X[zero])
X_test[zero] = scaler.transform(X_test[zero])

meta_train, meta_test, meta_y_train, meta_y_test = model_selection.train_test_split(X,y,test_size=0.1,stratify=y)



Xlr_train = meta_train
Xlr_m_test = meta_test
Xlr_test = X_test
dummy = ['State','Type','Breed1','Breed2','Gender','Color1','Color2','Color3','Vaccinated','Dewormed','Sterilized']
for d in dummy:
    if(d in nAttributes):
        
        train = pd.get_dummies(Xlr_train[d],prefix=d)
        test = pd.get_dummies(Xlr_test[d],prefix=d)
        m_test = pd.get_dummies(Xlr_m_test[d],prefix=d)
        result = set(list(train))
        result.intersection_update(list(test))
        result.intersection_update(list(m_test))
        one_hottr = train[list(result)]
        one_hot = test[list(result)]
        one_hotm = m_test[list(result)]
        Xlr_train = Xlr_train.drop(d,axis = 1)
        # Join the encoded df
        Xlr_train = Xlr_train.join(one_hottr)
        
        Xlr_test = Xlr_test.drop(d,axis = 1)
        Xlr_test = Xlr_test.join(one_hot)
        Xlr_m_test = Xlr_m_test.drop(d,axis=1)
        Xlr_m_test = Xlr_m_test.join(one_hotm)

  llf -= N / 2.0 * np.log(np.sum((y - y_mean)**2. / N, axis=0))
  return self.partial_fit(X, y)


## Classifiers

The following classifiers were trained on the meta train data:

+ Decision Tree classifier
+ Random Forest classifier
+ Logistic Regression
+ Support Vector Machine classifier
+ K-nearest Neighbor classifier
+ Naive Bayes classifier
+ XG Boost


The predictions from these classifiers on the train data, the meta test data and the actual test data are obtained. 

In [3]:
classifiers = 'DTC RF LOGREG KNN SVM GNB XGB'.split(sep=' ')
predictions = np.zeros((len(X_test),len(classifiers)))
mlp_train = np.zeros((len(meta_train),len(classifiers)))
mlp_test = np.zeros((len(meta_test),len(classifiers)))

dtc = tree.DecisionTreeClassifier(criterion='gini',max_depth=max_depth) #train decision tree
dtc = dtc.fit(meta_train,meta_y_train)
predictions[:,0] = dtc.predict(X_test)
mlp_train[:,0] = dtc.predict(meta_train)
mlp_test[:,0] = dtc.predict(meta_test)

rf = RandomForestRegressor(n_estimators = n_estimators) #train random forest
rf = rf.fit(meta_train, meta_y_train)
predictions[:,1] = np.round(rf.predict(X_test),0)
mlp_train[:,1] = np.round(rf.predict(meta_train),0)
mlp_test[:,1] = np.round(rf.predict(meta_test),0)

logreg = LogisticRegression(tol=tol,solver='liblinear',multi_class='auto')  # train logistic regressor
logreg = logreg.fit(Xlr_train, meta_y_train)
predictions[:,2] = logreg.predict(Xlr_test)
mlp_train[:,2] = logreg.predict(Xlr_train)
mlp_test[:,2] = logreg.predict(Xlr_m_test)

knn = KNeighborsClassifier(nn)   # train KNN
knn = knn.fit(meta_train, meta_y_train)
predictions[:,3] = knn.predict(X_test)
mlp_train[:,3] = knn.predict(meta_train)
mlp_test[:,3] = knn.predict(meta_test)

svm = SVC(tol=tol,gamma='auto')   # train SVM
svm = svm.fit(meta_train, meta_y_train)
predictions[:,4] = svm.predict(X_test)
mlp_train[:,4] = svm.predict(meta_train)
mlp_test[:,4] = svm.predict(meta_test)

gnb = GaussianNB()   # train Naive Bayes
gnb = gnb.fit(meta_train, meta_y_train)
predictions[:,5] = gnb.predict(X_test)
mlp_train[:,5] = dtc.predict(meta_train)
mlp_test[:,5] = knn.predict(meta_test)

d_train = xgb.DMatrix(data=meta_train, label=meta_y_train, feature_names=meta_train.columns)  # train XG boost
d_val = xgb.DMatrix(data=meta_test,label=meta_y_test, feature_names=meta_test.columns)
evallist = [(d_val, 'eval'), (d_train, 'train')]
model = xgb.train(dtrain=d_train, num_boost_round=30000, evals=evallist, early_stopping_rounds=3000, verbose_eval=3000, params=xgb_params)
predictions[:,6] = np.round(model.predict(xgb.DMatrix(X_test, feature_names=X_test.columns), ntree_limit=model.best_ntree_limit),0)
mlp_train[:,6] = np.round(model.predict(xgb.DMatrix(meta_train, feature_names=meta_train.columns), ntree_limit=model.best_ntree_limit),0)
mlp_test[:,6] = np.round(model.predict(xgb.DMatrix(meta_test, feature_names=meta_test.columns), ntree_limit=model.best_ntree_limit),0)

  if getattr(data, 'base', None) is not None and \


[0]	eval-rmse:1.80809	train-rmse:1.80647
Multiple eval metrics have been passed: 'train-rmse' will be used for early stopping.

Will train until train-rmse hasn't improved in 3000 rounds.
[3000]	eval-rmse:1.1454	train-rmse:0.008655
[6000]	eval-rmse:1.14561	train-rmse:0.007215
Stopping. Best iteration:
[3305]	eval-rmse:1.14561	train-rmse:0.007215



### Classifier performances

In [4]:
correct = np.zeros((len(meta_train),len(classifiers)))
correct_test = np.zeros((len(meta_test),len(classifiers)))
for i in range(len(classifiers)):
    err = 1-np.mean(mlp_train[:,i]==meta_y_train)
    print('Train error for {} is: {:.4f}'.format(classifiers[i],err))
    err = 1-np.mean(mlp_test[:,i]==meta_y_test)
    print('Test error for {} is: {:.4f}'.format(classifiers[i],err))
    print()
    correct[:,i] = mlp_train[:,i] == meta_y_train
    correct_test[:,i] = mlp_test[:,i] == meta_y_test
    if(min(mlp_train[:,i])<0):
        print(classifiers[i])
        mlp_train[mlp_train[:,i]<0,i] = 0
    if(np.any(np.isnan(mlp_train[:,i]))):
        print(classifiers[i])
        
    if(max(mlp_train[:,i]>4)):
        print(classifiers[i])
        mlp_train[mlp_train[:,i]>4,i] = 4
    if(np.all(np.isfinite(mlp_train[:,i]))==0):
        print(classifiers[i])
        
correctdf = pd.DataFrame(correct)
correct_testdf = pd.DataFrame(correct_test)
print('In total, {:.2f}% of the meta training set is classified correctly by at least one classifier'.format(correctdf.max(axis=1).mean()*100))
print('In total, {:.2f}% of the meta test set is classified correctly by at least one classifier'.format(correct_testdf.max(axis=1).mean()*100))

Train error for DTC is: 0.5428
Test error for DTC is: 0.5807

Train error for RF is: 0.2477
Test error for RF is: 0.6827

Train error for LOGREG is: 0.5924
Test error for LOGREG is: 0.5987

Train error for KNN is: 0.5409
Test error for KNN is: 0.6227

Train error for SVM is: 0.5056
Test error for SVM is: 0.5773

Train error for GNB is: 0.5428
Test error for GNB is: 0.6227

Train error for XGB is: 0.0001
Test error for XGB is: 0.6760

In total, 99.99% of the meta training set is classified correctly by at least one classifier
In total, 79.87% of the meta test set is classified correctly by at least one classifier
