# Final version of the classifiers and meta classifier script


### Parameter settings

First, the parameters are set. They were obtained from a cross-validation performed separately for each classifier.

In [1]:
# Columns of the dataset that are used as features for classification.
nAttributes = [
    'Age',
    'popularity resquer id',
    'Breed1',
    'img_pixels',
    'PhotoAmt',
    'Sterilized',
    'description length',
    'img_ave_contrast',
    'Breed2',
    'Quantity',
    'Gender',
    'img_metadata_sentiment2',
    'beaut',
    'MaturitySize',
    'State',
    'Color3',
    'vaccin',
    'abandon',
    'Vaccinated',
    'Fee',
    'indoor',
    'cute',
    'great',
]

# Hyper-parameters of the individual classifiers.
# Decision tree: maximum depth.
max_depth = 8
# Random forest: number of trees.
n_estimators = 10
# Support Vector Machine and Logistic Regression: tolerance of the
# (gradient/line) search.
tol = 0.01
# K-Nearest Neighbor: number of neighbours.
nn = 15
# XGBoost: parameter dictionary.
xgb_params = dict(eval_metric='rmse', seed=1337, verbosity=0)

### Some additional pre-processing

The data is loaded and the relevant attributes are selected. A Gaussian (power) transform and a normalisation transform are applied to the appropriate variables. The dataset is also converted to dummy variables for the categorical attributes, so that Logistic Regression can be performed.

### Meta train/test set

The train data is divided into a meta train and meta test set, to train and test the meta classifier respectively. 10% of the train data is used for meta testing, which helps select the best model for meta classifying.  

In [2]:
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

from sklearn import model_selection
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import PowerTransformer
from sklearn import preprocessing



# Read the preprocessed training set from disk.
data = pd.read_csv('Data/preprocessedTrain3.csv')

# Feature frame: everything except the label column, missing values set to 0.
X = data.drop(['AdoptionSpeed'], axis=1).fillna(0)

# Remove the non-numerical columns (and, presumably, leftover index columns
# named 'Unnamed: ...' -- TODO confirm) in a single call.
X = X.drop(
    [
        'Description',
        'PetID',
        'RescuerID',
        'Unnamed: 0',
        'Unnamed: 0.1',
        'img_metadata_label',
    ],
    axis=1,
)

# Keep only the selected attributes, in the order given by nAttributes.
X = X[nAttributes]

# Label vector.
y = data['AdoptionSpeed']

# Read the preprocessed test set from disk.
test = pd.read_csv('Data/preprocessedtest3.csv')

# Drop the free-text column and set missing values to 0.
X_test = test.drop(['Description'], axis=1).fillna(0)

# Keep the pet identifiers (taken after the fill) before the column is removed.
id = X_test['PetID']

# Remove the remaining non-numerical columns in a single call.
X_test = X_test.drop(
    [
        'PetID',
        'RescuerID',
        'Unnamed: 0',
        'Unnamed: 0.1',
        'img_metadata_label',
    ],
    axis=1,
)

# Keep only the selected attributes, in the order given by nAttributes.
X_test = X_test[nAttributes]


# Split the feature columns into two groups based on the TRAINING data:
#   non_zer0 -- columns that contain no exact zero
#   zero     -- columns that contain at least one zero
# The reduction is done explicitly per column (axis=0). The previous
# `np.mean(X == 0)` dispatches to DataFrame.mean(axis=None), which returns a
# single scalar on pandas >= 2.0 and breaks the `.values` / `.index` accesses.
non_zer0 = ~(X == 0).any(axis=0)
zero = non_zer0[~non_zer0.values].index
non_zer0 = non_zer0[non_zer0.values].index

# Zero-free columns: Box-Cox power transform with standardisation, fitted on
# the training data and applied unchanged to the test data.
# NOTE(review): Box-Cox requires strictly positive input. A column without
# zeros may still hold negative values, and the same column in X_test may
# contain zeros; either case makes fit/transform raise -- confirm the data.
scaler = preprocessing.PowerTransformer(method='box-cox', standardize=True).fit(X[non_zer0])
X[non_zer0] = scaler.transform(X[non_zer0])
X_test[non_zer0] = scaler.transform(X_test[non_zer0])

# Remaining columns: plain standardisation, again fitted on the training data
# only. `scaler` refers to this StandardScaler after the block.
scaler = preprocessing.StandardScaler().fit(X[zero])
X[zero] = scaler.transform(X[zero])
X_test[zero] = scaler.transform(X_test[zero])

# Hold out 10% of the training rows as the meta test set; the split is
# stratified on the label vector y.
(meta_train,
 meta_test,
 meta_y_train,
 meta_y_test) = model_selection.train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
)



# Build dummy-variable (one-hot) versions of the meta train, meta test and
# test feature frames. Each categorical attribute that is among the selected
# attributes is replaced by its indicator columns.
Xlr_train = meta_train
Xlr_m_test = meta_test
Xlr_test = X_test
dummy = ['State','Type','Breed1','Breed2','Gender','Color1','Color2','Color3','Vaccinated','Dewormed','Sterilized']
for d in dummy:
    if d not in nAttributes:
        continue

    # The temporaries are deliberately NOT called `train` / `test`: the
    # module-level `test` DataFrame read from the CSV must not be overwritten.
    train_dummies = pd.get_dummies(Xlr_train[d], prefix=d)
    test_dummies = pd.get_dummies(Xlr_test[d], prefix=d)
    m_test_dummies = pd.get_dummies(Xlr_m_test[d], prefix=d)

    # Keep only the indicator columns present in all three frames so that they
    # end up with identical feature sets. Sorting makes the column order
    # reproducible between runs (iteration order of a set of strings is not).
    shared = sorted(
        set(train_dummies) & set(test_dummies) & set(m_test_dummies)
    )

    # Replace the original column by its encoded columns (joined on the index).
    Xlr_train = Xlr_train.drop(d, axis=1).join(train_dummies[shared])
    Xlr_test = Xlr_test.drop(d, axis=1).join(test_dummies[shared])
    Xlr_m_test = Xlr_m_test.drop(d, axis=1).join(m_test_dummies[shared])

  llf -= N / 2.0 * np.log(np.sum((y - y_mean)**2. / N, axis=0))
  return self.partial_fit(X, y)


## Classifiers

The following classifiers were trained on the meta train data:

+ Decision Tree classifier
+ Random Forest classifier
+ Logistic Regression
+ Support Vector Machine classifier
+ K-nearest Neighbor classifier
+ Naive Bayes classifier
+ XGBoost


The predictions 