In [2]:
from sklearn.metrics import roc_auc_score

In [3]:
import collections
import collections.abc
import copy
from itertools import product,chain

import catboost as cb
import numpy as np
import pandas
from sklearn.model_selection import KFold
from sortedcontainers import SortedList

class paramsearch:
    """Incremental grid search over a dictionary of candidate parameter values.

    'pdict' should be a dictionary like the following:
        pdict = {'depth':[1,2], 'iterations':[250,100,500], 'thread_count':4}

    Values that are not sequences (and strings) are treated as a single candidate.

    grid_search returns an iterator that provides samples from the dictionary, e.g.
        {'depth':1, 'iterations':250, 'thread_count':4}
        {'depth':2, 'iterations':250, 'thread_count':4}
        {'depth':1, 'iterations':100, 'thread_count':4}
        etc.
    After each sample, test the classifier and call register_result. This updates the
    internal list of results, so that the next sample uses the best parameters found so
    far for all the parameters not currently being varied.

    grid_search can be given a list of keys, e.g. grid_search(['depth']); it then uses the
    current best parameters for every other key and only searches over 'depth'. A later
    grid_search(['iterations']) keeps the best depth found and cycles through 'iterations'.
    Searching incrementally can be much faster than a full grid search, but may miss the
    global optimum.
    """

    def __init__(self,pdict):
        """Normalise ``pdict`` so that every value is a sequence of candidates."""
        self.pdict = {}
        for name, candidates in pdict.items():
            # Anything that is not a sequence becomes a one-element list; strings are
            # sequences too but are treated as a single value.
            # collections.abc.Sequence is used because the bare collections.Sequence
            # alias is not available on Python 3.10+.
            if isinstance(candidates, collections.abc.Sequence) and not isinstance(candidates, str):
                self.pdict[name] = candidates
            else:
                self.pdict[name] = [candidates]
        # (score, params) pairs kept sorted, so the best score is always the final element
        self.results = SortedList()

    def grid_search(self,keys=None):
        """Yield parameter dicts for every combination of the given keys.

        keys: iterable of parameter names to vary; None means all keys of pdict.

        Each yielded dict is a copy of the current template with the varied keys
        overwritten. The template is re-read on every step: it is the best registered
        params if any results exist, otherwise the first candidate of every parameter.
        """
        if keys is None: keylist = list(self.pdict.keys())
        else: keylist = keys

        # one list of (key, value) pairs per searched key
        listoflists = [[(key, value) for value in self.pdict[key]] for key in keylist]
        for combo in product(*listoflists):
            update = dict(combo)
            if len(self.results) > 0:
                template = self.results[-1][1]
                # the current best has already been evaluated, so don't yield it again
                if self.equaldict(update, template): continue
            else:
                # Nothing has been scored yet: the default combination must be yielded
                # too, otherwise the baseline would never be evaluated.
                template = {a: b[0] for a, b in self.pdict.items()}
            # take the template and update just the keys being searched
            yield self.overwritedict(update, template)

    def equaldict(self,a,b):
        """Return True when every key of ``a`` has the same value in ``b``."""
        return all(a[key] == b[key] for key in a.keys())

    def overwritedict(self,new,old):
        """Return a deep copy of ``old`` with the entries of ``new`` written over it."""
        merged = copy.deepcopy(old)
        merged.update(new)
        return merged

    def register_result(self,result,params):
        """Store a (score, params) pair in the sorted results.

        A tiny amount of random noise is added to the score so that two entries never
        have exactly the same score (equal scores would make the sorted list compare the
        params dicts). The stored score therefore differs from ``result`` by ~1e-10.
        """
        self.results.add((result+np.random.randn()*1e-10,params))

    def bestscore(self):
        """Return the highest registered score (including its tie-breaking noise)."""
        return self.results[-1][0]

    def bestparam(self):
        """Return the params dict registered with the highest score."""
        return self.results[-1][1]

In [4]:
def crossvaltest(params,train_set,train_label,cat_dims,n_splits=3,random_state=None):
    """Return the mean k-fold accuracy of a CatBoostClassifier built from ``params``.

    params: keyword arguments passed to cb.CatBoostClassifier.
    train_set: feature table, indexed positionally with ``.iloc``.
    train_label: labels aligned row-by-row with ``train_set``, indexed with ``.iloc``.
    cat_dims: passed to ``fit`` as ``cat_features``.
    n_splits: number of folds.
    random_state: forwarded to KFold. The default (None) keeps the previous behaviour of
        drawing different shuffled folds on every call; pass an int to score every
        parameter set on the same folds.
    """
    kf = KFold(n_splits=n_splits,shuffle=True,random_state=random_state)
    fold_scores = []
    for train_index, test_index in kf.split(train_set):
        fold_train = train_set.iloc[train_index,:]
        fold_test = train_set.iloc[test_index,:]

        fold_train_labels = train_label.iloc[train_index]
        fold_test_labels = train_label.iloc[test_index]

        clf = cb.CatBoostClassifier(**params)
        clf.fit(fold_train, np.ravel(fold_train_labels), cat_features=cat_dims)

        # Flatten both sides so the comparison is always element-wise; a column-shaped
        # prediction array would otherwise broadcast against the 1-D labels.
        predictions = np.ravel(clf.predict(fold_test))
        fold_scores.append(np.mean(predictions==np.ravel(fold_test_labels)))
    return np.mean(fold_scores)

In [5]:
import numpy as np
import scipy.stats as stats

In [6]:
import xgboost as xgb
import catboost as cb

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
import category_encoders

In [8]:
from sklearn.model_selection import KFold
from itertools import product,chain

In [9]:
from sklearn.model_selection import RandomizedSearchCV

#### Levanto los datos de csv

In [10]:
# Kaggle test file; later cells index it by its 'person' column for prediction and submission.
kaggle = pd.read_csv('trocafone_kaggle_test.csv')

In [11]:
# Training labels; later cells use its 'person' and 'label' columns.
labels = pd.read_csv('labels_training_set.csv')

In [12]:
# Feature table, with a 'person' column that becomes the index in the next cell.
features_1 = pd.read_csv("features_08.csv")

In [13]:
# Index the features by 'person' so they can be joined onto the label and Kaggle tables.
processed = features_1.set_index('person')

In [14]:
# Positions of the categorical columns: every column of `processed` whose dtype is 'object'.
# NOTE(review): the positions refer to `processed`; they only match the frames passed to
# CatBoost if the joins add no extra columns in front -- confirm.
cat_features = [
    position
    for position, column in enumerate(processed.columns)
    if processed[column].dtype.name == 'object'
]

In [15]:
# Attach the feature columns to the persons of the Kaggle test file (joined on the 'person' index).
predict_features = kaggle.set_index('person').join(processed)

In [16]:
# Labelled persons joined with their features (joined on the 'person' index)
labeled_rows = labels.set_index('person').join(processed)

# Target column only
training_labels = labeled_rows['label']

# Training data without the target column
training_features = labeled_rows.drop(columns=['label'])

#### Definición de funciones para la grid search

In [17]:
# Candidate values per CatBoost parameter. paramsearch uses the first element of each
# list as the starting value, and wraps scalars such as thread_count in a one-element list.
# The dict used to define 'border_count' twice; Python keeps only the last definition, so
# the first list ([32,5,10,20,50,100,200]) was never used. Only the effective list is kept.
params = {'depth':[3,1,2,6,4,5,7,8,9,10],
          'iterations':[250,100,500,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3],
          'l2_leaf_reg':[3,1,5,10,100],
          'border_count':[50,5,10,20,100,200],
          'thread_count':4}

In [18]:
def catboost_param_tune(params,train_set,train_label,cat_dims=None,n_splits=3):
    """Tune CatBoost parameters one group at a time and return the best set found.

    params: dict of candidate values, in the format accepted by paramsearch.
    train_set, train_label, cat_dims, n_splits: forwarded to crossvaltest for scoring.

    Every candidate is scored with crossvaltest and registered, so each later search
    stage starts from the best parameters found so far. Progress is printed per candidate.
    """
    ps = paramsearch(params)
    # search 'border_count', 'l2_leaf_reg' etc. individually
    #   but 'iterations','learning_rate' together.
    # 'border_count' is searched once: a second identical pass would only re-run
    # cross-validation on candidates that were already scored.
    search_stages = chain(ps.grid_search(['border_count']),
                          ps.grid_search(['l2_leaf_reg']),
                          ps.grid_search(['iterations','learning_rate']),
                          ps.grid_search(['depth']))
    for prms in search_stages:
        res = crossvaltest(prms,train_set,train_label,cat_dims,n_splits)
        # save the crossvalidation result so that future iterations can reuse the best parameters
        ps.register_result(res,prms)
        print(res,prms,'best:',ps.bestscore(),ps.bestparam())
    return ps.bestparam()

In [19]:
# Split the labelled data into training and test parts.
# NOTE(review): the same split is executed again in a later cell before the model is fit,
# and nothing in between reads xtrain/xtest/ytrain/ytest, so this call looks redundant.
# No random_state is passed, so each run yields a different split.
xtrain,xtest,ytrain,ytest = train_test_split(training_features,training_labels)

In [None]:
# Run the incremental parameter search on the full labelled set.
# NOTE(review): this cell has no execution count, so `bestparams` was never assigned in
# the recorded session.
bestparams = catboost_param_tune(params,training_features,training_labels,cat_features)

In [20]:
# Show the tuned parameters. The recorded NameError comes from the tuning cell above not having been run.
bestparams

NameError: name 'bestparams' is not defined

In [21]:
# Hand-written parameter set used for the final model below
# (presumably copied from an earlier tuning run -- confirm).
best = dict(
    depth=3,
    iterations=1000,
    learning_rate=0.03,
    l2_leaf_reg=3,
    border_count=20,
    thread_count=4,
)

### Pruebo los parámetros

In [22]:
# Hold-out split used to check the `best` parameters; no random_state is passed, so the
# split (and the scores below) differ from run to run.
xtrain,xtest,ytrain,ytest = train_test_split(training_features,training_labels)

In [23]:
# Build the classifier from the hand-written `best` parameter dict.
cb_classifier = cb.CatBoostClassifier(**best)

In [24]:
# Fit on the training part of the split only; verbose=0 presumably silences the training log.
cb_classifier.fit(xtrain,ytrain,cat_features=cat_features,verbose=0)

<catboost.core.CatBoostClassifier at 0x214a87ce630>

In [25]:
# AUC on the held-out part of the split; the recorded output is a dict with one 'AUC' list.
cb_classifier.eval_metrics(cb.Pool(xtest,ytest,cat_features=cat_features),'AUC',plot=True)

{'AUC': [0.5775808798356727,
  0.6007844060253338,
  0.604539969188634,
  0.6193414070523793,
  0.6214733824032865,
  0.6214733824032865,
  0.8002400718931872,
  0.794359808284834,
  0.8227028414926395,
  0.826802892844916,
  0.8267194453954125,
  0.8281701472098597,
  0.8277092605272167,
  0.8298301095515235,
  0.8295519513865115,
  0.8297710544334131,
  0.8320326942827798,
  0.8314519856213626,
  0.8327751626155426,
  0.8332634371790483,
  0.8326300924340979,
  0.8331179390619651,
  0.8339387196165697,
  0.8325376583361863,
  0.8324024306744267,
  0.8326532009585759,
  0.8325795960287572,
  0.8335873844573776,
  0.8335291852105444,
  0.8345485279014035,
  0.8349293906196509,
  0.8350299554946936,
  0.8384731256419035,
  0.8471516603902773,
  0.8468281410475864,
  0.8463026360835331,
  0.8462102019856214,
  0.8463933584388907,
  0.8522090037658336,
  0.8521644984594317,
  0.8513343033207805,
  0.8509243409791167,
  0.8510698390961999,
  0.8510685552892845,
  0.8505901232454639,
  0.85

In [26]:
# ROC AUC on the held-out part, scored with column 1 of predict_proba
# (presumably the positive-class probability -- confirm).
roc_auc_score(ytest,cb_classifier.predict_proba(xtest)[:,1])

0.8678586100650462

In [27]:
# Refit the same classifier object on all labelled rows before predicting the Kaggle set.
# After this cell xtest is part of the training data, so re-running the evaluation cells
# above would no longer measure held-out performance.
cb_classifier.fit(training_features, np.ravel(training_labels), cat_features=cat_features,verbose=0)


<catboost.core.CatBoostClassifier at 0x214a87ce630>

In [28]:
# Persist the fitted model.
# NOTE(review): relative path; presumably the "preentrenados" directory must already exist -- confirm.
cb_classifier.save_model("preentrenados/cb_f08")

In [53]:
# Column 1 of predict_proba for every Kaggle test person (presumably the positive-class
# probability -- confirm).
# NOTE(review): the execution counter jumps from 28 to 53 here, so cells were run in
# between that are not in the notebook; verify with Restart & Run All.
proba_loco=cb_classifier.predict_proba(predict_features)[:,1]

In [54]:
# Submission table: Kaggle persons as the index plus a 'label' column holding the predictions.
submit = kaggle.set_index('person').assign(label=proba_loco)

In [55]:
# Write the submission file; the 'person' index is written as the first column.
submit.to_csv('submit.csv')

In [None]:
# Prints the misclassification rate of `clf` on `test_set`.
# NOTE(review): `clf`, `test_set` and `test_label` are not defined at top level anywhere
# in the visible notebook (`clf` only exists inside crossvaltest), and this cell has no
# execution count. It looks like a leftover and would raise NameError on a fresh run.
res = clf.predict(test_set)
print('error:',1-np.mean(res==np.ravel(test_label)))