In [1]:
from sklearn.metrics import roc_auc_score

In [2]:
import numpy as np
import scipy.stats as stats

In [3]:
import xgboost as xgb
import catboost as cb

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import category_encoders

In [5]:
from sklearn.model_selection import RandomizedSearchCV

In [6]:
# Kaggle test file; its 'person' column is used later as the index of the
# prediction-feature frame and of every submission frame.
kaggle = pd.read_csv('trocafone_kaggle_test.csv')

In [7]:
# Training labels; indexed by 'person' later and expected to carry a 'label' column.
labels = pd.read_csv('labels_training_set.csv')

In [8]:
# First feature table, keyed by 'person'.
# NOTE(review): the variable is called features_06 but the file is
# 'features_test.csv' — confirm this is the intended feature set.
features_06 = pd.read_csv('features_test.csv')

In [9]:
# Second feature table, keyed by 'person'; joined onto features_06 below.
features_04 = pd.read_csv('features_04.csv')

In [10]:
# Columns removed from features_04 before it is joined with features_06.
repeated = [
    'busco_productos',
    'clickeo_ad',
    'hizo_checkout',
    'entro_desde_motor',
    'hizo_conversion',
    'filtro_busquedas',
    'busco_marca',
    'visito_sitio',
    'hizo_lead',
    'primer_visita',
    'ultima_visita',
]

In [11]:
# Combine both feature tables on the 'person' index. The columns listed in
# `repeated` are removed from features_04 first; any column name that still
# collides gets the suffix '1' on the features_06 (left) side.
processed = (
    features_06.set_index('person')
    .join(
        features_04.set_index('person').drop(columns=repeated),
        lsuffix='1',
    )
)

In [12]:
# Positional indices of the object-dtype columns of `processed`; these are
# handed to CatBoost as the categorical features.
cat_features = [
    position
    for position, column in enumerate(processed.columns)
    if processed[column].dtype.name == 'object'
]

In [13]:
# Feature rows for the Kaggle test persons, obtained by joining `processed`
# onto the 'person' index of the Kaggle file.
predict_features = kaggle.set_index('person').join(processed)

In [14]:
# Attach the features to every labelled person, then separate the target
# column from the feature matrix.
training_features = labels.set_index('person').join(processed)
training_labels = training_features['label']
training_features = training_features.drop(columns=['label'])

## Catboost normal

In [29]:
# Hold out a validation split (default test fraction). A fixed seed makes the
# split, and therefore the AUC measured on it, reproducible after a kernel
# restart; without it every run scores a different hold-out set.
xtrain,xtest,ytrain,ytest = train_test_split(training_features,training_labels,random_state=42)

In [30]:
# CatBoost classifier used in the "Catboost normal" section. The same
# hyper-parameters are repeated for the other classifiers in this notebook
# (the last one uses iterations=400).
catBoost = cb.CatBoostClassifier(iterations=200,
                                 depth=8,
                                 l2_leaf_reg=50,
                                 random_strength=0.1,
                                 bagging_temperature=0,
                                 border_count=128)

In [31]:
# Fit on the training split only, so the held-out split can be scored next.
# NOTE(review): cat_features holds column positions computed on `processed`;
# this assumes xtrain has the same column order as `processed` — confirm.
catBoost.fit(xtrain,ytrain,verbose=0,cat_features=cat_features)

<catboost.core.CatBoostClassifier at 0x7f7842ef6898>

In [63]:
# Persist the current state of `catBoost` to disk.
# NOTE(review): this cell's execution count (63) is far ahead of its
# neighbours (31, 32), so it was run out of order; which fit was actually
# saved cannot be determined from the notebook — confirm before reusing the file.
catBoost.save_model('86436catboost')

In [32]:
# ROC AUC on the held-out split, using the predicted probability of the
# second class (column 1 of predict_proba).
roc_auc_score(ytest,catBoost.predict_proba(xtest)[:,1])

0.8633825092156457

In [33]:
# Refit the same classifier object on the full labelled set before
# predicting the Kaggle rows.
catBoost.fit(training_features,training_labels,verbose=0,cat_features=cat_features)

<catboost.core.CatBoostClassifier at 0x7f7842ef6898>

In [34]:
# Build the submission frame: one row per Kaggle person, with the predicted
# probability of the second class stored in 'label'.
submit = kaggle.set_index('person')
pred = catBoost.predict_proba(predict_features)[:, 1]
submit['label'] = pred

In [35]:
# Sanity check: preview the first rows of the submission frame.
submit.head()

Unnamed: 0_level_0,label
person,Unnamed: 1_level_1
4886f805,0.006393
0297fc1e,0.059839
2d681dd8,0.01731
cccea85e,0.069601
4c8a8b93,0.025887


In [37]:
# Write the submission file. Later sections write to the same 'submit.csv'
# path, so this file is overwritten when they run.
submit.to_csv('submit.csv')

## Random subsets

In [40]:
# The module path was garbled (a syntax error); the helper lives in sklearn.utils.random.
from sklearn.utils.random import sample_without_replacement

In [41]:
# Number of rows in the current training split.
xtrain.index.size

14560

In [57]:
# Shared CatBoost model; tryRandomSubsets refits this same object on every
# random subset, so after a call it holds the fit of the last subset tried.
cbClassifier = cb.CatBoostClassifier(iterations=200,
                                 depth=8,
                                 l2_leaf_reg=50,
                                 random_strength=0.1,
                                 bagging_temperature=0,
                                 border_count=128)


def tryRandomSubsets(xtrain, xtest, ytrain, ytest, n, n_samples=6000):
    """Fit ``cbClassifier`` on ``n`` random row subsets and score each one.

    For every trial, ``n_samples`` distinct row positions are drawn from
    ``xtrain``, the module-level ``cbClassifier`` is refit on those rows
    (using the module-level ``cat_features``), and the ROC AUC of its
    predictions on ``xtest`` is recorded.

    Parameters
    ----------
    xtrain, ytrain : training features and labels, indexable with ``.iloc``.
    xtest, ytest : held-out features and labels used for scoring.
    n : number of random subsets to try.
    n_samples : rows per subset (default 6000, the previously hard-coded value).

    Returns
    -------
    list of ``[indices, score]`` pairs, one per trial, in trial order.

    Raises
    ------
    ValueError
        Propagated from ``sample_without_replacement`` when ``n_samples``
        exceeds the number of rows in ``xtrain``.

    NOTE(review): the subsets are scored on ``xtest``; picking the best
    subset by this score makes that score an optimistic estimate.
    """
    # Derive the population from the data instead of hard-coding 14560, so the
    # function keeps working when the split size changes.
    n_population = len(xtrain)
    results = []
    for _ in range(n):
        indices = sample_without_replacement(n_population=n_population, n_samples=n_samples)
        cbClassifier.fit(xtrain.iloc[indices], ytrain.iloc[indices], cat_features=cat_features, verbose=0)
        score = roc_auc_score(ytest, cbClassifier.predict_proba(xtest)[:, 1])
        results.append([indices, score])
    return results


In [None]:
# Try 10 random training subsets; each entry of `res` is [indices, score].
res = tryRandomSubsets(xtrain,xtest,ytrain,ytest,10)

In [53]:
# Display the [indices, score] pair recorded for every subset.
res

[[array([ 3986,  8574, 13137, ...,  2946,  9010,  3638]), 0.8547978485515694],
 [array([ 9351,   395, 10720, ...,  3992,  5168,   384]), 0.8526390121411997],
 [array([ 5610,  9658,  8614, ..., 10373,   743,  7907]), 0.8615965080627772],
 [array([ 4469,  5892,  5121, ...,  3750,  5771, 13507]), 0.8551715428764841],
 [array([ 7190, 10659,   158, ...,  7357, 12770,  8114]), 0.8538442837224767],
 [array([ 4920, 11413,   507, ..., 14288,  3340,  6536]), 0.8580519959142754],
 [array([11730,  2677, 14374, ..., 11686,  5457, 14248]), 0.8591601928778156],
 [array([10126,  8814,  4497, ..., 13872, 12251,  5183]), 0.8651401611438656],
 [array([13099, 13968,  8270, ...,  5001,  8069,  9146]), 0.8583483741719664],
 [array([12653,  8849,  1868, ..., 13297,  9502, 10509]), 0.8538451427898901]]

In [55]:
# Show the best-scoring [indices, score] pair. Selecting by score instead of
# a hard-coded position keeps this correct on re-runs, where the random
# subsets (and therefore the winning position) differ.
max(res, key=lambda entry: entry[1])

[array([10126,  8814,  4497, ..., 13872, 12251,  5183]), 0.8651401611438656]

In [60]:
# Refit on the subset with the highest recorded score. The winner is looked
# up by score rather than by a fixed position, which was only valid for one
# particular random run.
best_indices = max(res, key=lambda entry: entry[1])[0]
cbClassifier.fit(xtrain.iloc[best_indices], ytrain.iloc[best_indices], cat_features=cat_features, verbose=0)

<catboost.core.CatBoostClassifier at 0x7f7844f48320>

In [61]:
# Rebuild the submission frame from the Kaggle persons and fill 'label' with
# the second-class probability predicted by the subset-trained classifier.
submit = kaggle.set_index('person')
submit['label'] = cbClassifier.predict_proba(predict_features)[:,1]

In [64]:
# Write the submission; this uses the same 'submit.csv' path as the other
# sections, so any earlier file at that path is replaced.
submit.to_csv('submit.csv')

## Dropping the `Unknown` columns and standardizing the numeric features (score: 0.86514)

In [15]:
from sklearn.preprocessing import StandardScaler

In [18]:
# Remove the two 'Unknown' columns from both the training and the
# prediction feature frames.
cb_features = training_features.drop(columns=['Unknown', 'Unknown.1'])
cb_predict = predict_features.drop(columns=['Unknown', 'Unknown.1'])

In [27]:
# Partition the column names by dtype: object columns are treated as
# non-numeric, everything else as numeric.
not_numeric = [
    name for name in cb_features.columns
    if cb_features[name].dtype.name == 'object'
]
numeric = [
    name for name in cb_features.columns
    if cb_features[name].dtype.name != 'object'
]

In [34]:
# Training frame split into its numeric and non-numeric halves.
numeric_features_t = cb_features.drop(columns=not_numeric)
not_numeric_features_t = cb_features.drop(columns=numeric)

# Same split for the prediction frame, using the column lists derived from
# the training frame.
numeric_features_p = cb_predict.drop(columns=not_numeric)
not_numeric_features_p = cb_predict.drop(columns=numeric)

In [35]:
# Standardize the numeric columns: the scaler is fit on the training rows and
# the same fitted scaler is then applied to the prediction rows.
# NOTE(review): the scaler is fit on the full training matrix before the
# train/test split performed further down, so the held-out rows influence the
# scaling statistics. training_labels is passed as y; presumably it is
# ignored by StandardScaler — confirm.
scaler = StandardScaler()
estandarizadas_t = scaler.fit_transform(numeric_features_t,training_labels)
estandarizadas_p = scaler.transform(numeric_features_p)

In [40]:
# Re-attach the standardized values to the non-numeric columns. The join is
# positional: reset_index() gives the left frame a fresh integer index that
# lines up with the default index of the frame built from the scaled array,
# so it relies on both having the same row order. The scaled columns lose
# their names and are labelled by integer position. 'person' is restored as
# the index afterwards.
for_training = not_numeric_features_t.reset_index().join(pd.DataFrame(estandarizadas_t)).set_index('person')

In [49]:
# Same positional re-assembly for the prediction rows: non-numeric columns
# first, then the standardized values under integer column labels, with
# 'person' restored as the index.
for_predict = not_numeric_features_p.reset_index().join(pd.DataFrame(estandarizadas_p)).set_index('person')

In [43]:
# 60/40 split of the standardized training frame; this rebinds the
# xtrain/xtest/ytrain/ytest names used by the earlier sections. A fixed seed
# makes the split, and the AUC measured on it, reproducible across runs.
xtrain,xtest,ytrain,ytest = train_test_split(for_training,training_labels,train_size=0.6,random_state=42)



In [44]:
# Classifier for the standardized feature set. Same settings as the other
# CatBoost models in this notebook, except that iterations is 400 instead of 200.
cb_classifier = cb.CatBoostClassifier(iterations=400,
                                 depth=8,
                                 l2_leaf_reg=50,
                                 random_strength=0.1,
                                 bagging_temperature=0,
                                 border_count=128)

In [45]:
# Positional indices of the object-dtype columns in the re-assembled frame;
# recomputed because the column layout differs from `processed`.
cat_features2 = [
    position
    for position, column in enumerate(xtrain.columns)
    if xtrain[column].dtype.name == 'object'
]

In [47]:
# Fit on the training split, marking the object-dtype columns as categorical.
cb_classifier.fit(xtrain,ytrain,cat_features=cat_features2,verbose=0)

<catboost.core.CatBoostClassifier at 0x7fbcfd97e518>

In [48]:
# ROC AUC on the held-out split, from the second-class probabilities.
roc_auc_score(ytest,cb_classifier.predict_proba(xtest)[:,1])

0.8623101881526776

In [50]:
# Refit on the full standardized training frame. cat_features2 was computed
# on xtrain, which is a row subset of for_training, so the column positions
# still apply.
cb_classifier.fit(for_training,training_labels,cat_features=cat_features2,verbose=0)

<catboost.core.CatBoostClassifier at 0x7fbcfd97e518>

In [52]:
# Second-class probabilities for the Kaggle rows, from the standardized features.
proba_loco=cb_classifier.predict_proba(for_predict)[:,1]

In [53]:
# Rebuild the submission frame from the Kaggle persons and attach the
# predicted probabilities as 'label'.
submit = kaggle.set_index('person')
submit['label'] = proba_loco

In [54]:
# Write the submission to 'submit.csv', the same path used by the earlier
# sections, replacing any file already there.
submit.to_csv('submit.csv')

In [55]:
# Persist the classifier trained on the standardized features.
# NOTE(review): the target path sits inside a 'preentrenados/' directory,
# which presumably must already exist — confirm.
cb_classifier.save_model('preentrenados/86514catboost(normalizado)')