In [1]:
from sklearn.metrics import roc_auc_score

In [2]:
import numpy as np
import scipy.stats as stats

In [3]:
import xgboost as xgb
import catboost as cb

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import category_encoders

In [5]:
from sklearn.model_selection import RandomizedSearchCV

In [6]:
# Load the Kaggle test rows; their 'person' column is used below as the join key
# for the prediction features and as the index of the submission file.
kaggle = pd.read_csv('trocafone_kaggle_test.csv')

In [7]:
# Load the training labels; the 'person' and 'label' columns are both used below.
labels = pd.read_csv('labels_training_set.csv')

In [8]:
# Load the precomputed feature table (it must contain a 'person' column, which becomes the index below).
features_1 = pd.read_csv('features_09.csv')

In [9]:
# Index the features by 'person' so they can be joined by index to the labels and to the Kaggle rows.
processed = features_1.set_index('person')

In [10]:
# Positional indices of the object-dtype columns of `processed`; this list is
# later passed to CatBoost as `cat_features`.
cat_features = [
    position
    for position, name in enumerate(processed.columns)
    if processed[name].dtype.name == 'object'
]

In [11]:
# Attach the features to the Kaggle persons by index. DataFrame.join defaults to a
# left join, so every Kaggle row is kept, in its original order.
predict_features = kaggle.set_index('person').join(processed)

In [12]:
# Join the features onto the labelled persons, then split the target off:
# `pop` removes the 'label' column from the frame and returns it as a Series.
training_features = labels.set_index('person').join(processed)
training_labels = training_features.pop('label')

In [13]:
def duplicate_columns(frame):
    """Return the names of columns whose values are repeated by a later column.

    Columns are only compared with other columns of the same dtype. Within a
    set of identical columns, every member except the last one (in column
    order) is reported, so dropping the returned names keeps exactly one copy.

    Two columns count as identical when they are equal element by element,
    with missing values in the same positions treated as equal.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to scan. It is not modified.

    Returns
    -------
    list
        Labels of the columns that have an identical same-dtype column to
        their right.
    """
    groups = frame.columns.to_series().groupby(frame.dtypes).groups
    dups = []

    for _, names in groups.items():
        same_dtype = frame[names]
        labels = same_dtype.columns
        # Extract every column once instead of re-extracting it inside the inner loop.
        columns = [same_dtype.iloc[:, k] for k in range(len(labels))]

        for i, left in enumerate(columns):
            # Series.equals treats NaNs in matching positions as equal. A plain
            # list comparison does not (NaN != NaN), so it silently missed
            # duplicated columns that contain missing values.
            if any(left.equals(right) for right in columns[i + 1:]):
                dups.append(labels[i])

    return dups

In [14]:
# Show which columns of `processed` have an identical same-dtype column further to the right.
duplicate_columns(processed)

['is_new',
 'compras',
 'visitas',
 'cant_prod_vistos',
 'lead_1',
 'busquedas',
 'clicks_en_ads',
 'checkouts',
 'entradas_desde_motor',
 'listar_productos',
 'listar_marca']

## Predicción (Score: 0.86602)

In [15]:
# Hold out part of the labelled data for validation (train_test_split's default
# is a shuffled 75% / 25% split). random_state pins the shuffle so that the split,
# and therefore the validation AUC, is the same on every Restart & Run All.
# The AUC recorded in this notebook came from an unseeded split and may differ.
xtrain,xtest,ytrain,ytest = train_test_split(training_features,training_labels,random_state=42)

In [16]:
# CatBoost classifier configuration used for both the validation fit and the
# final fit on all labelled data below.
cb_classifier = cb.CatBoostClassifier(
    iterations=400,
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=100,
    border_count=254,
    random_strength=0.1,
    bagging_temperature=0,
)

In [17]:
# Fit on the training split only, passing the categorical column indices computed above.
# verbose=0 is presumably there to silence the per-iteration training log.
cb_classifier.fit(xtrain,ytrain,cat_features=cat_features,verbose=0)

<catboost.core.CatBoostClassifier at 0x2710e202358>

In [None]:
# Evaluate AUC on the held-out split, wrapped in a CatBoost Pool that carries the
# same categorical column indices; plot=True asks CatBoost to draw the metric.
cb_classifier.eval_metrics(cb.Pool(xtest,ytest,cat_features=cat_features),'AUC',plot=True)

In [19]:
# Validation ROC AUC. Column 1 of predict_proba is taken as the positive-class
# probability (assumes binary 0/1 labels — TODO confirm).
roc_auc_score(ytest,cb_classifier.predict_proba(xtest)[:,1])

0.8580804490636336

In [20]:
# Fit the same classifier object again, this time on the full labelled set,
# before scoring the Kaggle rows.
cb_classifier.fit(training_features,training_labels,cat_features=cat_features,verbose=0)

<catboost.core.CatBoostClassifier at 0x2710e202358>

In [21]:
# Predicted probabilities for the Kaggle rows; column 1 is kept (taken to be the
# positive class — assumes binary 0/1 labels).
proba_loco=cb_classifier.predict_proba(predict_features)[:,1]

In [22]:
# Submission frame: Kaggle persons as the index, predicted probability in 'label'.
submit = kaggle.set_index('person').assign(label=proba_loco)

In [23]:
# Write the submission; the 'person' index is written as the first column.
submit.to_csv('submit.csv')

In [25]:
# Persist the classifier fitted on the full labelled set.
# NOTE(review): the 'preentrenados' directory presumably has to exist beforehand — confirm.
cb_classifier.save_model('preentrenados/86602catboost')

## Eliminando columnas duplicadas (tienen los mismos valores) (dio un score menor en Kaggle)

In [19]:
# Remove from the Kaggle prediction frame the columns flagged as duplicates in the training features.
new_predict = predict_features.drop(columns=duplicate_columns(training_features))

In [18]:
# Training features without the columns that duplicate_columns flags as repeated.
new_features = training_features.drop(columns=duplicate_columns(training_features))

In [23]:
# Recompute the categorical column positions: dropping columns shifts the
# positional indices, so they are taken from `new_features` this time.
cat_features = [
    position
    for position, name in enumerate(new_features.columns)
    if new_features[name].dtype.name == 'object'
]

In [14]:
# Hold out part of the labelled data for validation (train_test_split's default
# is a shuffled 75% / 25% split). random_state pins the shuffle so that the split,
# and therefore the validation AUC, is the same on every Restart & Run All.
# The AUC recorded in this notebook came from an unseeded split and may differ.
xtrain,xtest,ytrain,ytest = train_test_split(new_features,training_labels,random_state=42)

In [20]:
# Fresh CatBoost classifier for the reduced feature set; it rebinds `cb_classifier`.
cb_classifier = cb.CatBoostClassifier(
    iterations=400,
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=100,
    border_count=254,
    random_strength=0.1,
    bagging_temperature=0,
)

In [55]:
# Fit on the training split of the reduced feature set, using the recomputed
# categorical column indices.
cb_classifier.fit(xtrain,ytrain,cat_features=cat_features,verbose=0)

<catboost.core.CatBoostClassifier at 0x1c8ef8879b0>

In [None]:
# Evaluate AUC on the held-out split of the reduced feature set; plot=True asks
# CatBoost to draw the metric.
cb_classifier.eval_metrics(cb.Pool(xtest,ytest,cat_features=cat_features),'AUC',plot=True)

In [57]:
# Validation ROC AUC for the reduced feature set. Column 1 of predict_proba is
# taken as the positive-class probability (assumes binary 0/1 labels — TODO confirm).
roc_auc_score(ytest,cb_classifier.predict_proba(xtest)[:,1])

0.8532810939741595

In [24]:
# Fit the classifier again on the full labelled set (reduced features) before
# scoring the Kaggle rows.
cb_classifier.fit(new_features,training_labels,cat_features=cat_features,verbose=0)

<catboost.core.CatBoostClassifier at 0x1ad69d3acf8>

In [25]:
# Predicted probabilities for the Kaggle rows using the reduced feature set;
# column 1 is kept (taken to be the positive class — assumes binary 0/1 labels).
proba_loco=cb_classifier.predict_proba(new_predict)[:,1]

In [26]:
# Submission frame: Kaggle persons as the index, predicted probability in 'label'.
submit = kaggle.set_index('person').assign(label=proba_loco)

In [27]:
# NOTE: this writes to the same 'submit.csv' path as the baseline section above,
# so it overwrites that earlier submission file.
submit.to_csv('submit.csv')