In [2]:
from sklearn.metrics import roc_auc_score

In [3]:
import numpy as np
import scipy.stats as stats

In [4]:
import xgboost as xgb
import catboost as cb

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
import category_encoders

In [6]:
from sklearn.model_selection import RandomizedSearchCV

In [7]:
kaggle = pd.read_csv('trocafone_kaggle_test.csv')

In [8]:
labels = pd.read_csv('labels_training_set.csv')

In [11]:
features_1 = pd.read_csv('features_11.csv')

In [12]:
processed = features_1.set_index('person')

In [13]:
cat_features = []
for i,col in enumerate(processed.columns):
    if processed[col].dtype.name=='object':
        cat_features.append(i)

In [14]:
predict_features = kaggle.set_index('person').join(processed)

In [15]:
training_features = labels.set_index('person').join(processed)
training_labels = training_features['label']
training_features = training_features.drop(['label'],axis=1)

In [16]:
def duplicate_columns(frame):
    groups = frame.columns.to_series().groupby(frame.dtypes).groups
    dups = []

    for t, v in groups.items():

        cs = frame[v].columns
        vs = frame[v]
        lcs = len(cs)

        for i in range(lcs):
            iv = vs.iloc[:,i].tolist()
            for j in range(i+1, lcs):
                jv = vs.iloc[:,j].tolist()
                if iv == jv:
                    dups.append(cs[i])
                    break

    return dups

In [17]:
duplicate_columns(processed)

['is_new',
 'compras',
 'visitas',
 'cant_prod_vistos',
 'lead_1',
 'busquedas',
 'clicks_en_ads',
 'checkouts',
 'entradas_desde_motor',
 'listar_productos',
 'listar_marca']

## Prediccion (Score: 0.86602)

In [18]:
xtrain,xtest,ytrain,ytest = train_test_split(training_features,training_labels)

In [19]:
cb_classifier = cb.CatBoostClassifier(iterations=400,
                                     depth=8,
                                     l2_leaf_reg=100,
                                     random_strength=0.1,
                                     bagging_temperature=0,
                                     border_count=254,
                                     learning_rate=0.03)

In [20]:
cb_classifier.fit(xtrain,ytrain,cat_features=cat_features,verbose=0)

<catboost.core.CatBoostClassifier at 0x1f246ee2d68>

In [21]:
cb_classifier.eval_metrics(cb.Pool(xtest,ytest,cat_features=cat_features),'AUC',plot=True)

{'AUC': [0.7008968533824554,
  0.703185172690462,
  0.7110238265446824,
  0.7172207630170473,
  0.7209116150054486,
  0.7242539534642959,
  0.7284669063868563,
  0.751694150595329,
  0.7547044864088648,
  0.7659501835385591,
  0.76584888703253,
  0.7958737069077078,
  0.8012950780607833,
  0.8080980086088643,
  0.8079596741733179,
  0.80826400993152,
  0.8188336532874956,
  0.8191540536898256,
  0.821775268123373,
  0.8225133492730302,
  0.8223767997979425,
  0.8221728680655402,
  0.8258793384579548,
  0.8249788259065591,
  0.8268753463938889,
  0.8269284489675343,
  0.82667364586206,
  0.8272403708076856,
  0.8277017830862501,
  0.8283367827694197,
  0.8319441878563785,
  0.8355069689318708,
  0.8356868036980811,
  0.8358635147834887,
  0.8363079699376958,
  0.8360812799594457,
  0.8361830227055895,
  0.8378086754433172,
  0.8375846629057551,
  0.8384650946519907,
  0.8396503083965432,
  0.8417668252604034,
  0.841433483894748,
  0.8411170996534499,
  0.840899334477493,
  0.8413259400

In [22]:
roc_auc_score(ytest,cb_classifier.predict_proba(xtest)[:,1])

0.8610720651010778

In [23]:
cb_classifier.fit(training_features,training_labels,cat_features=cat_features,verbose=0)

<catboost.core.CatBoostClassifier at 0x1f246ee2d68>

In [27]:
proba_loco=cb_classifier.predict_proba(predict_features)[:,1]

In [28]:
submit = kaggle.set_index('person')
submit['label'] = proba_loco

In [29]:
submit.to_csv('submit.csv')

In [30]:
cb_classifier.save_model('preentrenados/86602catboost')

## Dropeando columnas duplicadas (tienen los mismos valores) (dio menos en kaggle)

In [19]:
new_predict = predict_features.drop(duplicate_columns(training_features),axis=1)

In [18]:
new_features = training_features.drop(duplicate_columns(training_features),axis=1)

In [23]:
cat_features = []
for i,col in enumerate(new_features.columns):
    if new_features[col].dtype.name=='object':
        cat_features.append(i)

In [14]:
xtrain,xtest,ytrain,ytest = train_test_split(new_features,training_labels)

In [20]:
cb_classifier = cb.CatBoostClassifier(iterations=400,
                                     depth=8,
                                     l2_leaf_reg=100,
                                     random_strength=0.1,
                                     bagging_temperature=0,
                                     border_count=254,
                                     learning_rate=0.03)

In [55]:
cb_classifier.fit(xtrain,ytrain,cat_features=cat_features,verbose=0)

<catboost.core.CatBoostClassifier at 0x1c8ef8879b0>

In [None]:
cb_classifier.eval_metrics(cb.Pool(xtest,ytest,cat_features=cat_features),'AUC',plot=True)

In [57]:
roc_auc_score(ytest,cb_classifier.predict_proba(xtest)[:,1])

0.8532810939741595

In [24]:
cb_classifier.fit(new_features,training_labels,cat_features=cat_features,verbose=0)

<catboost.core.CatBoostClassifier at 0x1ad69d3acf8>

In [25]:
proba_loco=cb_classifier.predict_proba(new_predict)[:,1]

In [31]:
submit = kaggle.set_index('person')
submit['label'] = proba_loco

In [32]:
submit.to_csv('submit.csv')