In [1]:
import pandas as pd
from random import sample

In [2]:
import numpy as np
import category_encoders as ce

In [71]:
from sklearn.model_selection import RandomizedSearchCV

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [4]:
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier

In [5]:
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.neighbors import KNeighborsClassifier

In [6]:
from sklearn.model_selection import KFold

## Train y predict set

In [7]:
# Load the per-person feature table, the training labels and the Kaggle test file.
# Note: the variable is called features_04 but the file read is features_06.csv.
features_04 = (
    pd.read_csv('features_06.csv')
      .drop(['Unknown','Unknown.1'],axis=1)
)
labels = pd.read_csv('labels_training_set.csv')
kaggle = pd.read_csv('trocafone_kaggle_test.csv')

In [8]:
# Left-join the features onto the labelled persons (keyed by 'person') so every
# row of `labels` is kept even when it has no feature row.
training = labels.set_index('person').join(features_04.set_index('person'),how='left')

In [9]:
# Split the joined frame into the feature matrix and the target column.
training_set = training.drop('label',axis=1)
training_labels = training['label']

In [10]:
# Same left join for the Kaggle persons: one feature row per person to predict.
prediction_set = kaggle.set_index('person').join(features_04.set_index('person'),how='left')

In [11]:
# Find the object-dtype columns of the training features:
# `cat` holds their positional indices, `catnames` their names.
object_columns = [
    (position, column)
    for position, column in enumerate(training_set.columns)
    if training_set[column].dtype.name == 'object'
]
cat = [position for position, _ in object_columns]
catnames = [column for _, column in object_columns]

In [12]:
# Cast every object column found above to pandas 'category' dtype in both the
# training and the prediction feature frames.
for frame in (training_set, prediction_set):
    for column in catnames:
        frame[column] = frame[column].astype('category')

## Funciones

In [13]:
def agregarFeatureTrain(dataset, labels, clasif_list, n):
    """Build out-of-fold probability features for the training rows.

    For each classifier, the data is split into ``n`` folds; the classifier is
    fit on the other folds and its positive-class probability
    (``predict_proba(...)[:, 1]``) is stored for the held-out rows.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature matrix (accessed through ``.iloc``, ``.columns`` and ``.index``).
    labels : pandas.Series
        Target values, positionally aligned with ``dataset``.
    clasif_list : list
        Estimators exposing ``fit(X, y)`` and ``predict_proba(X)``.
    n : int
        Number of KFold splits.

    Returns
    -------
    pandas.DataFrame
        One column per classifier, indexed like ``dataset``. Columns are named
        ``len(dataset.columns) + 1``, ``+ 2``, ... so existing callers that
        index the result by that integer keep working.
    """
    kf = KFold(n_splits=n)
    first_name = len(dataset.columns) + 1
    oof_features = {}
    for offset, clasificador in enumerate(clasif_list):
        # A fresh buffer per classifier: the previous implementation shared one
        # list across classifiers, so a second classifier produced a column of
        # the wrong length.
        oof = np.empty(len(dataset), dtype=float)
        for train_idx, holdout_idx in kf.split(dataset):
            clasificador.fit(dataset.iloc[train_idx], labels.iloc[train_idx])
            # Write by position so the result does not depend on fold order.
            oof[holdout_idx] = clasificador.predict_proba(dataset.iloc[holdout_idx])[:, 1]
        oof_features[first_name + offset] = oof
    return pd.DataFrame(oof_features, index=dataset.index)

In [19]:
def agregarFeatureTrainCB(dataset, labels, clasif_list, n):
    """Out-of-fold probability features, CatBoost-style fit call.

    Same contract as ``agregarFeatureTrain`` but every fit is called with
    ``cat_features=cat`` (the notebook-level list of categorical column
    positions) and ``verbose=0``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature matrix.
    labels : pandas.Series
        Target values, positionally aligned with ``dataset``.
    clasif_list : list
        Estimators whose ``fit`` accepts ``cat_features`` and ``verbose``.
    n : int
        Number of KFold splits.

    Returns
    -------
    pandas.DataFrame
        One column per classifier, indexed like ``dataset`` and named
        ``len(dataset.columns) + 1``, ``+ 2``, ...
    """
    kf = KFold(n_splits=n)
    first_name = len(dataset.columns) + 1
    oof_features = {}
    for offset, clasificador in enumerate(clasif_list):
        # Per-classifier buffer; a buffer shared across classifiers would make
        # the second column longer than the dataset.
        oof = np.empty(len(dataset), dtype=float)
        for train_idx, holdout_idx in kf.split(dataset):
            clasificador.fit(dataset.iloc[train_idx], labels.iloc[train_idx],
                             cat_features=cat, verbose=0)
            oof[holdout_idx] = clasificador.predict_proba(dataset.iloc[holdout_idx])[:, 1]
        oof_features[first_name + offset] = oof
    return pd.DataFrame(oof_features, index=dataset.index)

In [20]:
def agregarFeaturePredict(dataset, labels, predict, clasif_list):
    """Build probability features for the prediction rows.

    Each classifier is fit on the full training data and its positive-class
    probability (``predict_proba(...)[:, 1]``) for ``predict`` becomes one
    output column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Training feature matrix.
    labels : pandas.Series
        Training target values.
    predict : pandas.DataFrame
        Rows to score; must have the same features as ``dataset``.
    clasif_list : list
        Estimators exposing ``fit(X, y)`` and ``predict_proba(X)``.

    Returns
    -------
    pandas.DataFrame
        One column per classifier, indexed like ``predict`` and named
        ``len(predict.columns) + 1``, ``+ 2``, ...
    """
    first_name = len(predict.columns) + 1
    predicted = {}
    for offset, classifier in enumerate(clasif_list):
        classifier.fit(dataset, labels)
        # Each classifier gets its own column name; previously every classifier
        # reused the same name, so only the last one's predictions survived.
        predicted[first_name + offset] = classifier.predict_proba(predict)[:, 1]
    return pd.DataFrame(predicted, index=predict.index)

In [21]:
def agregarFeaturePredictCB(dataset, labels, predict, clasif_list):
    """Probability features for the prediction rows, CatBoost-style fit call.

    Same contract as ``agregarFeaturePredict`` but every fit is called with
    ``cat_features=cat`` (the notebook-level list of categorical column
    positions) and ``verbose=0``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Training feature matrix.
    labels : pandas.Series
        Training target values.
    predict : pandas.DataFrame
        Rows to score.
    clasif_list : list
        Estimators whose ``fit`` accepts ``cat_features`` and ``verbose``.

    Returns
    -------
    pandas.DataFrame
        One column per classifier, indexed like ``predict`` and named
        ``len(predict.columns) + 1``, ``+ 2``, ...
    """
    first_name = len(predict.columns) + 1
    predicted = {}
    for offset, classifier in enumerate(clasif_list):
        classifier.fit(dataset, labels, cat_features=cat, verbose=0)
        # Distinct column per classifier; reusing one name would overwrite
        # earlier classifiers' predictions.
        predicted[first_name + offset] = classifier.predict_proba(predict)[:, 1]
    return pd.DataFrame(predicted, index=predict.index)

## Clasificadores

In [17]:

# Base learners of the stack, each with fixed hyper-parameters.
lgbmparams = dict()

cb_classifier = cb.CatBoostClassifier(
    iterations=400,
    depth=10,
    l2_leaf_reg=50,
    random_strength=0.1,
    bagging_temperature=0,
    border_count=128,
)

xgb_classifier = xgb.XGBClassifier(
    colsample_bytree=0.606653667095343,
    gamma=7,
    learning_rate=0.016238752458245277,
    max_depth=12,
    min_child_weight=5,
    n_estimators=346,
    subsample=0.7847190225361189,
)

lgbm_classifier = lgb.LGBMClassifier(
    learning_rate=0.06748371705664348,
    max_bin=205,
    min_data_in_leaf=481,
    n_estimators=187,
    num_leaves=956,
)

## Encoding (para xgboost)

In [18]:
# Target-encode the features for the models that are fed `encoded`/`encoded_p`.
# The encoder is fit on the full training matrix and its labels, then applied
# unchanged to the prediction matrix.
# NOTE(review): because the encoder sees every training label, the K-fold
# out-of-fold features built later from `encoded` are not strictly
# out-of-sample -- confirm whether this leakage is acceptable.
enc = ce.TargetEncoder()
encoded = enc.fit_transform(np.array(training_set),np.array(training_labels))
encoded_p = enc.transform(np.array(prediction_set))

## Obtener features

In [22]:
# XGBoost level-1 feature on the target-encoded data: 2-fold out-of-fold
# probabilities for the training rows, then a refit on all training rows to
# score the prediction rows (the same estimator instance is reused).
train_set1 = agregarFeatureTrain(encoded,training_labels,[xgb_classifier],2)
predict_set1 = agregarFeaturePredict(encoded,training_labels,encoded_p,[xgb_classifier])

In [23]:
# LightGBM level-1 feature on the category-typed (non-encoded) frames:
# 2-fold out-of-fold for training rows, full refit for prediction rows.
train_set2 = agregarFeatureTrain(training_set,training_labels,[lgbm_classifier],2)
predict_set2 = agregarFeaturePredict(training_set,training_labels,prediction_set,[lgbm_classifier])

In [24]:
# CatBoost level-1 feature via the *CB helpers, which pass cat_features=cat:
# 2-fold out-of-fold for training rows, full refit for prediction rows.
train_set3 = agregarFeatureTrainCB(training_set,training_labels,[cb_classifier],2)
predict_set3 = agregarFeaturePredictCB(training_set,training_labels,prediction_set,[cb_classifier])

In [25]:
# Give the single integer-named column of each frame a readable model name.
# train_set1 (xgboost) is not renamed here; it is read by position later.
train_set2.columns = ['lgbm']
train_set3.columns = ['catboost']

In [26]:
# Same renaming for the prediction-side frames so train/predict columns match.
predict_set2.columns = ['lgbm']
predict_set3.columns = ['catboost']

## Armar stack (training)

In [32]:
# Stack the level-1 features for training. lgbm and catboost share the 'person'
# index and are joined on it; the xgboost column comes from `encoded`, which was
# built from a numpy array, so it is attached positionally via list(...).
# NOTE(review): this relies on `encoded` keeping the row order of training_set.
stacked_training = train_set2.join(train_set3)
stacked_training['xgboost'] = list(train_set1[train_set1.columns[0]])

In [33]:
# Preview the first rows of the stacked training features.
stacked_training.head()

Unnamed: 0_level_0,lgbm,catboost,xgboost
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0566e9c1,0.104925,0.035126,0.060293
6ec7ee77,0.039298,0.049223,0.060477
abe7a2fb,0.051244,0.012723,0.030094
34728364,0.113891,0.134094,0.261684
87ed62de,0.00636,0.006881,0.010747


## Armar stack (prediction)

In [34]:
# Stack the level-1 features for the prediction rows, mirroring stacked_training:
# index join for lgbm/catboost, positional attach for the xgboost column.
stacked_predict = predict_set2.join(predict_set3)
stacked_predict['xgboost'] = list(predict_set1[predict_set1.columns[0]])

In [35]:
# Preview the first rows of the stacked prediction features.
stacked_predict.head()

Unnamed: 0_level_0,lgbm,catboost,xgboost
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4886f805,0.000426,0.003111,0.007877
0297fc1e,0.021861,0.083819,0.043512
2d681dd8,0.005023,0.014274,0.015421
cccea85e,0.037773,0.053141,0.102484
4c8a8b93,0.002734,0.014094,0.02163


## Predicción final (Score:)

##### Probabilidades de xgboost, catboost y lightgbm como features, con Logistic Regression como modelo final

In [78]:
# Meta-model of the stack: logistic regression with library defaults.
log = LogisticRegression()

In [37]:
# Persist the stacked training features so the base models need not be refit.
# NOTE(review): presumably the 'preentrenados/' directory must already exist.
stacked_training.to_csv('preentrenados/23_stacked_training.csv')

In [38]:
# Persist the stacked prediction features alongside the training ones.
stacked_predict.to_csv('preentrenados/23_stacked_predict.csv')

In [39]:
# Hold-out split of the stacked features for a local AUC check. No random_state
# is passed, so the split (and the score below) changes between runs.
# labels['label'] is matched to stacked_training by row position, not by index.
xtrain,xtest,ytrain,ytest = train_test_split(stacked_training,labels['label'])

In [40]:
# Fit the meta-model on the training part and report ROC AUC on the hold-out part.
log.fit(xtrain,ytrain)
roc_auc_score(ytest,log.predict_proba(xtest)[:,1])

0.8668939118258074

In [41]:
# Refit the meta-model on every labelled row before scoring the Kaggle set.
log.fit(stacked_training,labels['label'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [42]:
# Positive-class probability for each prediction row.
final_pred = log.predict_proba(stacked_predict)[:,1]

In [43]:
# Build the submission frame from a copy of the Kaggle file; probabilities are
# attached by row position.
# NOTE(review): assumes stacked_predict kept the row order of `kaggle`.
submit = kaggle.copy(deep=True)
submit['label'] = final_pred

In [45]:
# Write the submission with 'person' as the index column.
submit.set_index('person').to_csv('submit.csv')

## Usando algunas features del dataset 

In [48]:
# List the available feature names to pick a subset for the next experiment.
training_set.columns

Index(['viewed product', 'searched products', 'ad campaign hit', 'staticpage',
       'checkout', 'search engine hit', 'conversion', 'generic listing',
       'brand listing', 'visited site', 'lead1', 'Excelente', 'Muito Bom',
       'Bom', 'Bom - Sem Touch ID', 'Novo', '32GB', '64GB', '128GB', '16GB',
       '8GB', '256GB', '4GB', '512MB', 'Conditions', 'AboutUs', 'how-to-sell',
       'trust-trocafone', 'how-to-buy', 'CustomerService', 'club-trocafone',
       'Quiosks', 'FaqEcommerce', 'TermsAndConditionsReturnEcommerce',
       'TermsAndConditionsEcommerce', 'galaxy-s8', 'PrivacyEcommerce',
       'black_friday', 'Google', 'Yahoo', 'Bing', 'Ask', 'Smartphone',
       'Computer', 'Tablet', 'Paid', 'Organic', 'Direct', 'Referral', 'Social',
       'Email', 'mayor_evento', 'prod_visto', 'dispos', 'compras', 'visitas',
       'cant_prod_vistos', 'cant_eventos', 'permanencia', 'hace_cuanto_visito',
       'eventos_por_dia', 'channel', 'visitas_por_dia', 'vio_celulares',
       'region',

In [49]:
# Raw feature columns to add next to the stacked model outputs: the names listed
# by training_set.columns above, from 'viewed product' through 'Email'.
fts = ['viewed product', 'searched products', 'ad campaign hit', 'staticpage',
       'checkout', 'search engine hit', 'conversion', 'generic listing',
       'brand listing', 'visited site', 'lead1', 'Excelente', 'Muito Bom',
       'Bom', 'Bom - Sem Touch ID', 'Novo', '32GB', '64GB', '128GB', '16GB',
       '8GB', '256GB', '4GB', '512MB', 'Conditions', 'AboutUs', 'how-to-sell',
       'trust-trocafone', 'how-to-buy', 'CustomerService', 'club-trocafone',
       'Quiosks', 'FaqEcommerce', 'TermsAndConditionsReturnEcommerce',
       'TermsAndConditionsEcommerce', 'galaxy-s8', 'PrivacyEcommerce',
       'black_friday', 'Google', 'Yahoo', 'Bing', 'Ask', 'Smartphone',
       'Computer', 'Tablet', 'Paid', 'Organic', 'Direct', 'Referral', 'Social',
       'Email']

In [50]:
# Extend the stacked features with the selected raw columns, joined on the
# shared 'person' index, for both the training and the prediction side.
train_f = stacked_training.join(training_set.loc[:,fts])
pred_f = stacked_predict.join(prediction_set.loc[:,fts])

In [51]:
# Hold-out split of the extended feature set (unseeded, so not reproducible).
xtrain,xtest,ytrain,ytest = train_test_split(train_f,labels['label'])

In [52]:
# Local ROC AUC of the meta-model on the extended feature set.
log.fit(xtrain,ytrain)
roc_auc_score(ytest,log.predict_proba(xtest)[:,1])

0.8497084310268147

In [214]:
# Refit on all labelled rows of the extended feature set.
log.fit(train_f,labels['label'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [215]:
# Positive-class probability for the prediction rows using the extended features.
n_prediction = log.predict_proba(pred_f)[:,1]

In [216]:
# Submission frame indexed by 'person' for this experiment; probabilities are
# attached by row position. It is not written to disk in this cell.
submit = kaggle.set_index('person').copy(deep=True)
submit['label'] = n_prediction

## Xgboost con todas las features

In [53]:
# Put the stacked outputs next to all target-encoded features. The index is
# reset first so the join with `encoded` happens on row position, then 'person'
# is restored as the index.
# NOTE(review): assumes `encoded` carries a default 0..n-1 index -- confirm.
forxgboost = stacked_training.reset_index().join(encoded).set_index('person')

In [58]:
# Same construction for the prediction rows, using the encoded prediction matrix.
forxgboost_p = stacked_predict.reset_index().join(encoded_p).set_index('person')

In [54]:
# Hold-out split for the XGBoost-on-everything experiment (unseeded).
xtrain,xtest,ytrain,ytest = train_test_split(forxgboost,labels['label'])

In [55]:
# Reuse the base XGBoost estimator as the final model and report hold-out ROC AUC.
# NOTE(review): its own out-of-fold output is one of the input columns here.
xgb_classifier.fit(xtrain,ytrain)
roc_auc_score(ytest,xgb_classifier.predict_proba(xtest)[:,1])

0.8648361869672012

In [56]:
# Refit XGBoost on every labelled row before scoring the Kaggle set.
xgb_classifier.fit(forxgboost,labels['label'])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.606653667095343, gamma=7,
       learning_rate=0.016238752458245277, max_delta_step=0, max_depth=12,
       min_child_weight=5, missing=None, n_estimators=346, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7847190225361189)

In [60]:
# Positive-class probability for the prediction rows from the refit XGBoost.
pred = xgb_classifier.predict_proba(forxgboost_p)[:,1]

In [61]:
# Build and write the submission for this experiment; this overwrites the
# 'submit.csv' written earlier in the notebook.
submit = kaggle.set_index('person')
submit['label'] = pred
submit.to_csv('submit.csv')

## Agrego KNN y RandomForest

##### KNN

In [222]:
from sklearn.preprocessing import normalize

In [228]:
# Column-wise L2 scaling for the distance-based KNN model.
# The column norms are computed on the training matrix only and reused for the
# prediction matrix. Normalizing each matrix with its own norms (as before)
# scales the two sets differently, because a column's L2 norm grows with the
# number of rows, so KNN would compare train and predict rows on mismatched
# scales. normalize() returns zero norms replaced by 1, so the division is safe.
normalized_values, column_norms = normalize(encoded, axis=0, return_norm=True)
normalized = pd.DataFrame(normalized_values)
normalized_p = pd.DataFrame(np.asarray(encoded_p, dtype=float) / column_norms)

In [229]:
# KNN base learner with leaf_size=15 and p=3; all other settings are library defaults.
knn = KNeighborsClassifier(leaf_size=15,p=3)

##### RandomForest

In [230]:
# KNN level-1 feature on the normalized data: 4-fold out-of-fold probabilities
# for the training rows, full refit for the prediction rows.
train_set_knn = agregarFeatureTrain(normalized,training_labels,[knn],4)
predict_set_knn = agregarFeaturePredict(normalized,training_labels,normalized_p,[knn])

In [233]:
# Random forest base learner with 100 trees; no random_state is set, so its
# predictions vary between runs.
rf = RandomForestClassifier(n_estimators=100)

In [234]:
# Random forest level-1 feature on the target-encoded data: 4-fold out-of-fold
# for training rows, full refit for prediction rows.
train_set_rf = agregarFeatureTrain(encoded,training_labels,[rf],4)
predict_set_rf = agregarFeaturePredict(encoded,training_labels,encoded_p,[rf])

#### Armo set

In [244]:
# Five-model training stack: lgbm and catboost are joined on the 'person' index;
# knn, rf and xgboost come from integer-indexed frames and are attached by row
# position. Each helper result has a single column, so it is read as
# `.columns[0]` instead of a hard-coded label (46), which would break as soon
# as the number of input features changes.
stacked_features = train_set2.join(train_set3,how='left')
stacked_features['knn'] = list(train_set_knn[train_set_knn.columns[0]])
stacked_features['rf'] = list(train_set_rf[train_set_rf.columns[0]])
stacked_features['xgboost'] = list(train_set1[train_set1.columns[0]])

In [245]:
# Five-model prediction stack, mirroring the training stack. The single column
# of each helper result is read as `.columns[0]` rather than the hard-coded
# label 46, so the cell keeps working when the feature count changes.
stacked_features_p = predict_set2.join(predict_set3,how='left')
stacked_features_p['knn'] = list(predict_set_knn[predict_set_knn.columns[0]])
stacked_features_p['rf'] = list(predict_set_rf[predict_set_rf.columns[0]])
stacked_features_p['xgboost'] = list(predict_set1[predict_set1.columns[0]])

## Predicción

In [268]:
# Fresh default logistic regression as meta-model for the five-model stack;
# this rebinds the notebook-level name `log`.
log = LogisticRegression()

In [361]:
# 90/10 shuffled hold-out split of the five-model stack (unseeded).
xtrain,xtest,ytrain,ytest = train_test_split(stacked_features,labels['label'],train_size=0.9,shuffle=True)



In [362]:
# Local ROC AUC of the meta-model on the five-model stack.
log.fit(xtrain,ytrain)
roc_auc_score(ytest,log.predict_proba(xtest)[:,1])

0.860789117435889

In [352]:
# Refit on every labelled row of the five-model stack.
log.fit(stacked_features,labels['label'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [353]:
# Positive-class probability for the prediction rows from the five-model stack.
final_pred = log.predict_proba(stacked_features_p)[:,1]

In [354]:
# Submission frame for the five-model stack; probabilities are attached by row
# position. It is not written to disk in this cell.
submit = kaggle.copy(deep=True)
submit['label'] = final_pred

## Regresión logística con ajuste de hiperparámetros

In [89]:
# Search over the logistic-regression solver with 4-fold cross-validation.
# The search is scored with ROC AUC, the metric used for every evaluation in
# this notebook; without `scoring` the search would rank candidates by
# accuracy instead. With 5 candidates and n_iter=5 every solver is tried.
# Note: this rebinds `log` to the search object.
logreg = LogisticRegression()
params = {
    'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}
log = RandomizedSearchCV(logreg,param_distributions=params,scoring='roc_auc',verbose=0,cv=4,n_iter=5)

In [90]:
# Hold-out split of the three-model stack for the tuned meta-model (unseeded).
xtrain,xtest,ytrain,ytest = train_test_split(stacked_training,labels['label'])

In [91]:
# Run the solver search on the training part and report hold-out ROC AUC of the
# selected model.
log.fit(xtrain,ytrain)
roc_auc_score(ytest,log.predict_proba(xtest)[:,1])

0.8767335911186463

In [92]:
# Refit the selected estimator on every labelled row.
log.best_estimator_.fit(stacked_training,labels['label'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)

In [93]:
# Positive-class probability for the prediction rows from the refit best estimator.
final_pred = log.best_estimator_.predict_proba(stacked_predict)[:,1]

In [94]:
# Submission frame for the tuned meta-model; probabilities are attached by row position.
submit = kaggle.copy(deep=True)
submit['label'] = final_pred

In [95]:
# Write the final submission, overwriting any earlier 'submit.csv'.
submit.set_index('person').to_csv('submit.csv')