# Introduction

**Merchants sometimes run big promotions (e.g., discounts or cash coupons) on particular dates (e.g., Boxing Day sales, "Black Friday" or "Double 11" (Nov 11th)) in order to attract a large number of new buyers. Unfortunately, many of the attracted buyers are one-time deal hunters, and these promotions may have little long-lasting impact on sales. To alleviate this problem, it is important for merchants to identify who can be converted into repeat buyers. By targeting these potential loyal customers, merchants can greatly reduce the promotion cost and enhance the return on investment (ROI). It is well known that in the field of online advertising, customer targeting is extremely challenging, especially for fresh buyers. However, with the long-term user behavior log accumulated by Tmall.com, we may be able to solve this problem. In this challenge, we provide a set of merchants and their corresponding new buyers acquired during the promotion on the "Double 11" day. Your task is to predict which new buyers for given merchants will become loyal customers in the future. In other words, you need to predict the probability that these new buyers will purchase items from the same merchants again within 6 months. A data set containing around 200k users is given for training, while another of similar size is given for testing. Similar to other competitions, you may extract any features, then perform training with additional tools. You only need to submit the prediction results for evaluation.**

In [None]:
import pandas as pd
import numpy as np
import gc
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
# from mlxtend.classifier import StackingClassifier
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold 
from sklearn.linear_model import LogisticRegression
import catboost as cat

# Part1: Read data

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype covering their range.

    Columns whose dtype is int16/32/64 or float16/32/64 are inspected; every
    other column is left untouched. Integer columns become the smallest of
    int8/int16/int32/int64 that contains ``[min, max]`` (limits inclusive).
    Float columns become float16 or float32 when the range fits, otherwise
    float64 (this is also what happens when min/max are NaN or infinite).

    NOTE: the float downcast is lossy. float16 in particular cannot represent
    every integer above 2048 exactly, so identifier-like float columns may be
    altered by this function.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        verbose: when true, print the resulting size and the reduction.

    Returns:
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the limit still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range too wide, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # A frame that occupies zero bytes has nothing to reduce.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Load the four raw tables and shrink each one's dtypes right after reading.
user_info = reduce_mem_usage(pd.read_csv('taobao/data_format1/user_info_format1.csv'))

# The behaviour log calls the merchant column 'seller_id'; align it with the
# 'merchant_id' name used by the train/test tables.
user_log = reduce_mem_usage(pd.read_csv('taobao/data_format1/user_log_format1.csv'))
user_log = user_log.rename(columns={'seller_id': 'merchant_id'})

train = reduce_mem_usage(pd.read_csv('taobao/data_format1/train_format1.csv'))
test = reduce_mem_usage(pd.read_csv('taobao/data_format1/test_format1.csv'))

In [None]:
#fill na
# Fill missing brand_id values with the most frequent brand_id of the same merchant.
# Index labels of the log rows whose brand_id is null.
missingIndex = user_log[user_log.brand_id.isnull()].index
# Per-merchant mode of brand_id; after reset_index() the value column is named 0.
# NOTE(review): mode()[0] would fail for a merchant whose brand_id values are all
# null (empty mode) — confirm this cannot happen in the data.
sellerMode = user_log.groupby(['merchant_id']).apply(lambda x: x.brand_id.mode()[0]).reset_index()
pickUP = user_log.loc[missingIndex]
# Look up each affected row's merchant mode and keep only that column (named 0).
pickUP = pd.merge(pickUP, sellerMode, how='left', on=['merchant_id'])[0].astype('float32')
# merge() returns a fresh 0..n-1 index; restore the original labels so the
# assignment below aligns row by row.
pickUP.index = missingIndex
user_log.loc[missingIndex, 'brand_id'] = pickUP
# Release the temporaries before the next memory-heavy step.
del pickUP, sellerMode, missingIndex
gc.collect()

In [None]:
# Impute missing demographics: the median for age_range, the most frequent value
# for gender. Assign the result back to the column instead of calling
# fillna(inplace=True) on the attribute-accessed Series: that form is chained
# assignment and leaves the frame unchanged under pandas copy-on-write.
user_info['age_range'] = user_info['age_range'].fillna(user_info['age_range'].median())
user_info['gender'] = user_info['gender'].fillna(user_info['gender'].mode()[0])

# Part2: Get Features

In [None]:
import datetime
import gc
import time
from collections import Counter

import numpy as np
import pandas as pd
from scipy import sparse

class dataFeature(object):
    """Builds aggregate features from a behaviour log and joins them onto train/test.

    Attributes are plain references to the frames passed in:
        u_log: the log that every aggregation is computed from.
        train / test: the tables that receive the features; each call to
            ``summary`` replaces them with the left-joined result.
    """

    def __init__(self,u_log,train,test):
        self.u_log = u_log
        self.train = train
        self.test = test

    @staticmethod
    def _weighted_dummies(df, column, prefix, weight, downcast=True):
        """One-hot encode ``column`` and multiply each indicator by ``df[weight]``.

        The indicator columns are identified as the columns that get_dummies
        added, not by matching ``prefix`` as a substring: a substring match
        also hits ``weight`` itself when its name contains ``prefix`` and
        then scales the indicators by the squared weight.
        """
        existing = set(df.columns)
        df = pd.get_dummies(df, columns=[column], prefix=prefix)
        dummy_cols = [c for c in df.columns if c not in existing]
        if downcast:
            df = df.apply(pd.to_numeric, downcast='unsigned')
        for col in dummy_cols:
            df[col] = df[col] * df[weight]
        return df

    def summary(self,key,gbname,pname,prefix,operator,dummy,ifratio):
        """Compute one feature (or feature family) and left-join it onto train and test.

        The combination of ``operator``, ``dummy``, ``prefix`` and ``ifratio``
        selects the aggregation:

        * operator None, dummy false, prefix None: number of log rows per
          ``key`` -> column ``pname``.
        * operator None, dummy false, prefix set: number of distinct
          ``gbname`` values per ``key`` -> column ``pname``.
        * operator None, dummy true: row counts per (``key``, ``gbname``),
          spread into one ``<prefix>_<value>`` column per ``gbname`` value.
        * operator set, dummy false, prefix set: row counts per
          (``key``, ``gbname``) aggregated per ``key`` with ``operator``
          (four functions expected); columns are named ``prefix`` +
          'count' / 'mean' / 'max' / 'min'.
        * operator set, dummy false, prefix None: ``gbname`` aggregated per
          ``key`` with ``operator`` -> column ``pname``.
        * operator set, dummy true, ifratio false: ``key`` holds two columns;
          ``key[1]`` is aggregated per (``key[0]``, ``gbname``) and spread into
          ``<prefix>_<value>`` columns, joined on ``key[0]``.
        * operator set, dummy true, ifratio true: per ``key`` the ratio
          ``operator(gbname) / (distinct gbname + 10)`` is spread into
          ``<prefix>_<key[1] value>`` columns, joined on ``key[0]``.

        Args:
            key: list of column names to group by.
            gbname: log column that is counted / aggregated / one-hot encoded.
            pname: name of the produced (or intermediate) value column.
            prefix: column-name prefix; also acts as a mode switch (see above).
            operator: aggregation function(s) handed to ``agg``, or None.
            dummy: truthy to spread the result into one column per category.
            ifratio: truthy to select the ratio variant of the dummy mode.

        Returns:
            None. ``self.train`` and ``self.test`` are replaced by the merged frames.
        """
        user_log = self.u_log
        merge_on = key

        if operator is None:
            if not dummy:
                if prefix is None:  # count
                    df = user_log.groupby(key).size().reset_index().rename(columns={0: pname})
                else:  # unique
                    df = (user_log.groupby(key).agg({gbname: lambda x: len(set(x))})
                          .reset_index().rename(columns={gbname: pname}))
            else:
                df = user_log.groupby(key + [gbname]).size().reset_index().rename(columns={0: pname})
                df = self._weighted_dummies(df, gbname, prefix, pname)
                df = df.groupby(key).sum().reset_index().drop([pname], axis=1)
        elif not dummy:
            if prefix is not None:  # mean max min
                df = user_log.groupby(key + [gbname]).size().reset_index().rename(columns={0: pname})
                df = df.groupby(key).agg({pname: operator}).reset_index()
                df.columns = key + [prefix + 'count', prefix + 'mean', prefix + 'max', prefix + 'min']
            else:
                df = user_log.groupby(key).agg({gbname: operator}).reset_index()
                df.rename(columns={gbname: pname}, inplace=True)
        elif not ifratio:  # get_dummies
            df = (user_log.groupby([key[0]] + [gbname]).agg({key[1]: operator})
                  .reset_index().rename(columns={key[1]: pname}))
            df = self._weighted_dummies(df, gbname, prefix, pname)
            df = df.groupby(key[0]).sum().reset_index().drop([pname], axis=1)
            merge_on = key[0]
        else:  # ratio
            df = (user_log.groupby(key).agg({gbname: lambda x: len(set(x))})
                  .reset_index().rename(columns={gbname: 'all_cnt'}))
            df = df.merge(user_log.groupby(key).agg({gbname: operator}).reset_index(), on=key, how='left')
            # +10 smooths the ratio for groups with very few distinct values.
            df[pname] = (df[gbname] / (df['all_cnt'] + 10)).astype('float32')
            df = self._weighted_dummies(df, key[1], prefix, pname, downcast=False)
            df = df.groupby(key[0]).sum().reset_index().drop(['all_cnt', gbname, pname], axis=1)
            # key[1] was one-hot encoded away, so only key[0] is left to join on.
            merge_on = key[0]

        self.train = self.train.merge(df, on=merge_on, how='left')
        self.test = self.test.merge(df, on=merge_on, how='left')
        

if __name__ == '__main__':
    # Feature-engineering driver. Every feature.summary(...) call aggregates the
    # behaviour log and left-joins the result onto feature.train / feature.test.
    # NOTE(review): several calls group by 'month', 'gender' and 'age_range';
    # nothing visible here adds those columns to user_log / train / test, so they
    # are presumably prepared elsewhere — confirm before running.

    feature = dataFeature(user_log,train,test)
    #active on everyn day
    #feature.summary('user_id','time_stamp','cnt','user_every_day_cnt',None,1,0)

    # the sum of action from every user
    feature.summary(['user_id'],None,'user_id_cnt',None,None,0,0)
    print('over0')

    #action in every month
    feature.summary(['user_id'],'month','user_month_cnt','um',None,1,0)
    feature.summary(['merchant_id'], 'month', 'merchant_month_cnt', 'mm',None, 1, 0)
    print('over1')

    # the sum of action got by every merchant
    feature.summary(['merchant_id'],None,'merchant_id_cnt',None,None,0,0)
    print('over2')

    # every user active on every merchant
    feature.summary(['user_id','merchant_id'],None,'user_id_merchant_id_cnt',None,None,0,0)
    print('over3')

    # the sum of actions got by every merchant in different gender or age
    feature.summary(['merchant_id','gender'], None, 'merchant_gender_cnt', None, None, 0,0)
    feature.summary(['merchant_id', 'age_range'], None, 'merchant_age_cnt', None, None, 0, 0)
    print('over4')

    # the unique of actions from every user on different item/cat/brand/merchant
    feature.summary(['user_id'],'item_id','user_query_item_id_cnt',1,None,0,0)
    feature.summary(['user_id'], 'cat_id', 'user_query_cat_id_cnt', 1, None, 0,0)
    feature.summary(['user_id'], 'merchant_id', 'user_query_merchant_id_cnt', 1, None, 0,0)
    feature.summary(['user_id'], 'brand_id', 'user_query_brand_id_cnt', 1, None, 0,0)
    print('over5')

    # consider of time
    def timediff(t):
        """Return the number of days between the earliest and latest stamp in ``t``.

        Stamps are parsed with '%m%d'. They are zero-padded to four digits
        first because an unpadded value is ambiguous for that format
        ('111' would parse as Nov 1 instead of Jan 11).
        """
        latest = datetime.datetime.strptime('{:04d}'.format(int(max(t))), '%m%d')
        earliest = datetime.datetime.strptime('{:04d}'.format(int(min(t))), '%m%d')
        return (latest - earliest).days

    feature.summary(['user_id'],'time_stamp','user_time_diff',None,timediff,0,0)
    feature.summary(['merchant_id'], 'time_stamp', 'merchant_time_diff', None, timediff, 0, 0)
    print('over6')

    feature.summary(['user_id', 'merchant_id'], 'time_stamp', 'user_merchant_time_diff', None, timediff, 0,0)
    print('over7')
    # how many months the user active on tne merchant
    feature.summary(['user_id', 'merchant_id'],'month','user_merchant_month_cnt',1,None,0,0)
    print('over8')
    # how many days the user active on tne merchant
    feature.summary(['user_id', 'merchant_id'],'time_stamp','user_merchant_day_cnt',1,None,0,0)

    # every month the user active on the merchant
    feature.summary(['user_id', 'merchant_id'],'month','cnt','month_act',None,1,0)
    print('over9')
    # the sum of active got by every merchant in different gender and  age
    feature.summary(['merchant_id', 'gender', 'age_range'],None,'merchant_gender_age_cnt',None,None,0,0)
    print('over10')
    # the item/cat/brand/user of the merchant got max min mean
    feature.summary(['merchant_id'], 'item_id', 'cnt', 'merchant_item_', [np.size, np.mean, np.max, np.min], 0,0)
    feature.summary(['merchant_id'], 'cat_id', 'cnt', 'merchant_cat_', [np.size, np.mean, np.max, np.min], 0,0)
    feature.summary(['merchant_id'], 'brand_id', 'cnt', 'merchant_brand_', [np.size, np.mean, np.max, np.min], 0,0)
    feature.summary(['merchant_id'], 'user_id', 'cnt', 'merchant_user_', [np.size, np.mean, np.max, np.min], 0,0)
    print('over11')
    # every user active on every merchant's item
    feature.summary(['user_id', 'merchant_id'],'item_id','cnt','user_merchant_item_',
                    [np.size, np.mean, np.max, np.min],0,0)
    # every user active on every merchant's cat
    feature.summary(['user_id', 'merchant_id'], 'cat_id', 'cnt', 'user_merchant_cat_',
                    [np.size, np.mean, np.max, np.min], 0,0)
    # every user active on every merchant's brand
    feature.summary(['user_id', 'merchant_id'], 'brand_id', 'cnt', 'user_merchant_brand_',
                    [np.size, np.mean, np.max, np.min], 0,0)
    # different actions of  every user active on every merchant
    feature.summary(['user_id', 'merchant_id'],'action_type','cnt','um_action',None,1,0)

    # the ratio of action: share of each action type among a (user, merchant)
    # pair's actions, smoothed by +10 in the denominator
    train = feature.train
    test = feature.test
    for frame in (train, test):
        # Computed before any ratio column is added, so every ratio of a frame
        # shares the same denominator.
        action_total = (frame['um_action_0'] + frame['um_action_1'] +
                        frame['um_action_2'] + frame['um_action_3'] + 10)
        for action in range(4):
            frame['um_action_{}_ratio'.format(action)] = frame['um_action_{}'.format(action)] / action_total

    feature.summary(['user_id'], 'item_id', 'cnt', 'user_item_', [np.size, np.mean, np.max, np.min], 0,0)
    feature.summary(['user_id'], 'cat_id', 'cnt', 'user_cat_', [np.size, np.mean, np.max, np.min], 0,0)
    feature.summary(['user_id'], 'brand_id', 'cnt', 'user_brand_',[np.size, np.mean, np.max, np.min], 0,0)
    feature.summary(['user_id'], 'merchant_id', 'cnt', 'user_merchant_', [np.size, np.mean, np.max, np.min], 0,0)
    print('over16')
    # the ratio of every user's action on different merchant
    train = feature.train
    test = feature.test
    for frame in (train, test):
        frame['um_u_ratio'] = frame['user_id_merchant_id_cnt'] / frame['user_id_cnt']
        frame['um_m_ratio'] = frame['user_id_merchant_id_cnt'] / frame['merchant_id_cnt']
    print('over17')

    feature.summary(['merchant_id'],'action_type','cnt','merchant_action_cnt',None,1,0)
    feature.summary(['user_id'], 'action_type', 'cnt', 'user_action_cnt', None, 1, 0)

    print('over18')
    # consder merchant_id, gender, age_range
    feature.summary(['merchant_id', 'gender', 'age_range'],'item_id','cnt','merchant_gender_age_item_id_',
                    [np.size, np.mean, np.max, np.min], 0,0)
    feature.summary(['merchant_id', 'gender', 'age_range'], 'brand_id', 'cnt', 'merchant_gender_age_brand_id_',
                    [np.size, np.mean, np.max, np.min], 0,0)
    feature.summary(['merchant_id', 'gender', 'age_range'], 'cat_id', 'cnt', 'merchant_gender_age_cat_id_',
                    [np.size, np.mean, np.max, np.min], 0,0)
    print('over19')

    # every month how many unique user active on merchant
    def count(x):
        """Return the number of distinct values in ``x``."""
        return len(set(x))

    feature.summary(['merchant_id','user_id'],'month','cnt','merchant_month_user_cnt',count,1,0)
    train = feature.train
    test = feature.test

    # just consider of buy: restart from the current train/test with a log
    # restricted to action_type == 2
    feature = dataFeature(user_log[user_log['action_type']==2],train,test)

    feature.summary(['merchant_id'], 'time_stamp', 'merchant_buy_time_diff', None, timediff, 0, 0)

    feature.summary(['merchant_id'], 'month', 'merchant_month_cnt', 'merchant_month', None, 1, 0)
    print('over20')

    # rebuy
    def rebuy(x):
        """Return how many distinct values occur more than once in ``x``."""
        return sum(1 for occurrences in Counter(x).values() if occurrences > 1)

    feature.summary(['merchant_id'],'user_id','merchant_allbuy_user_cnt',1,None,0,0)
    feature.summary(['merchant_id'],'user_id','merchant_repeat_buy_user_cnt',None,rebuy,0,0)
    # feature.summary(['merchant_id', 'cat_id'], 'user_id', 'merchant_cat_repeat_ratio', 'repeat_ratio',rebuy, 1,1)
    print('over21')

    #how many people rebuy
    feature.summary(['user_id'],'merchant_id','user_repeat_buy_cnt',None,rebuy,0,0)
    train = feature.train
    test = feature.test
    for frame in (train, test):
        frame['user_repeat_buy_ratio'] = frame['user_repeat_buy_cnt'] / frame['user_action_cnt_2']

    # if somebody rebuy , get the times
    def rebuyuser(x):
        """Return ``{value: occurrences}`` for the values occurring more than once in ``x``."""
        return {value: occurrences for value, occurrences in Counter(x).items() if occurrences > 1}

    feature.summary(['merchant_id'],'user_id','repeat_user_list',None,rebuyuser,0,0)
    print('over22')

    def extra_user_repeat_cnt(x):
        """Look up the row's user_id in the row's repeat_user_list; 0 when absent.

        KeyError covers a user missing from the mapping; TypeError covers a
        row whose repeat_user_list is not a mapping (e.g. NaN left by the
        left join).
        """
        try:
            return x['repeat_user_list'][x['user_id']]
        except (KeyError, TypeError):
            return 0

    train = feature.train
    test = feature.test
    train['usr_repeat_cnt'] = train[['user_id', 'repeat_user_list']].apply(extra_user_repeat_cnt, axis=1)
    test['usr_repeat_cnt'] = test[['user_id', 'repeat_user_list']].apply(extra_user_repeat_cnt, axis=1)
    # NOTE(review): 'repeat_user_list' (a dict per row) is written to the CSV as
    # well — confirm it is meant to stay as a column.
    train.to_csv('taobao/train.csv',index=False)
    test.to_csv('taobao/test.csv',index=False)

**Feature generation takes about 2 hours, so it is a good idea to save the results.**

# Part3: Embedding

In [None]:
from tqdm import tqdm
from gensim.models import Word2Vec
def emb(data, f2):
    """Return one row per user holding the mean Word2Vec embedding of the user's ``f2`` values.

    Each user's ``f2`` values, in log order and cast to str, form one
    "sentence". A 5-dimensional Word2Vec model is trained on all sentences,
    and every user is represented by the mean vector of their tokens.

    Args:
        data: frame with a 'user_id' column and the column named by ``f2``.
        f2: name of the column whose values become the tokens.

    Returns:
        DataFrame with 'user_id' plus columns ``<f2>_emb_0`` .. ``<f2>_emb_4``.
    """
    emb_size = 5
    # Plain agg(list): the dict-renaming form agg({name: list}) on a
    # SeriesGroupBy is rejected by pandas >= 1.0.
    per_user = data.groupby('user_id')[f2].agg(list)
    tmp = pd.DataFrame({'user_id': per_user.index})
    sentences = [[str(x) for x in seq] for seq in per_user]
    w2v_kwargs = dict(window=2, min_count=1, sg=0, hs=1, seed=2019)
    try:
        # gensim >= 4 names the dimensionality argument 'vector_size' ...
        model = Word2Vec(sentences, vector_size=emb_size, **w2v_kwargs)
    except TypeError:
        # ... gensim 3.x calls it 'size'.
        model = Word2Vec(sentences, size=emb_size, **w2v_kwargs)
    # Go through model.wv: membership tests and indexing on the model object
    # itself were removed in gensim 4.
    word_vectors = model.wv
    emb_matrix = []
    for seq in tqdm(sentences):
        vec = [word_vectors[w] for w in seq if w in word_vectors]
        if len(vec) > 0:
            emb_matrix.append(np.mean(vec, axis=0))
        else:
            emb_matrix.append([0] * emb_size)
    emb_matrix = np.array(emb_matrix)
    for i in range(emb_size):
        tmp['{}_emb_{}'.format(f2, i)] = emb_matrix[:, i]
    del model, word_vectors, emb_matrix, sentences
    return tmp


# Attach the per-user embeddings of each id column to both feature tables.
# The prediction table is named `test` everywhere else in this notebook;
# `predict` was never defined.
for feat in ['item_id','cat_id','brand_id']:
    t=emb(user_log,feat)
    train=pd.merge(train,t, on=['user_id'], how='left')
    test=pd.merge(test,t, on=['user_id'], how='left')

**Embeddings are learned from each user's click sequence; DeepWalk is also worth trying.**

# Part4:Under sample

In [None]:
def lower_sample_data_by_sample(df,percent=8):
    """Undersample the label-0 rows to at most ``percent`` times the number of label-1 rows.

    Args:
        df: frame with a 'label' column holding 0 (majority) and 1 (minority).
        percent: wanted ratio of majority rows to minority rows.

    Returns:
        New frame: the sampled label-0 rows followed by all label-1 rows
        (original index labels are kept).
    """
    most_data = df[df['label'] == 0]
    minority_data = df[df['label'] == 1]
    # Sampling without replacement cannot return more rows than exist, so cap
    # the request at the majority size instead of letting sample() raise.
    n_keep = min(int(percent * len(minority_data)), len(most_data))
    lower_data = most_data.sample(n=n_keep, replace=False, random_state=0, axis=0)
    return pd.concat([lower_data, minority_data])
# Undersample the training set, then renumber its rows 0..n-1.
train = lower_sample_data_by_sample(train).reset_index(drop=True)

**Oversampling is also worth trying.**

# Part5:Train the model and merge

In [None]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier,Dataset
from xgboost import XGBClassifier,DMatrix
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [None]:
# Missing feature values are encoded as -1 in both tables.
train = train.fillna(-1)
train[['age_range', 'gender']] = train[['age_range', 'gender']].astype('int8')
# Keep the target separately and build the feature matrix without it.
label = train['label']
trainNew = train.drop(columns=['label'])

test = test.fillna(-1)
test[['age_range', 'gender']] = test[['age_range', 'gender']].astype('int8')
# 'prob' is the column to be predicted, not a feature.
test = test.drop(columns='prob')

In [None]:
# Columns handed to CatBoost (cat_features=) and LightGBM (categorical_feature=)
# as categorical features in the training cells below.
cat_features = ['user_id','merchant_id','age_range', 'gender']
def trainData(train_df,label_df):
    """Split the features and labels into 5 stratified, shuffled folds.

    Args:
        train_df: feature frame, indexed positionally via ``iloc``.
        label_df: labels aligned row by row with ``train_df``.

    Returns:
        Four parallel lists with one entry per fold:
        (training features, training labels, held-out features, held-out labels).
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=2009)
    trainX, trainY, testX, testY = [], [], [], []
    for fit_idx, holdout_idx in splitter.split(X=train_df, y=label_df):
        trainX.append(train_df.iloc[fit_idx, :])
        trainY.append(label_df.iloc[fit_idx])
        testX.append(train_df.iloc[holdout_idx, :])
        testY.append(label_df.iloc[holdout_idx])
    return trainX,trainY,testX,testY

# Materialise the 5 folds once; the XGBoost, CatBoost and LightGBM cells below
# all index into these same lists.
trainX,trainY,testX,testY = trainData(trainNew,label)

In [None]:
# One XGBoost model per fold; each model's test-set probabilities are collected.
pred_xgbs = []
# .copy(): a 'prob' column is added to this frame later, which must not be a
# write into a slice of `test`.
pred_result = test[['merchant_id','user_id']].copy()

for i in range(5):
    # Named xgb_model so the `xgb` module alias imported at the top is not shadowed.
    xgb_model = XGBClassifier(n_estimators=2000,learning_rate=0.008,eval_metric='auc',random_state=111)
    # The held-out fold drives early stopping (200 rounds without AUC improvement).
    xgb_model.fit(trainX[i],trainY[i],eval_set=[(testX[i],testY[i])],early_stopping_rounds=200,eval_metric='auc')
    print(xgb_model.evals_result_)
    # best_iteration is a 0-based index while ntree_limit is a tree count:
    # passing best_iteration drops the best tree, and 0 means "use all trees".
    # best_ntree_limit is the matching count.
    pred = xgb_model.predict_proba(test, ntree_limit=xgb_model.best_ntree_limit)[:,1]
    pred_xgbs.append(pred)

In [None]:
# One CatBoost model per fold; each model's test-set probabilities are collected.
pred_cats = []
for i in range(5):
    # Named cat_model so the `cat` module alias imported at the top is not shadowed.
    cat_model = CatBoostClassifier(learning_rate=0.008, iterations=2000, eval_metric='AUC', random_state=198)
    # The held-out fold drives early stopping (100 rounds without improvement).
    cat_model.fit(trainX[i], trainY[i], eval_set=[(testX[i], testY[i])], early_stopping_rounds=100,
                  use_best_model=True,cat_features=cat_features)
    print(cat_model.evals_result_)
    # ntree_end is an exclusive bound and best_iteration_ a 0-based index, so
    # +1 keeps the best tree (and avoids ntree_end=0, which means "all trees").
    pred = cat_model.predict_proba(test, ntree_end=cat_model.best_iteration_ + 1)[:, 1]
    pred_cats.append(pred)

In [None]:
# One LightGBM model per fold; each model's test-set probabilities are collected.
pred_lgbms = []
for fold_x, fold_y, valid_x, valid_y in zip(trainX, trainY, testX, testY):
    lgbm = LGBMClassifier(n_estimators=1000,learning_rate=0.02,metric=['auc'], random_state=20)
    # The held-out fold drives early stopping (100 rounds without AUC improvement).
    lgbm.fit(
        fold_x,
        fold_y,
        eval_set=[(valid_x, valid_y)],
        early_stopping_rounds=100,
        eval_metric='auc',
        categorical_feature=cat_features,
    )
    print(lgbm.evals_result_)
    # Predict with the iteration count chosen by early stopping.
    fold_proba = lgbm.predict_proba(test,num_iteration=lgbm.best_iteration_)
    pred_lgbms.append(fold_proba[:,1])

**LightGBM takes the least time and achieves the best score; combining the three models consistently improves the result.**

In [None]:
def sigmoid_ver(x):
    """Inverse sigmoid (logit): map probabilities to log-odds, ``log(x / (1 - x))``.

    Inputs are converted to float64 and clipped to ``[1e-15, 1 - 1e-15]`` first,
    so probabilities of exactly 0 or 1 give large finite log-odds instead of
    -inf / +inf (which could turn a sum of log-odds into NaN). Array-like
    input is returned as a NumPy array.
    """
    eps = 1e-15
    # float64 is needed: in float32, 1 - 1e-15 rounds back to exactly 1.0.
    p = np.clip(np.asarray(x, dtype=np.float64), eps, 1 - eps)
    return np.log(p / (1 - p))
def sigmoid(x):
    """Logistic function: map log-odds ``x`` to a probability, ``1 / (1 + e**-x)``."""
    denominator = 1 + np.e ** (-x)
    return 1 / denominator

# Blend the 15 models (3 libraries x 5 folds) in log-odds space: sum their
# logits, average, and map the mean back to a probability.
# Sized from pred_result, the frame that receives the probabilities (`predict`
# was never defined).
pred_t = np.zeros(len(pred_result))

for i in range(5):
    pred_t += sigmoid_ver(pred_lgbms[i]) + sigmoid_ver(pred_xgbs[i]) + sigmoid_ver(pred_cats[i])

result = sigmoid(pred_t/15)
# assign() builds a new frame, so this never writes into a slice of `test`.
pred_result = pred_result.assign(prob=result)
pred_result[['user_id','merchant_id','prob']].to_csv('submission.csv', index=False)

# Part6:Stacking

In [None]:
from sklearn import datasets
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import numpy as np

# Stacking demo: three base classifiers and a logistic-regression meta-learner,
# each scored with 3-fold cross-validated accuracy.
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)

print('3-fold cross validation:\n')

# The loop variable is clf_name, not `label`, so the target Series `label` is
# not overwritten. The features/target are the notebook's trainNew / label;
# train_x / train_y only exist as locals inside trainData.
for clf, clf_name in zip(
    [clf1, clf2, clf3, sclf],
    ['KNN', 'Random Forest', 'Naive Bayes', 'StackingClassifier']):

    scores = model_selection.cross_val_score(clf, trainNew, label, cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), clf_name))