In [1]:
import time
import warnings

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings("ignore")



In [1]:
def data_process(data,Smallest_Value_EINGEFUEGT_AM):
    """Normalise the date column and the CAMEO code columns of ``data`` in place.

    * ``EINGEFUEGT_AM`` is parsed with ``pd.to_datetime`` and replaced by the
      number of days (as a float) elapsed since ``Smallest_Value_EINGEFUEGT_AM``.
    * In ``CAMEO_DEUG_2015`` and ``CAMEO_INTL_2015`` every entry that compares
      equal to one of the numbers 1..9 is replaced by the matching digit
      string (``1`` or ``1.0`` -> ``'1'``). All other entries are left alone.
      A column that is absent from ``data`` is skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an ``EINGEFUEGT_AM`` column. It is modified in place.
    Smallest_Value_EINGEFUEGT_AM : datetime-like
        Reference point subtracted from the parsed dates
        (presumably a ``pd.Timestamp`` -- confirm against the caller).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for call chaining.
    """
    inserted_at = pd.to_datetime(data['EINGEFUEGT_AM'])
    # pd.Timedelta keeps the block free of the (never imported) datetime.timedelta.
    data['EINGEFUEGT_AM'] = (inserted_at - Smallest_Value_EINGEFUEGT_AM) / pd.Timedelta(days=1)

    single_digits = list(range(1, 10))
    for var in ['CAMEO_DEUG_2015', 'CAMEO_INTL_2015']:
        # Best effort: a frame without this column is simply left untouched.
        if var not in data.columns:
            continue
        column = data[var]
        is_single_digit = column.isin(single_digits)
        if not is_single_digit.any():
            continue
        # Work on an object copy so digit strings can be stored even when the
        # column currently has a numeric dtype.
        converted = column.astype(object)
        converted[is_single_digit] = column[is_single_digit].astype(int).astype(str)
        data[var] = converted
    return data
    

In [1]:
def Diff_List(li1, li2):
    """Return, as a list, the items that occur in exactly one of the two inputs.

    Items only in ``li1`` come first, followed by items only in ``li2``.
    Duplicates are collapsed because both inputs are converted to sets.
    """
    first, second = set(li1), set(li2)
    only_in_first = list(first - second)
    only_in_second = list(second - first)
    return only_in_first + only_in_second

In [None]:
## For each categorical variable, create one dummy variable per value.
## Keep a value group only if its mean (the proportion of people in that group) differs between customers and azdias in a statistically significant way.

def _encode_category_levels(frame, var):
    """One-hot encode ``frame[var]`` and name every column ``<var>_<level>``."""
    encoded = pd.get_dummies(frame[var])
    encoded.columns = [var + '_' + str(level) for level in encoded.columns]
    return encoded


def get_category_data_azdias_customers(category_var_summary,population_threshold,ratio_threshold,p_threshold,azdias_data=None,customers_data=None):
    """Build dummy-variable frames holding only the levels that separate customers from azdias.

    For every variable named in ``category_var_summary.col_name`` the variable
    is one-hot encoded in both frames. A level (dummy column) is kept when

    1. it is not present in both frames with a two-sample t-test p-value
       ``>= p_threshold`` (levels present in only one frame skip this test),
    2. its mean is ``>= population_threshold`` in at least one frame, and
    3. ``abs(mean_customers / mean_azdias - 1) >= ratio_threshold``.

    Parameters
    ----------
    category_var_summary : pandas.DataFrame
        Must expose a ``col_name`` column listing the variables to encode.
    population_threshold, ratio_threshold, p_threshold : float
        Cut-offs used in the three rules above.
    azdias_data, customers_data : pandas.DataFrame, optional
        Frames to encode. When omitted, the notebook-level ``azdias`` and
        ``customers`` objects are used, as in the original implementation.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(azdias_category_data, customers_category_data)``. Both carry the
        same dummy columns. A level that does not occur in one of the frames
        is added there as an all-zero column instead of raising ``KeyError``.
        With no variables to process, two column-less frames are returned.
    """
    # Fall back to the notebook globals so existing four-argument calls keep working.
    if azdias_data is None:
        azdias_data = azdias
    if customers_data is None:
        customers_data = customers

    customers_category_use = {}
    azdias_category_use = {}

    for var in category_var_summary.col_name:
        var_encoded_azdias = _encode_category_levels(azdias_data, var)
        var_encoded_customers = _encode_category_levels(customers_data, var)

        # Levels seen in both frames are tested; insignificant ones are dropped.
        azdias_levels = set(var_encoded_azdias.columns)
        not_significant = []
        for var_sub in var_encoded_customers.columns:
            if var_sub not in azdias_levels:
                continue
            # Cast to float so the test also accepts boolean dummy columns.
            p_value = stats.ttest_ind(
                var_encoded_azdias[var_sub].astype(float),
                var_encoded_customers[var_sub].astype(float),
            )[1]
            # A NaN p-value (zero variance in both samples) compares False and
            # therefore leaves the level in play, as before.
            if p_value >= p_threshold:
                not_significant.append(var_sub)

        azdias_means = (
            var_encoded_azdias.drop(columns=not_significant)
            .mean()
            .rename_axis('var_sub')
            .reset_index(name='mean_value_azdias')
        )
        customers_means = (
            var_encoded_customers.drop(columns=not_significant)
            .mean()
            .rename_axis('var_sub')
            .reset_index(name='mean_value_customers')
        )

        # Outer join: a level missing from one frame gets a mean of 0 there.
        merged = pd.merge(customers_means, azdias_means, on='var_sub', how='outer').fillna(0)
        populated = (
            (merged['mean_value_customers'] >= population_threshold)
            | (merged['mean_value_azdias'] >= population_threshold)
        )
        merged = merged[populated]
        # Division by a zero azdias mean yields inf, which passes the ratio test.
        diff_ratio = (merged['mean_value_customers'] / merged['mean_value_azdias'] - 1).abs()
        levels_to_keep = list(merged.loc[diff_ratio >= ratio_threshold, 'var_sub'])

        # reindex (not plain indexing) so a level absent from one frame becomes
        # an all-zero column there and both outputs stay aligned.
        customers_category_use[var] = var_encoded_customers.reindex(columns=levels_to_keep, fill_value=0)
        azdias_category_use[var] = var_encoded_azdias.reindex(columns=levels_to_keep, fill_value=0)

    if not azdias_category_use:
        return pd.DataFrame(index=azdias_data.index), pd.DataFrame(index=customers_data.index)

    # One concat per output instead of growing a frame inside a loop.
    azdias_category_data = pd.concat(list(azdias_category_use.values()), axis=1)
    customers_category_data = pd.concat(list(customers_category_use.values()), axis=1)

    return azdias_category_data,customers_category_data

In [None]:
def get_cluster_characteristic(Cluster_check):
    """Summarise one cluster's ``is_target`` values per attribute and save them as CSV.

    Reads the notebook-level objects ``Matrix_Check_Corr_all`` and
    ``lookup_only_value_all`` (both indexed by ``Cluster_check``; they are
    defined outside this cell). The ``is_target`` entries are joined to the
    lookup table on ``Attribute``, rows whose ``Value`` equals -1 are dropped,
    and the result is written to ``Summary_cluster_<Cluster_check>.csv`` in the
    working directory.

    Returns the filtered summary frame.
    """
    target_column = Matrix_Check_Corr_all[Cluster_check]['is_target'].reset_index()

    # The attribute name is the label with its last "_"-separated token removed.
    attribute_names = []
    for label in target_column['index']:
        tokens = label.split('_')
        attribute_names.append('_'.join(tokens[:-1]))
    target_column['Attribute'] = attribute_names

    joined = pd.merge(lookup_only_value_all[Cluster_check], target_column, on=['Attribute'])
    summary = joined[joined.Value != -1]

    output_path = 'Summary_cluster_' + str(Cluster_check) + '.csv'
    summary.to_csv(output_path, index=False)
    return summary


In [2]:
def get_data(mailout_train,var_for_mailout,numeric_var,customer_category_var_summary):
    """Assemble the model input frame: selected dummy columns plus median-imputed numerics.

    Parameters
    ----------
    mailout_train : pandas.DataFrame
        Source frame. It is not modified.
    var_for_mailout : iterable of str
        Variables to use. Every entry that is not in ``numeric_var`` is
        treated as categorical and one-hot encoded as ``<var>_<level>``.
    numeric_var : iterable of str
        Numeric columns. Missing values are replaced by the column median.
    customer_category_var_summary : pandas.DataFrame
        Its ``col_name`` column lists the dummy columns to keep.

    Returns
    -------
    pandas.DataFrame
        Kept dummy columns (in ``var_for_mailout`` order, levels in encoding
        order) followed by the numeric columns.
    """
    numeric_columns = list(dict.fromkeys(numeric_var))
    numeric_lookup = set(numeric_columns)
    # Keep the caller's order so the feature layout is reproducible between
    # runs. Iterating a set, as before, gave an arbitrary column order.
    category_columns = [var for var in dict.fromkeys(var_for_mailout) if var not in numeric_lookup]
    wanted_dummies = set(customer_category_var_summary.col_name)

    encoded_parts = []
    for var in category_columns:
        var_encoded = pd.get_dummies(mailout_train[var])
        var_encoded.columns = [var + '_' + str(level) for level in var_encoded.columns]
        # Select with a list: pandas >= 2.0 rejects a set as a column indexer.
        keep_var = [column for column in var_encoded.columns if column in wanted_dummies]
        encoded_parts.append(var_encoded[keep_var])

    if encoded_parts:
        mailout_train_category_data = pd.concat(encoded_parts, axis=1)
    else:
        # No categorical variables: contribute no columns but keep the row index.
        mailout_train_category_data = pd.DataFrame(index=mailout_train.index)

    # Explicit copy so the imputation never touches the caller's frame.
    mailout_train_numeric = mailout_train[numeric_columns].copy()
    for numeric_var_one in numeric_columns:
        column = mailout_train_numeric[numeric_var_one]
        mailout_train_numeric[numeric_var_one] = column.fillna(column.median())

    mailout_train_data_all = pd.concat([mailout_train_category_data,mailout_train_numeric],axis=1)
    return mailout_train_data_all

In [None]:
def get_data2(mailout_train,var_for_mailout,numeric_var,customer_category_var_summary):
    """Like ``get_data`` but also return the numeric and categorical parts separately.

    Parameters
    ----------
    mailout_train : pandas.DataFrame
        Source frame. It is not modified.
    var_for_mailout : iterable of str
        Variables to use. Every entry that is not in ``numeric_var`` is
        treated as categorical and one-hot encoded as ``<var>_<level>``.
    numeric_var : iterable of str
        Numeric columns. Missing values are replaced by the column median.
    customer_category_var_summary : pandas.DataFrame
        Its ``col_name`` column lists the dummy columns to keep.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame, pandas.DataFrame)
        ``(mailout_train_data_all, mailout_train_numeric,
        mailout_train_category_data)``. The first frame is the column-wise
        concatenation of the categorical part and the numeric part.
    """
    numeric_columns = list(dict.fromkeys(numeric_var))
    numeric_lookup = set(numeric_columns)
    # Caller order instead of set order, so the column layout is reproducible.
    category_columns = [var for var in dict.fromkeys(var_for_mailout) if var not in numeric_lookup]
    wanted_dummies = set(customer_category_var_summary.col_name)

    encoded_parts = []
    for var in category_columns:
        var_encoded = pd.get_dummies(mailout_train[var])
        var_encoded.columns = [var + '_' + str(level) for level in var_encoded.columns]
        # Select with a list: pandas >= 2.0 rejects a set as a column indexer.
        keep_var = [column for column in var_encoded.columns if column in wanted_dummies]
        encoded_parts.append(var_encoded[keep_var])

    if encoded_parts:
        mailout_train_category_data = pd.concat(encoded_parts, axis=1)
    else:
        # No categorical variables: contribute no columns but keep the row index.
        mailout_train_category_data = pd.DataFrame(index=mailout_train.index)

    # The numeric part is taken from the argument. The previous code read an
    # undefined name here and filled gaps with the literal string 'median'.
    mailout_train_numeric = mailout_train[numeric_columns].copy()
    for numeric_var_one in numeric_columns:
        column = mailout_train_numeric[numeric_var_one]
        mailout_train_numeric[numeric_var_one] = column.fillna(column.median())

    mailout_train_data_all = pd.concat([mailout_train_category_data,mailout_train_numeric],axis=1)
    return mailout_train_data_all,mailout_train_numeric,mailout_train_category_data