In [100]:
import numpy as np   
import time
import pandas as pd
import os
import tensorflow as tf
from sklearn.model_selection import ShuffleSplit
from keras.models import Model
from keras.layers import Input, Dense, Dropout
from keras.regularizers import l2

In [101]:
# Use the first GPU card
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Automatically grow GPU memory usage
gpu_options = tf.GPUOptions(allow_growth=True)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

# Set the session used by Keras
tf.keras.backend.set_session(sess)


In [102]:
class PreProcessing():

    def __init__(self, path, train, test, result_name):
        self.path = path
        self.traindata = self.path+train
        self.testdata = self.path+test
        self.result_name = self.path+result_name
        
        self.to_int=0
        # the columns I want to drop
        self.d_col = ['day', 'month', 'pdays','previous']
        
    def read_csv(self, f):
        data_list = pd.read_csv(f,sep=",")
        return data_list


    # StandardScaler the data
    def normalized(self, df):
        
        mean = np.mean(df, axis=0)
        std = np.std(df, axis=0)
        var = std * std

        df_normalized = df - mean
        df_normalized = df_normalized / std

        return df_normalized


    def convert_to_value(self, df):
        # get the value type columns
        value_cols = df.describe().columns
        # get the word type columns
        word_cols = list(set(df.columns)-set(value_cols))

        # make a dictionary to every kind of type
        # ,and change entire word type df to value type
        for col in word_cols:
            get_values = set(df[col].values)
            category_dict = {v:i for i,v in enumerate(get_values)}        

            arr = []
            for value in df[col].values:
                arr.append(category_dict[value])

            df[col] = np.array(arr)

        return df


    def cross_validation_split(self, df):
        # get the data and output
        X = df.values[:, :-1]
        Y = df.values[:,-1]

        # random split data to 4 pieces, the test size is .25
        # want to use cross validation
        rs = ShuffleSplit(n_splits=10, test_size=.01)
        rs_list = rs.split(X)      

        return X, Y, rs_list


    def label2int(self, ind):
        
        if ind==0:
            self.to_int=0
        elif ind==1:
            self.to_int=1

        return self.to_int


    # make the y_label to onehot
    def ylabel_to_onehot(self, Y):
        onehotY = np.zeros((Y.shape[0],2))
        for i in range(Y.shape[0]):
            onehotY[i][self.label2int(Y[i])] = 1

        return onehotY
    
    
    # drop the columns I don't want
    def drop_col(self, df):
        df = df.drop(columns=self.d_col)
        
        return df
    
    # map the ordered category
    def process_ordered_list(self, df):
        education_map = {'unknown':0,
                         'primary':1,
                         'secondary':2,
                         'tertiary':3       
        }
        
        df['education'] = df['education'].map(education_map)
        
        return df
    
    # use onthot to process other category
    def onehot_without_y(self, df):
        onehot_cols = list(set(df.columns)-set(df.describe().columns)-set('y'))
        onehotpd = pd.get_dummies(df[onehot_cols])
        df = df.drop(columns=onehot_cols)
        df = pd.concat([onehotpd, df], axis=1)
        
        return df


In [103]:
def pre_process(p_data, csv_df):
    csv_df = p_data.drop_col(csv_df)
    
    nor_cols = csv_df.describe().columns
    csv_df[nor_cols] = p_data.normalized(csv_df[nor_cols])
    
    csv_df = p_data.process_ordered_list(csv_df)
    csv_df = p_data.onehot_without_y(csv_df)
    
    return csv_df


In [104]:
def spilt_trainset(csv_df):
    X, Y, rs_list = p_data.cross_validation_split(csv_df)

    # get the first fold split
    rsl1,rsl2 = list(rs_list)[0]

    onehotY = p_data.ylabel_to_onehot(Y)
    
    return X,onehotY,rsl1,rsl2

In [105]:
def bulid_model(X,Y,*list_index):

    li1,li2 = list_index
    batch_size = 32
    epochs = 1

    bank_input = Input(shape=(X.shape[1],))
    hidden = Dense(10, activation='relu', kernel_regularizer=l2(0.1),activity_regularizer=l2(0.1))(bank_input)
#     dr = Dropout(0.4)(hidden)
#     hidden = Dense(10, activation='relu', kernel_regularizer=l2(0.01),activity_regularizer=l2(0.01))(dr)
#     hidden = Dense(256, activation='relu', kernel_regularizer=l2(0.01),activity_regularizer=l2(0.01))(hidden)
#     hidden = Dense(128, activation='relu', kernel_regularizer=l2(0.01),activity_regularizer=l2(0.01))(hidden)
    output = Dense(2, activation='softmax')(hidden)

    model = Model(inputs=bank_input, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    history = model.fit(X[li1], Y[li1],
             batch_size=batch_size,
             epochs=epochs,
             validation_data=(X[li2], Y[li2]),
             )

    return model

In [106]:
def test(testdata, p_data, model):
    
    test_df = pre_process(p_data, testdata)

    y_prob = model.predict(test_df)
    y_classes = y_prob.argmax(axis=-1)
    sub_csv = pd.DataFrame({'id':np.arange(y_classes.shape[0]),
                            'ans':y_classes
                           },
                          columns=['id','ans'])

    
    sub_csv.to_csv(p_data.result_name, index=False)
    
    return y_classes

In [107]:
path = "/dataset/Bank Marketing Data Set/"
train_data = "training_data.csv"
test_data = "testing_data.csv"
result_name = 'sub_res2.csv'

p_data = PreProcessing(path, train_data, test_data, result_name)

traindata = p_data.read_csv(p_data.traindata)
csv_df = pre_process(p_data, traindata)
X, onehotY, rsl1, rsl2 = spilt_trainset(csv_df)
model = bulid_model(X,onehotY,rsl1,rsl2)

testdata = p_data.read_csv(p_data.testdata)
test_v = test(testdata,p_data,model)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_13 (InputLayer)        (None, 33)                0         
_________________________________________________________________
dense_28 (Dense)             (None, 10)                340       
_________________________________________________________________
dense_29 (Dense)             (None, 2)                 22        
Total params: 362
Trainable params: 362
Non-trainable params: 0
_________________________________________________________________
Train on 40282 samples, validate on 407 samples
Epoch 1/1


In [108]:
res_path = '/dataset/Bank Marketing Data Set/evaluate.csv'

evalution = pd.read_csv(res_path)
eva_v = evalution['ans'].values

In [111]:
from sklearn.metrics import accuracy_score

In [112]:
scores = accuracy_score(test_v, eva_v)  
scores

0.9117647058823529