In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn import model_selection

In [53]:
def DataPreprocessing():
    # Merge tag to operation and transaction
    print("Merge tag to operation and transaction ...")
    uid_tag_dict = dict(zip(tag["UID"], tag["Tag"]))
    operation_train["tag"] = operation_train["UID"].map(uid_tag_dict)
    transaction_train["tag"] = transaction_train["UID"].map(uid_tag_dict)
    
    # Fillna 
    print("Fillna ...")
    operation_train.fillna(operation_train.mode().iloc[0], inplace=True)
    transaction_train.fillna(transaction_train.mode().iloc[0], inplace=True)
    operation_test.fillna(operation_test.mode().iloc[0], inplace=True)
    transaction_test.fillna(transaction_test.mode().iloc[0], inplace=True)
    
    # Label encode
    print("Operation label encode ...")
    operations = [operation_train, operation_test]
    for operation in operations:
        time = [t.split(":") for t in operation["time"]]
        operation["time"] = [int(h) * 3600 + int(m) * 60 + int(s) for (h, m, s) in time]
        categorical_features = ['mode', 'os', 'version', 'device1', 'device2', 'device_code1', 'device_code2', 'device_code3', 
                        'mac1', 'mac2', 'ip1', 'ip2', 'wifi', 'geo_code', 'ip1_sub', 'ip2_sub']
        le = preprocessing.LabelEncoder()
        for feature in categorical_features:
            operation.loc[:, feature] = le.fit_transform(operation.loc[:, feature])
     
    print("Transaction label encode ...")
    transactions = [transaction_train, transaction_test]
    for transaction in transactions:
        time = [t.split(":") for t in transaction["time"]]
        transaction["time"] = [int(h) * 3600 + int(m) * 60 + int(s) for (h, m, s) in time]
        categorical_features = ['channel', 'amt_src1', 'merchant', 'code1', 'code2', 'trans_type1', 'acc_id1', 'device_code1',
                                'device_code2', 'device_code3', 'device1', 'device2', 'mac1', 'ip1', 'amt_src2', 'acc_id2', 
                                'acc_id3', 'geo_code', 'trans_type2', 'market_code', 'market_type', 'ip1_sub']
        le = preprocessing.LabelEncoder()
        for feature in categorical_features:
            transaction.loc[:, feature] = le.fit_transform(transaction.loc[:, feature])
            
    operation_x_train, operation_y_train = operations[0].iloc[:, :-1].values, operations[0].iloc[:, -1].values
    operation_x_test = operations[1].values
    
    transaction_x_train, transaction_y_train = transactions[0].iloc[:, :-1].values, transactions[0].iloc[:, -1].values
    transaction_x_test = transactions[1].values
    
    return operation_x_train, operation_y_train, operation_x_test, transaction_x_train, transaction_y_train, transaction_x_test

In [54]:
def tpr_weight_function(y_true,y_predict):
    d = pd.DataFrame()
    d['prob'] = list(y_predict)
    d['y'] = list(y_true)
    d = d.sort_values(['prob'], ascending=[0])
    y = d.y
    PosAll = pd.Series(y).value_counts()[1]
    NegAll = pd.Series(y).value_counts()[0]
    pCumsum = d['y'].cumsum()
    nCumsum = np.arange(len(y)) - pCumsum + 1
    pCumsumPer = pCumsum / PosAll
    nCumsumPer = nCumsum / NegAll
    TR1 = pCumsumPer[abs(nCumsumPer-0.001).idxmin()]
    TR2 = pCumsumPer[abs(nCumsumPer-0.005).idxmin()]
    TR3 = pCumsumPer[abs(nCumsumPer-0.01).idxmin()]
    return 0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3

In [55]:
def ModelEvaluation(model, x, y):
    kf = model_selection.KFold(n_splits = 3, random_state = 2018)
    scores = []
    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        score = tpr_weight_function(y_test, y_pred)
        scores.append(score)
    
    return np.mean(scores)

In [64]:
def Fit(model, x_train, y_train):
    model.fit(x_train, y_train)
    return model

In [63]:
def Predict(model, x_test):
    y_pred = model.predict_proba(x_test)[:, 1]
    return y_pred

In [68]:
def GetResult(x_test, y_pred):
    result = pd.DataFrame()
    result["UID"] = x_test["UID"]
    result["Tag"] = y_pred
    return result

In [115]:
def GetSubmission(operation_result, transaction_result):
    result = pd.concat([operation_result, transaction_result], axis = 0)
    result = result.groupby("UID").mean()
    submission = pd.DataFrame()
    submission["UID"] = result.index
    submission["Tag"] = result["Tag"].values
    submission.to_csv("../Submission/submission.csv", index=False)

In [52]:
operation_train = pd.read_csv("../Data/operation_train_new.csv")
transaction_train = pd.read_csv("../Data/transaction_train_new.csv")
tag = pd.read_csv("../Data/tag_train_new.csv")
operation_test = pd.read_csv("../Data/operation_round1_new.csv")
transaction_test = pd.read_csv("../Data/transaction_round1_new.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [56]:
operation_x_train, operation_y_train, operation_x_test, transaction_x_train, transaction_y_train, transaction_x_test = DataPreprocessing()

Merge tag to operation and transaction ...
Fillna ...
Operation label encode ...
Transaction label encode ...


In [61]:
operation_rf_score = ModelEvaluation(RandomForestClassifier(), operation_x_train, operation_y_train)
print("operation_rf_score: ", operation_rf_score)
transaction_rf_score = ModelEvaluation(RandomForestClassifier(), transaction_x_train, transaction_y_train)
print("transaction_rf_score: ", transaction_rf_score)



operation_rf_score:  0.4879438472280517


In [65]:
operation_rf = Fit(RandomForestClassifier(), operation_x_train, operation_y_train)
transaction_rf = Fit(RandomForestClassifier(), transaction_x_train, transaction_y_train)

operation_y_pred = Predict(operation_rf, operation_x_test)
transaction_y_pred = Predict(transaction_rf, transaction_x_test)



In [117]:
operation_xgb = Fit(xgb.XGBClassifier(), operation_x_train, operation_y_train)
transaction_xgb = Fit(xgb.XGBClassifier(), transaction_x_train, transaction_y_train)

operation_y_pred = Predict(operation_xgb, operation_x_test)
transaction_y_pred = Predict(transaction_xgb, transaction_x_test)

In [118]:
operation_result = GetResult(operation_test, operation_y_pred)
transaction_result = GetResult(transaction_test, transaction_y_pred)

In [119]:
GetSubmission(operation_result, transaction_result)