In [1]:
def check_digit(x):
    """Compute a mod-10 check digit for a string of decimal digits.

    Characters are weighted from the right: the last character and every
    second one before it count three times, the remaining ones count once.
    The result is the amount needed to bring that weighted sum up to the
    next multiple of 10.

    Parameters
    ----------
    x : str
        Digit string.

    Returns
    -------
    int
        A value in 0..9, or the sentinel -9999 when ``x`` cannot be
        processed (not sliceable, fewer than two characters, or containing
        a character that ``int`` rejects).
    """
    try:
        triple_weighted = x[-1::-2]
        single_weighted = x[-2::-2]
        # Inputs with fewer than two characters have always produced the
        # sentinel; keep that contract explicit.
        if not triple_weighted or not single_weighted:
            return -9999
        total = (3 * sum(int(ch) for ch in triple_weighted)
                 + sum(int(ch) for ch in single_weighted))
        return (10 - total % 10) % 10
    except Exception:
        # Best-effort feature extraction: bad input maps to the sentinel, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return -9999

In [2]:
def category_counts(data):
    """
    Add a per-row count of department columns holding a positive value.

    Every column in the label slice "1-HR PHOTO":"WIRELESS" (both ends
    included) is inspected; for each row the number of those columns whose
    value is > 0 is stored in a new "CategoryCounts" column inserted at
    position 6.

    ``data`` is modified in place and the same object is returned. Because
    ``DataFrame.insert`` refuses duplicate names by default, calling this
    twice on the same frame raises ValueError.
    """
    # Vectorised equivalent of looping over every cell; NaN compares as False.
    positive_per_row = data.loc[:, "1-HR PHOTO":"WIRELESS"].gt(0).sum(axis=1)

    # The Series already carries data.index, so it lines up row-for-row.
    data.insert(6, "CategoryCounts", positive_per_row)

    return data

In [3]:
def company(x):
    """Return the leading six characters of ``x`` as a company prefix.

    Parameters
    ----------
    x : str
        UPC rendered as text.

    Returns
    -------
    str or int
        ``x[:6]`` normally. When those six characters are exactly "000000",
        the single character at position -5 is returned instead. Input that
        cannot be sliced (e.g. None or a float) yields the sentinel -9999.
    """
    try:
        prefix = x[:6]
        if prefix == "000000":
            # NOTE(review): this yields ONE character (x[-5]), not the last
            # five (x[-5:]) -- confirm which was intended before changing.
            return x[-5]
        return prefix
    except Exception:
        # Best-effort: unusable input maps to the sentinel, without also
        # swallowing KeyboardInterrupt / SystemExit as a bare except would.
        return -9999
    
def float_to_str(obj):
    """Render ``obj`` as text and keep only what precedes the first ".".

    Returns None when ``obj`` is the literal string "nan". Note that a float
    NaN is not equal to that string, so it comes back as the text "nan".
    """
    if obj != "nan":
        text = str(obj)
        return text.partition(".")[0]
    return None

In [4]:
from collections import Counter
def mode(x):
    """Return the most frequent element of ``x``.

    When several elements share the highest count, the one the counter
    yields first wins (first-seen order on insertion-ordered dicts).
    Raises ValueError for an empty ``x``.
    """
    tally = Counter(x)
    highest = max(tally.values())
    for value, seen in tally.items():
        if seen == highest:
            return value

In [5]:
# `pd` and `np` are used throughout this notebook but no visible cell imports
# them; import here so the notebook survives Restart Kernel -> Run All.
import numpy as np
import pandas as pd

# Raw line-item data: one CSV for training, one for scoring.
train = pd.read_csv('../dataset/train.csv')
test = pd.read_csv('../dataset/test.csv')

In [6]:
# Convert each UPC to text with float_to_str, then derive the "company"
# feature from that text. Train and test get the identical treatment.
train["Upc"] = train["Upc"].map(float_to_str)
train["company"] = train["Upc"].map(company)

test["Upc"] = test["Upc"].map(float_to_str)
test["company"] = test["Upc"].map(company)

In [7]:
# "Return" flag: 1 on rows with a negative ScanCount, 0 on every other row.
# The first assignment creates the column (unset rows are NaN); the second
# fills everything that is not 1 with 0.
train.loc[train["ScanCount"].lt(0), "Return"] = 1
train.loc[train["Return"].ne(1), "Return"] = 0

test.loc[test["ScanCount"].lt(0), "Return"] = 1
test.loc[test["Return"].ne(1), "Return"] = 0

In [8]:
# Weekday name -> integer code, Monday=0 through Sunday=6.
wd = {
    name: code
    for code, name in enumerate(
        ["Monday", "Tuesday", "Wednesday", "Thursday",
         "Friday", "Saturday", "Sunday"]
    )
}

# Index the dict directly so an unexpected weekday name raises KeyError
# instead of silently turning into NaN.
train["Weekday"] = train["Weekday"].map(lambda day: wd[day])
test["Weekday"] = test["Weekday"].map(lambda day: wd[day])

In [9]:
# Pos_Sum: ScanCount with every negative value replaced by 0.
train["Pos_Sum"] = train["ScanCount"].clip(lower=0)
test["Pos_Sum"] = test["ScanCount"].clip(lower=0)

In [10]:
# Neg_Sum: ScanCount with every positive value replaced by 0.
train["Neg_Sum"] = train["ScanCount"].clip(upper=0)
test["Neg_Sum"] = test["ScanCount"].clip(upper=0)

In [11]:
# Per-row check digit derived from the UPC text (-9999 where it cannot be computed).
train["check"] = train["Upc"].map(check_digit)
test["check"] = test["Upc"].map(check_digit)

In [12]:
# Upc has been turned into the "company" and "check" features above;
# FinelineNumber is not used further in this notebook.
train = train.drop(["Upc", "FinelineNumber"], axis="columns")
test = test.drop(["Upc", "FinelineNumber"], axis="columns")

In [13]:
# One indicator column per DepartmentDescription value, summed per visit so
# each cell holds the number of line items from that department.
train_dd = (
    pd.concat(
        [train[["VisitNumber"]], pd.get_dummies(train["DepartmentDescription"])],
        axis=1,
    )
    .groupby("VisitNumber", as_index=False)
    .sum()
)
test_dd = (
    pd.concat(
        [test[["VisitNumber"]], pd.get_dummies(test["DepartmentDescription"])],
        axis=1,
    )
    .groupby("VisitNumber", as_index=False)
    .sum()
)

In [14]:
# Sanity check: compare the per-visit department frames (rows, columns).
(train_dd.shape, test_dd.shape)

((95674, 69), (95674, 68))

In [15]:
# Keep only the visit key and the company feature for per-visit aggregation.
train_company = train.loc[:, ["VisitNumber", "company"]]
test_company = test.loc[:, ["VisitNumber", "company"]]

In [16]:
# Collapse to one row per visit: the most frequent company value (see mode()).
train_company = train_company.groupby(by="VisitNumber", as_index=False).aggregate(mode)
test_company = test_company.groupby(by="VisitNumber", as_index=False).aggregate(mode)

In [80]:
# Sanity check: one row per visit, two columns (VisitNumber, company).
(train_company.shape, test_company.shape)

((95674, 2), (95674, 2))

In [17]:
# Per-visit check-digit feature: the most frequent "check" value in each visit.
train_check = (
    train[["VisitNumber", "check"]]
    .groupby("VisitNumber", as_index=False)
    .agg(mode)
)
test_check = (
    test[["VisitNumber", "check"]]
    .groupby("VisitNumber", as_index=False)
    .agg(mode)
)

In [18]:
# Columns that will be summed per visit.
train_by_sum = train.loc[:, ["VisitNumber", "ScanCount", "Pos_Sum", "Neg_Sum"]]
test_by_sum = test.loc[:, ["VisitNumber", "ScanCount", "Pos_Sum", "Neg_Sum"]]

# Columns that will be reduced with max per visit (only train carries TripType).
train_by_max = train.loc[:, ["TripType", "VisitNumber", "Weekday", "Return"]]
test_by_max = test.loc[:, ["VisitNumber", "Weekday", "Return"]]

In [19]:
# Reduce line items to one row per visit: totals for the count columns ...
train_by_sum = train_by_sum.groupby(by="VisitNumber", as_index=False).sum()
train_by_max = train_by_max.groupby(by="VisitNumber", as_index=False).max()

# ... and maxima for the label / weekday / return-flag columns.
test_by_sum = test_by_sum.groupby(by="VisitNumber", as_index=False).sum()
test_by_max = test_by_max.groupby(by="VisitNumber", as_index=False).max()

In [20]:
# Join every per-visit feature table on VisitNumber (default inner join).
train = (
    train_by_sum
    .merge(train_by_max, on="VisitNumber")
    .merge(train_dd, on="VisitNumber")
    .merge(train_company, on="VisitNumber")
    .merge(train_check, on="VisitNumber")
)

test = (
    test_by_sum
    .merge(test_by_max, on="VisitNumber")
    .merge(test_dd, on="VisitNumber")
    .merge(test_company, on="VisitNumber")
    .merge(test_check, on="VisitNumber")
)

In [21]:
# Separate the target from the features.
y = train["TripType"]
# "HEALTH AND BEAUTY AIDS" is removed from train as well; presumably it is the
# department column missing from test (train_dd had one more column than test_dd).
train = train.drop(["TripType", "HEALTH AND BEAUTY AIDS"], axis="columns")

In [22]:
# Add the "CategoryCounts" column to both frames (category_counts inserts it in place).
train, test = category_counts(train), category_counts(test)

In [23]:
# Expand the per-visit Weekday and Return columns into indicator columns.
train, test = (
    pd.get_dummies(frame, columns=["Weekday", "Return"])
    for frame in (train, test)
)

In [24]:
# Sanity check: train and test must end up with the same number of feature columns.
(train.shape, test.shape)

((95674, 83), (95674, 83))

In [25]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [26]:
# Map the TripType values onto consecutive integer labels 0..n_classes-1.
label_enc = LabelEncoder()
y_labeled = label_enc.fit_transform(y)

In [27]:
# Hold out part of the training visits for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(train, y_labeled,
                                                    random_state=0)

In [28]:
# Wrap both splits in xgboost's DMatrix container (raw arrays, so features are positional).
dtrain = xgb.DMatrix(data=X_train.values, label=y_train)
dtest = xgb.DMatrix(data=X_test.values, label=y_test)

In [29]:
# Upper bound on boosting rounds; early stopping below usually ends sooner.
num_boost_round = 300
params = {'objective': 'multi:softprob',
          'eval_metric': 'mlogloss',
          # Derived from the fitted encoder instead of the previously
          # hard-coded 38, so it always matches the labels in y_labeled.
          'num_class': len(label_enc.classes_),
          'max_delta_step': 3,
          'eta': 0.25}

# The last entry of the watchlist ('eval') is the one early stopping monitors.
evals = [(dtrain, 'train'), (dtest, 'eval')]


bst = xgb.train(params=params,
                dtrain=dtrain,
                num_boost_round=num_boost_round,
                evals=evals,
                early_stopping_rounds=10)

[0]	train-mlogloss:2.93823	eval-mlogloss:2.94892
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 10 rounds.
[1]	train-mlogloss:2.35311	eval-mlogloss:2.37942
[2]	train-mlogloss:1.93003	eval-mlogloss:1.9707
[3]	train-mlogloss:1.6803	eval-mlogloss:1.73115
[4]	train-mlogloss:1.50745	eval-mlogloss:1.56622
[5]	train-mlogloss:1.37688	eval-mlogloss:1.44358
[6]	train-mlogloss:1.2759	eval-mlogloss:1.35047
[7]	train-mlogloss:1.19559	eval-mlogloss:1.2766
[8]	train-mlogloss:1.13028	eval-mlogloss:1.21721
[9]	train-mlogloss:1.07474	eval-mlogloss:1.16754
[10]	train-mlogloss:1.02785	eval-mlogloss:1.12674
[11]	train-mlogloss:0.988206	eval-mlogloss:1.09259
[12]	train-mlogloss:0.954186	eval-mlogloss:1.06379
[13]	train-mlogloss:0.924516	eval-mlogloss:1.03978
[14]	train-mlogloss:0.898925	eval-mlogloss:1.01908
[15]	train-mlogloss:0.875435	eval-mlogloss:1.00065
[16]	train-mlogloss:0.855257	eval-mlogloss:0.984354
[17]	t

In [30]:
# Use the encoder's own class array: position i is exactly the original
# TripType that was encoded as label i. Building the array from a set() gave
# no ordering guarantee, so probability columns could be mislabelled.
classes = label_enc.classes_

In [31]:
# Score the per-visit test features with the trained booster.
dmtest = xgb.DMatrix(data=test.values)
pred_proba = bst.predict(dmtest)

# One probability column per class, named "TripType_<label>"; column i of
# pred_proba is labelled with classes[i], so `classes` must be in encoder order.
proba_df = pd.DataFrame(
    pred_proba,
    columns=["TripType_" + str(label) for label in classes],
)
sub_df = pd.concat([test[["VisitNumber"]], proba_df], axis=1)
sub_df.to_csv("../submission_0410_noProb_xgb_02.csv", index=False)