In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import matthews_corrcoef, make_scorer
import xgboost as xgb
import matplotlib.pyplot as plt

## Dataset

### Load dataset

In [2]:
df_train = pd.read_csv('marketing-analytics-dataset/train.csv', encoding='utf8', parse_dates=['grass_date'])
df_test  = pd.read_csv('marketing-analytics-dataset/test.csv', encoding='utf8',  parse_dates=['grass_date'])
df_users = pd.read_csv('marketing-analytics-dataset/users.csv', encoding='utf8')
df_submission = pd.read_csv('marketing-analytics-dataset/sample_submission_0_1.csv')

In [3]:
df_train.tail()

Unnamed: 0,country_code,grass_date,user_id,subject_line_length,last_open_day,last_login_day,last_checkout_day,open_count_last_10_days,open_count_last_30_days,open_count_last_60_days,login_count_last_10_days,login_count_last_30_days,login_count_last_60_days,checkout_count_last_10_days,checkout_count_last_30_days,checkout_count_last_60_days,open_flag,row_id
73534,6,2019-09-02 00:00:00+08:00,127613,39,24,36,279,0,1,1,0,0,0,0,0,0,0,73534
73535,2,2019-09-02 00:00:00+08:00,127620,38,46,10,51,0,0,1,0,0,0,0,0,0,0,73535
73536,2,2019-09-02 00:00:00+08:00,127696,32,Never open,Never login,Never checkout,0,0,0,0,0,0,0,0,0,0,73536
73537,2,2019-09-02 00:00:00+08:00,127807,38,5,34,Never checkout,2,4,4,0,0,0,0,0,0,1,73537
73538,6,2019-09-02 00:00:00+08:00,127880,39,1,3,Never checkout,2,2,3,0,0,0,0,0,0,0,73538


In [4]:
df_test.tail()

Unnamed: 0,country_code,grass_date,user_id,subject_line_length,last_open_day,last_login_day,last_checkout_day,open_count_last_10_days,open_count_last_30_days,open_count_last_60_days,login_count_last_10_days,login_count_last_30_days,login_count_last_60_days,checkout_count_last_10_days,checkout_count_last_30_days,checkout_count_last_60_days,row_id
55965,6,2019-09-29 00:00:00+08:00,127348,53,Never open,4,8,0,0,0,0,0,0,0,0,0,55965
55966,6,2019-09-29 00:00:00+08:00,127396,53,59,802,1207,0,0,1,0,0,0,0,0,0,55966
55967,6,2019-09-29 00:00:00+08:00,127574,43,Never open,7,Never checkout,0,0,0,0,0,0,0,0,0,55967
55968,6,2019-09-29 00:00:00+08:00,127887,43,5,5,6,2,5,14,0,0,0,0,0,0,55968
55969,6,2019-09-29 00:00:00+08:00,127895,53,5,3,20,2,14,27,0,0,0,0,0,0,55969


In [5]:
df_users.tail()

Unnamed: 0,user_id,attr_1,attr_2,attr_3,age,domain
127881,127921,,1.0,0.0,,@yahoo.com
127882,127922,1.0,1.0,0.0,20.0,@gmail.com
127883,127923,,1.0,0.0,,@gmail.com
127884,127924,,0.0,0.0,,@gmail.com
127885,127925,,1.0,0.0,,@gmail.com


### EDA

In [6]:
np.unique(df_train['country_code'])

array([1, 2, 3, 4, 5, 6, 7], dtype=int64)

In [7]:
np.unique(df_train['open_flag'])

array([0, 1], dtype=int64)

In [8]:
len(df_train['open_flag'][df_train['open_flag'] == 0]), len(df_train['open_flag'][df_train['open_flag'] == 1])

(62083, 11456)

In [9]:
class_weights = class_weight.compute_class_weight('balanced',
                                                  np.unique(df_train['open_flag']),
                                                  df_train['open_flag'])
class_weights

1        1
2        0
3        0
4        0
        ..
73534    0
73535    0
73536    0
73537    1
73538    0
Name: open_flag, Length: 73539, dtype: int64 as keyword args. From version 0.25 passing these as positional arguments will result in an error


array([0.59226358, 3.20962814])

In [10]:
n_unique = len(np.unique(df_train['user_id']))
print("# unique users:", n_unique)
print("repeat ratio:", (len(df_train) - n_unique) / len(df_train))

# unique users: 72845
repeat ratio: 0.009437169393111138


In [11]:
print(len(df_train))
print(len(df_train[(df_train['last_open_day'] == 'Never open') & (df_train['open_flag'] == 1)]))
print(len(df_train[(df_train['last_login_day'] == 'Never login') & (df_train['open_flag'] == 1)]))
print(len(df_train[(df_train['last_checkout_day'] == 'Never checkout') & (df_train['open_flag'] == 1)]))

73539
463
74
2334


In [12]:
np.unique(df_train['open_flag'])

array([0, 1], dtype=int64)

In [13]:
print("# of samples:        ", len(df_train))
print("# of open_flag==1000:", len(df_train[df_train['open_flag'] == 1000]))

# of samples:         73539
# of open_flag==1000: 0


In [14]:
min_date = df_train['grass_date'].min()
min_date

Timestamp('2019-07-16 00:00:00+0800', tz='pytz.FixedOffset(480)')

#### Users

In [16]:
print(np.unique(df_users['attr_1'].dropna()))
print(np.unique(df_users['attr_2'].dropna()))
print(np.unique(df_users['attr_3'].dropna()))

[0. 1.]
[0. 1.]
[0. 1. 2. 3. 4.]


In [17]:
print("#Users:", len(df_users))
print(len(df_users['attr_1'].dropna()))
print(len(df_users['attr_2'].dropna()))
print(len(df_users['attr_3'].dropna()))

#Users: 127886
78987
127439
127886


### Preprocess

In [18]:
df_train['has_opened'] = 1*(df_train['last_open_day'] != 'Never open')
df_train.loc[df_train['last_open_day'] == 'Never open', 'last_open_day'] = 1000
df_train['last_open_day'] = df_train['last_open_day'].astype(int)
df_test['has_opened']  =  1*(df_test['last_open_day'] != 'Never open')
df_test.loc[df_test['last_open_day'] == 'Never open', 'last_open_day']   = 1000
df_test['last_open_day'] = df_test['last_open_day'].astype(int)

In [19]:
df_train['has_login'] = 1*(df_train['last_login_day'] != 'Never login')
df_train.loc[df_train['last_login_day'] == 'Never login', 'last_login_day'] = 1000
df_train['last_login_day'] = df_train['last_login_day'].astype(int)
df_test['has_login']  =  1*(df_test['last_login_day'] != 'Never login')
df_test.loc[df_test['last_login_day'] == 'Never login', 'last_login_day']   = 1000
df_test['last_login_day'] = df_test['last_login_day'].astype(int)

In [20]:
df_train['has_checkouted'] = 1*(df_train['last_checkout_day'] != 'Never checkout')
df_train.loc[df_train['last_checkout_day'] == 'Never checkout', 'last_checkout_day'] = 1000
df_train['last_checkout_day'] = df_train['last_checkout_day'].astype(int)
df_test['has_checkouted']  =  1*(df_test['last_checkout_day'] != 'Never checkout')
df_test.loc[df_test['last_checkout_day'] == 'Never checkout', 'last_checkout_day']   = 1000
df_test['last_checkout_day'] = df_test['last_checkout_day'].astype(int)

### Dates

In [35]:
#df_train['year']  = df_train['grass_date'].dt.year
#df_train['month'] = df_train['grass_date'].dt.month
#df_train['day']   = df_train['grass_date'].dt.day
df_train['days_from_start'] = (df_train['grass_date'] - min_date).dt.days

In [36]:
#df_test['year']   = df_test['grass_date'].dt.year
#df_test['month']  = df_test['grass_date'].dt.month
#df_test['day']    = df_test['grass_date'].dt.day
df_test['days_from_start']  = (df_test['grass_date'] - min_date).dt.days

### Merge

In [37]:
df_train_ = pd.concat([df_train, pd.get_dummies(df_train['country_code'], prefix='country')], axis=1)
df_test_  = pd.concat([df_test,  pd.get_dummies(df_test['country_code'],  prefix='country')], axis=1)

In [38]:
df_users_ = pd.concat([df_users, pd.get_dummies(df_users['domain'], prefix='domain')], axis=1)
df_users_.head()

Unnamed: 0,user_id,attr_1,attr_2,attr_3,age,domain,domain_@163.com,domain_@gmail.com,domain_@hotmail.com,domain_@icloud.com,domain_@live.com,domain_@outlook.com,domain_@qq.com,domain_@rocketmail.com,domain_@yahoo.com,domain_@ymail.com,domain_other
0,0,,1.0,0.0,,@gmail.com,0,1,0,0,0,0,0,0,0,0,0
1,1,1.0,1.0,2.0,50.0,@gmail.com,0,1,0,0,0,0,0,0,0,0,0
2,2,,1.0,0.0,,other,0,0,0,0,0,0,0,0,0,0,1
3,3,,1.0,0.0,,@gmail.com,0,1,0,0,0,0,0,0,0,0,0
4,4,1.0,1.0,2.0,33.0,@gmail.com,0,1,0,0,0,0,0,0,0,0,0


In [44]:
df_train_merged = df_train_.merge(df_users_, on='user_id', how='left')
df_train_merged.tail()

Unnamed: 0,country_code,grass_date,user_id,subject_line_length,last_open_day,last_login_day,last_checkout_day,open_count_last_10_days,open_count_last_30_days,open_count_last_60_days,...,domain_@gmail.com,domain_@hotmail.com,domain_@icloud.com,domain_@live.com,domain_@outlook.com,domain_@qq.com,domain_@rocketmail.com,domain_@yahoo.com,domain_@ymail.com,domain_other
73534,6,2019-09-02 00:00:00+08:00,127613,39,24,36,279,0,1,1,...,0,1,0,0,0,0,0,0,0,0
73535,2,2019-09-02 00:00:00+08:00,127620,38,46,10,51,0,0,1,...,1,0,0,0,0,0,0,0,0,0
73536,2,2019-09-02 00:00:00+08:00,127696,32,1000,1000,1000,0,0,0,...,1,0,0,0,0,0,0,0,0,0
73537,2,2019-09-02 00:00:00+08:00,127807,38,5,34,1000,2,4,4,...,1,0,0,0,0,0,0,0,0,0
73538,6,2019-09-02 00:00:00+08:00,127880,39,1,3,1000,2,2,3,...,1,0,0,0,0,0,0,0,0,0


In [45]:
df_test_merged = df_test_.merge(df_users_, on='user_id')
df_test_merged.tail()

Unnamed: 0,country_code,grass_date,user_id,subject_line_length,last_open_day,last_login_day,last_checkout_day,open_count_last_10_days,open_count_last_30_days,open_count_last_60_days,...,domain_@gmail.com,domain_@hotmail.com,domain_@icloud.com,domain_@live.com,domain_@outlook.com,domain_@qq.com,domain_@rocketmail.com,domain_@yahoo.com,domain_@ymail.com,domain_other
55965,6,2019-09-29 00:00:00+08:00,127348,53,1000,4,8,0,0,0,...,1,0,0,0,0,0,0,0,0,0
55966,6,2019-09-29 00:00:00+08:00,127396,53,59,802,1207,0,0,1,...,1,0,0,0,0,0,0,0,0,0
55967,6,2019-09-29 00:00:00+08:00,127574,43,1000,7,1000,0,0,0,...,0,1,0,0,0,0,0,0,0,0
55968,6,2019-09-29 00:00:00+08:00,127887,43,5,5,6,2,5,14,...,1,0,0,0,0,0,0,0,0,0
55969,6,2019-09-29 00:00:00+08:00,127895,53,5,3,20,2,14,27,...,0,1,0,0,0,0,0,0,0,0


In [46]:
df_train_merged.columns

Index(['country_code', 'grass_date', 'user_id', 'subject_line_length',
       'last_open_day', 'last_login_day', 'last_checkout_day',
       'open_count_last_10_days', 'open_count_last_30_days',
       'open_count_last_60_days', 'login_count_last_10_days',
       'login_count_last_30_days', 'login_count_last_60_days',
       'checkout_count_last_10_days', 'checkout_count_last_30_days',
       'checkout_count_last_60_days', 'open_flag', 'row_id', 'has_opened',
       'has_login', 'has_checkouted', 'days_from_start', 'country_1',
       'country_2', 'country_3', 'country_4', 'country_5', 'country_6',
       'country_7', 'attr_1', 'attr_2', 'attr_3', 'age', 'domain',
       'domain_@163.com', 'domain_@gmail.com', 'domain_@hotmail.com',
       'domain_@icloud.com', 'domain_@live.com', 'domain_@outlook.com',
       'domain_@qq.com', 'domain_@rocketmail.com', 'domain_@yahoo.com',
       'domain_@ymail.com', 'domain_other'],
      dtype='object')

### Prepare X-Y

In [47]:
RELEVANT_COLS = [#'country_code', #'grass_date', 
                 #'year', 'month', 'day',
                 'days_from_start',
                 'country_1', 'country_2', 'country_3', 'country_4',
                 'country_5', 'country_6', 'country_7',
                 'subject_line_length', 
                 'last_open_day', 'last_login_day', 'last_checkout_day',
                 'open_count_last_10_days', 'open_count_last_30_days', 'open_count_last_60_days',
                 'login_count_last_10_days', 'login_count_last_30_days', 'login_count_last_60_days',
                 'checkout_count_last_10_days', 'checkout_count_last_30_days', 'checkout_count_last_60_days',
                 'has_opened', 'has_login', 'has_checkouted',
                 #'attr_1', 'attr_2', 'attr_3',
                 'age',
                 'domain_@163.com', 'domain_@gmail.com', 'domain_@hotmail.com',
                 'domain_@icloud.com', 'domain_@live.com', 'domain_@outlook.com',
                 'domain_@qq.com', 'domain_@rocketmail.com', 'domain_@yahoo.com', 
                 'domain_@ymail.com', 'domain_other']
X = df_train_merged[RELEVANT_COLS]
y = df_train_merged['open_flag']
X_test = df_test_merged[RELEVANT_COLS]

In [48]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1)

In [49]:
X_train.shape, y_train.shape

((66185, 36), (66185,))

In [50]:
X_valid.shape, y_valid.shape

((7354, 36), (7354,))

In [51]:
np.unique(y_train)

array([0, 1], dtype=int64)

## Model

In [52]:
params = {
        #'min_child_weight': [1, 5, 10],
        #'gamma': [0.5, 1, 2],
        #'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [5, 10, 15],
        'n_estimators': [50, 100, 150]
        }

In [53]:
xgb_model = xgb.XGBClassifier(scale_pos_weight=class_weights[1], objective='binary:logistic')

In [54]:
clf = GridSearchCV(xgb_model, params, scoring=make_scorer(matthews_corrcoef))

In [55]:
%%time
# xgb_model.fit(X_train, y_train, verbose=1)
clf.fit(X_train, y_train, verbose=1)

Wall time: 10min 50s


GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=3.2096281424581004,
                                     subsample=None, tree_method=None,
    

### Test

In [56]:
# y_preds = xgb_model.predict(X_valid)
y_preds = clf.predict(X_valid)

In [57]:
matthews_corrcoef(y_valid, y_preds)

0.5241645860278625

In [58]:
0.5209553697223651

0.5209553697223651

### Predict

In [59]:
#y_preds = xgb_model.predict(X_test)
y_preds = clf.predict(X_test)

In [60]:
df_submission['open_flag'] = y_preds

In [61]:
df_submission.to_csv('submissions/submission_9.csv', index=False)