In [2]:
import numpy as np
from numpy.core.fromnumeric import _all_dispatcher
import pandas as pd
import joblib
np.random.seed(2021)

In [3]:
##########################################################
######### training code (without any validation) #########

# load data
print('loading train.gz...')
# use only a subset of rows - you should use all rows eventually
df_train = pd.read_csv("train.gz", compression='gzip', nrows=20000, header='infer')
Y = df_train['click']
# discard some columns
unused_cols = ["id", 'site_id', 'app_id']
df_train.drop(unused_cols, axis=1, inplace=True)

loading train.gz...


In [4]:
# create a copy to prevent from modifying the original dataset
df_copy = df_train.copy()

In [5]:
# nunique() shows that the three columns are same across 
df_copy.drop(['hour', 'app_domain', 'app_category', 'click'], axis=1,inplace=True)

In [6]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(df_copy, Y, test_size=0.3)

In [7]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14000 entries, 10657 to 6201
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   C1                14000 non-null  int64 
 1   banner_pos        14000 non-null  int64 
 2   site_domain       14000 non-null  object
 3   site_category     14000 non-null  object
 4   device_id         14000 non-null  object
 5   device_ip         14000 non-null  object
 6   device_model      14000 non-null  object
 7   device_type       14000 non-null  int64 
 8   device_conn_type  14000 non-null  int64 
 9   C14               14000 non-null  int64 
 10  C15               14000 non-null  int64 
 11  C16               14000 non-null  int64 
 12  C17               14000 non-null  int64 
 13  C18               14000 non-null  int64 
 14  C19               14000 non-null  int64 
 15  C20               14000 non-null  int64 
 16  C21               14000 non-null  int64 
dtypes: int64(

In [9]:
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# one_hot_features = ['C1', 'device_type', 'device_conn_type', 'C18']
one_hot_features = [0,7,8,13]
one_hot_transformer = OneHotEncoder(drop='first')

# target_features = ['banner_pos','site_domain','site_category','device_id','device_ip','device_model','C14',
#                    'C15','C16','C17','C19','C20','C21']
target_features = [1,2,3,4,5,6,9,10,11,12,14,15,16]
target_transformer = ce.JamesSteinEncoder()
        
preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot', one_hot_transformer, one_hot_features),
        ('target', target_transformer, target_features)])

In [24]:
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', CatBoostClassifier(iterations=20,learning_rate=0.1,depth=7, eval_metric='Logloss'))])

clf.fit(X_train.values, y_train.values.reshape(-1,1))
y_pred = clf.predict_proba(X_test.values)[:, 1]
print("model logloss: %.3f" % log_loss(y_test, y_pred))

  elif pd.api.types.is_categorical(cols):


0:	learn: 0.6160399	total: 4.02ms	remaining: 76.4ms
1:	learn: 0.5533086	total: 7.28ms	remaining: 65.5ms
2:	learn: 0.5003023	total: 10.1ms	remaining: 57ms
3:	learn: 0.4554998	total: 13.2ms	remaining: 52.7ms
4:	learn: 0.4170045	total: 16.4ms	remaining: 49.1ms
5:	learn: 0.3836853	total: 18.5ms	remaining: 43.1ms
6:	learn: 0.3553075	total: 21ms	remaining: 38.9ms
7:	learn: 0.3301982	total: 23.8ms	remaining: 35.7ms
8:	learn: 0.3088449	total: 26.6ms	remaining: 32.5ms
9:	learn: 0.2898885	total: 30ms	remaining: 30ms
10:	learn: 0.2740571	total: 33.1ms	remaining: 27.1ms
11:	learn: 0.2594245	total: 35.9ms	remaining: 23.9ms
12:	learn: 0.2470110	total: 39.4ms	remaining: 21.2ms
13:	learn: 0.2355591	total: 42.9ms	remaining: 18.4ms
14:	learn: 0.2252611	total: 47ms	remaining: 15.7ms
15:	learn: 0.2156349	total: 49.6ms	remaining: 12.4ms
16:	learn: 0.2068159	total: 52.3ms	remaining: 9.23ms
17:	learn: 0.2003635	total: 56.2ms	remaining: 6.24ms
18:	learn: 0.1942897	total: 58.7ms	remaining: 3.09ms
19:	learn: 0.

#### First baseline model has a very high logloss at 0.525.

In [26]:
categorical_f = ['C1','device_type','device_conn_type','C18','banner_pos','site_domain','site_category',
                 'device_id','device_ip','device_model','C14','C15','C16','C17','C19','C20','C21']
cat = CatBoostClassifier(iterations=20,learning_rate=0.1,depth=7, eval_metric='Logloss')
cat.fit(X_train, y_train,cat_features=categorical_f)
y_pred_cat = cat.predict_proba(X_test.values)[:, 1]
print("model logloss: %.3f" % log_loss(y_test, y_pred_cat))

0:	learn: 0.6538872	total: 14.2ms	remaining: 270ms
1:	learn: 0.6239209	total: 22.4ms	remaining: 201ms
2:	learn: 0.5968163	total: 29.5ms	remaining: 167ms
3:	learn: 0.5743491	total: 37.6ms	remaining: 150ms
4:	learn: 0.5560601	total: 42.7ms	remaining: 128ms
5:	learn: 0.5410683	total: 47.1ms	remaining: 110ms
6:	learn: 0.5274857	total: 55.6ms	remaining: 103ms
7:	learn: 0.5156095	total: 62.2ms	remaining: 93.2ms
8:	learn: 0.5057243	total: 66.1ms	remaining: 80.8ms
9:	learn: 0.4972880	total: 74ms	remaining: 74ms
10:	learn: 0.4904956	total: 78.2ms	remaining: 64ms
11:	learn: 0.4842027	total: 85.4ms	remaining: 56.9ms
12:	learn: 0.4787759	total: 93.5ms	remaining: 50.3ms
13:	learn: 0.4747286	total: 97.8ms	remaining: 41.9ms
14:	learn: 0.4708115	total: 105ms	remaining: 34.9ms
15:	learn: 0.4677292	total: 111ms	remaining: 27.9ms
16:	learn: 0.4646293	total: 118ms	remaining: 20.8ms
17:	learn: 0.4629520	total: 121ms	remaining: 13.5ms
18:	learn: 0.4606878	total: 127ms	remaining: 6.7ms
19:	learn: 0.4586883	t

#### using the default catboost encoder seems to get much better result

### Bayesian Search CV for hyperparameter tuning

In [31]:
from skopt import BayesSearchCV
# parameter ranges are specified by one of below
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import make_scorer

cat1 = CatBoostClassifier(iterations=20,learning_rate=0.1,depth=7,loss_function='Logloss', 
                          cat_features=categorical_f,verbose=False)

param = {
    'iterations': Integer(10, 1000),
    'depth': Integer(1, 8),
    'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'random_strength': Real(1e-9, 10, 'log-uniform'),
    'bagging_temperature': Real(0.0, 1.0),
    'border_count': Integer(1, 255),
    'l2_leaf_reg': Integer(2, 30),
    'scale_pos_weight':Real(0.01, 1.0, 'uniform')
}

LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

# log-uniform: understand as search over p = exp(x) by varying x
opt = BayesSearchCV(
    cat1,
    param,
    scoring = LogLoss,
    n_iter=64,
    cv=5,
    random_state=42
)

# executes bayesian optimization
opt.fit(X_train, y_train)

BayesSearchCV(cv=5, error_score='raise',
              estimator=<catboost.core.CatBoostClassifier object at 0x7ff8930bd3d0>,
              fit_params=None, iid=True, n_iter=64, n_jobs=1, n_points=1,
              optimizer_kwargs=None, pre_dispatch='2*n_jobs', random_state=42,
              refit=True, return_train_score=False,
              scoring=make_scorer(log_loss, greater_is_better=False, needs_proba=True),
              search_spaces={'bagg...
                             'iterations': Integer(low=10, high=1000, prior='uniform', transform='identity'),
                             'l2_leaf_reg': Integer(low=2, high=30, prior='uniform', transform='identity'),
                             'learning_rate': Real(low=0.01, high=1.0, prior='log-uniform', transform='identity'),
                             'random_strength': Real(low=1e-09, high=10, prior='log-uniform', transform='identity'),
                             'scale_pos_weight': Real(low=0.01, high=1.0, prior='uniform', tr

In [35]:
opt.best_params_

OrderedDict([('bagging_temperature', 0.0),
             ('border_count', 48),
             ('depth', 8),
             ('iterations', 759),
             ('l2_leaf_reg', 3),
             ('learning_rate', 0.01),
             ('random_strength', 1e-09),
             ('scale_pos_weight', 1.0)])

In [33]:
best_params={'bagging_temperature': 0.0,
 'border_count': 48,
 'depth': 8,
 'iterations': 759,
 'l2_leaf_reg': 3,
 'learning_rate': 0.01,
 'random_strength': 1e-09,
 'scale_pos_weight': 1.0}

best_params['iterations'] = 1000

from imb

opt_cat = CatBoostClassifier(**best_params, task_type = "GPU",od_type='Iter', loss_function='Logloss')
opt_cat.fit(X_train, y_train,cat_features=categorical_f)
y_pred_cat = cat.predict_proba(X_test.values)[:, 1]

-0.43869105458047825

In [37]:
y_train.value_counts()

0    11312
1     2688
Name: click, dtype: int64

In [73]:
# need to balance the data
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14000 entries, 10657 to 6201
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   C1                14000 non-null  int64 
 1   banner_pos        14000 non-null  int64 
 2   site_domain       14000 non-null  object
 3   site_category     14000 non-null  object
 4   device_id         14000 non-null  object
 5   device_ip         14000 non-null  object
 6   device_model      14000 non-null  object
 7   device_type       14000 non-null  int64 
 8   device_conn_type  14000 non-null  int64 
 9   C14               14000 non-null  int64 
 10  C15               14000 non-null  int64 
 11  C16               14000 non-null  int64 
 12  C17               14000 non-null  int64 
 13  C18               14000 non-null  int64 
 14  C19               14000 non-null  int64 
 15  C20               14000 non-null  int64 
 16  C21               14000 non-null  int64 
dtypes: int64(

In [None]:
def random_search(param_grid, out_file, max_evals = MAX_EVALS):
    """Random search for hyperparameter optimization. 
       Writes result of search to csv file every search iteration."""
    
    
    # Dataframe for results
    results = pd.DataFrame(columns = ['score', 'params', 'iteration'],
                                  index = list(range(MAX_EVALS)))
    for i in range(MAX_EVALS):
        
        # Choose random hyperparameters
        random_params = {k: random.sample(v, 1)[0] for k, v in param_grid.items()}
        random_params['subsample'] = 1.0 if random_params['boosting_type'] == 'goss' else random_params['subsample']

        # Evaluate randomly selected hyperparameters
        eval_results = objective(random_params, i)
        results.loc[i, :] = eval_results

        # open connection (append option) and write results
        of_connection = open(out_file, 'a')
        writer = csv.writer(of_connection)
        writer.writerow(eval_results)
        
        # make sure to close connection
        of_connection.close()
        
    # Sort with best score on top
    results.sort_values('score', ascending = False, inplace = True)
    results.reset_index(inplace = True)

    return results 

### Logistic Regression (provided)

In [16]:
# embedding (all features are categorical)
print('embedding...')
from sklearn.feature_extraction import DictVectorizer
import pickle

try:
    with open('X_train_dict.pkl', 'rb') as ff:
        X_train_dict = pickle.load(ff)
    vectorizer = joblib.load('vectorizer.joblib')
    X_train = vectorizer.transform(X_train_dict)
    print('saved vectorizer loaded & applied to training set')
except:
    X_train_dict = list(df_copy.drop('click', axis=1).T.to_dict().values())
    with open('X_train_dict.pkl', 'wb') as ff:
        pickle.dump(X_train_dict, ff)
    vectorizer = DictVectorizer(sparse=True)
    X_train = vectorizer.fit_transform(X_train_dict) # can only see training dataset
    joblib.dump(vectorizer, 'vectorizer.joblib')
    print('imported data & built a vectorizer on the training set')

n, d = X_train.shape
print("n = {}, d = {}".format(n, d))

embedding...
imported data & built a vectorizer on the training set
n = 20000, d = 14076


In [17]:
X_train

<20000x14076 sparse matrix of type '<class 'numpy.float64'>'
	with 340000 stored elements in Compressed Sparse Row format>

In [18]:
# train
from sklearn.linear_model import LogisticRegression
print('fit a simple logistic regression with l1 regularization...')
clf = LogisticRegression(max_iter=20000, penalty='l1', solver='liblinear', C=1)
clf.fit(X_train, y_train)
print('...done training')

fit a simple logistic regression with l1 regularization...
...done training


In [20]:
##########################################################
######### testing code ###################################

# transform test data as well
print('loading and transforming test data...')
df_test = pd.read_csv("test.gz", compression='gzip', header='infer')
# df_test.set_index('id', inplace=True)
unused_cols = ['site_id', 'app_id']
df_test.drop(unused_cols, axis=1, inplace=True)

try:
    with open('X_test_dict.pkl', 'rb') as ff:
        X_test_dict = pickle.load('ff')
except:
    X_test_dict = list(df_test.T.to_dict().values())
    with open('X_test_dict.pkl', 'wb') as ff:
        pickle.dump(X_test_dict, ff)

X_test = vectorizer.transform(X_test_dict)

loading and transforming test data...


In [21]:
print('predicting and output to csv...')
ctr_pred = clf.predict_proba(X_test)[:, 1]
# save output: every line is (id, ctr_pred)
all_id = df_test['id']
df_out = pd.DataFrame({'id': all_id, 'ctr': ctr_pred})
df_out.to_csv('Submission.csv', index=False)

print('...done')

predicting and output to csv...
...done
