## **Import Modules**

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder


## **Data Processing**

* Since the dataset is quite large (over 4*10^7 rows), a random one-tenth (or one-fifth) of the rows is sampled for analysis.

* The test set is concatenated with the training set for data processing.

In [2]:
# Randomly keep ~10% of the training rows while reading (row 0 is the header
# and is always kept). A seeded generator is used so that the sample -- and
# therefore every downstream result -- is reproducible on Restart & Run All.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 42
sample_rng = np.random.default_rng(SAMPLE_SEED)

df = pd.read_csv(
    "./train/train.csv",
    skiprows=lambda row: row > 0 and sample_rng.random() > SAMPLE_FRACTION,
)

# Remember where the training rows end so the two sets can be split apart again.
len_train = len(df)

# Append the test rows so both sets go through identical preprocessing.
df = pd.concat([df, pd.read_csv("./test/test.csv")])

# Replace every missing value with 0 (this includes any column that exists in
# only one of the two files and was therefore NaN-padded by concat).
df = df.fillna(0)

* The training file only covers ten days of click history, so sequence analysis could be difficult.

* However, the weekday and hour may be meaningful for analysis.

In [3]:
# Parse the YYMMDDHH timestamp. Parsing errors are allowed to raise (the
# default): the previous errors='ignore' is deprecated in recent pandas and
# silently returned the unparsed column, which made the `.dt` accessors below
# fail with an unrelated-looking error.
df['hour'] = pd.to_datetime(df['hour'], format='%y%m%d%H')

# Derive the two calendar features used by the model.
df['weekday'] = df['hour'].dt.weekday  # Monday=0 ... Sunday=6
df['hr_24'] = df['hour'].dt.hour       # hour of day, 0-23

* According to the discussion board of the competition, the data provider admitted that "ID" is useless.

* "hour" can be dropped after data parsing.

* "C15" is almost identical to "C16". Drop it.

* "C1" and "device_type" are omitted because they are insignificant in model training. (Feature Importance) 

In [4]:
# Remove the row identifier, the raw timestamp (already expanded into
# weekday / hr_24) and the columns judged redundant or unimportant.
columns_to_drop = ['id', 'hour', 'C15', 'C1', 'device_type']
df = df.drop(columns=columns_to_drop)

### Split The Training and Test Set

In [5]:
# Undo the earlier concatenation by position: the first len_train rows came
# from the training file, everything after them from the test file.
x_train, x_test = df.iloc[:len_train], df.iloc[len_train:]

In [6]:
# Pull the label column out of both frames, then keep only the features.
# NOTE(review): y_test presumably holds only the 0 placeholders written by the
# earlier fillna (the test file is not expected to carry labels) -- confirm.
target_column = 'click'
y_train, y_test = x_train[target_column], x_test[target_column]

x_train = x_train.drop(columns=[target_column])
x_test = x_test.drop(columns=[target_column])

In [7]:
# Inspect the column dtypes and memory footprint of the training features.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4042904 entries, 0 to 4042903
Data columns (total 20 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   banner_pos        int64 
 1   site_id           object
 2   site_domain       object
 3   site_category     object
 4   app_id            object
 5   app_domain        object
 6   app_category      object
 7   device_id         object
 8   device_ip         object
 9   device_model      object
 10  device_conn_type  int64 
 11  C14               int64 
 12  C16               int64 
 13  C17               int64 
 14  C18               int64 
 15  C19               int64 
 16  C20               int64 
 17  C21               int64 
 18  weekday           int64 
 19  hr_24             int64 
dtypes: int64(11), object(9)
memory usage: 647.7+ MB


### Create a Validation Set from the Training Set

In [8]:


# Hold out 20% of the sampled training rows for validation; the fixed
# random_state keeps the split identical between runs.
x_train_new, x_val, y_train_new, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=0,
)

# x_test = x_test.drop(['click'], axis=1)

## **Grid Search**

In [9]:
# Hyper-parameter grid search, currently disabled (the whole cell is commented out).
# NOTE(review): unlike the model.fit call in this notebook, this grid_search call
# passes no cat_features for the object-typed columns -- confirm before re-enabling.
# grid = {'iterations': [50, 100, 150],
#         'learning_rate': [0.1, 0.15, 0.2],
#         'depth': [4, 6, 8],
#         'l2_leaf_reg': [0.2, 0.5, 1]}

# model = CatBoostClassifier(random_state=42,
#                          loss_function='Logloss',
#                          eval_metric='Logloss')

# model.grid_search(grid, x_train, y_train)

## **Model Training**

In [10]:
# Every column is handed to CatBoost as a categorical feature, so the
# categorical indices are simply all column positions.
n_features = len(x_train_new.columns)
cat_features = range(n_features)

# Small, aggressive configuration: few trees with a large learning rate,
# trained on the GPU with log-loss as the objective.
model_params = dict(
    iterations=50,
    learning_rate=0.5,
    task_type='GPU',
    loss_function='Logloss',
    # depth=6 (left disabled; CatBoost's default depth is used)
)
model = CatBoostClassifier(**model_params)

In [11]:
# Train on the training split, evaluate on the validation split, and log
# progress every 10 iterations.
fit_options = dict(
    eval_set=(x_val, y_val),
    cat_features=cat_features,
    verbose=10,
)
model.fit(x_train_new, y_train_new, **fit_options)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


0:	learn: 0.4624009	test: 0.4623564	best: 0.4623564 (0)	total: 60.3ms	remaining: 2.95s
10:	learn: 0.3998130	test: 0.3981294	best: 0.3981294 (10)	total: 656ms	remaining: 2.32s
20:	learn: 0.3973741	test: 0.3959350	best: 0.3959350 (20)	total: 1.33s	remaining: 1.84s
30:	learn: 0.3962113	test: 0.3948502	best: 0.3948502 (30)	total: 1.93s	remaining: 1.18s
40:	learn: 0.3952616	test: 0.3939404	best: 0.3939404 (40)	total: 2.53s	remaining: 555ms
49:	learn: 0.3948362	test: 0.3935644	best: 0.3935644 (49)	total: 3.06s	remaining: 0us
bestTest = 0.3935643893
bestIteration = 49


<catboost.core.CatBoostClassifier at 0x1407e3d9d00>

## **Feature Importance**

Features with low importance are candidates for removal.

In [12]:
# Show the per-feature importances as a table (prettified=True) to help decide
# which features can be omitted.
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,app_id,27.215596
1,site_id,24.248651
2,C17,8.326144
3,device_id,6.571508
4,site_domain,5.333904
5,C14,5.08466
6,device_ip,5.033141
7,device_model,4.352468
8,C21,3.928434
9,app_domain,1.729541


In [13]:
# Predict class probabilities for the test rows using the trees up to and
# including the best validation iteration.
#
# CatBoost applies the trees in the half-open range [ntree_start, ntree_end),
# while get_best_iteration() returns a 0-based index. Passing the index
# directly therefore dropped the best tree itself (and a best iteration of 0
# would have meant "use all trees"), so the exclusive bound must be index + 1.
# get_best_iteration() is None when no validation set was used; fall back to
# ntree_end=0, which tells CatBoost to use the whole model.
best_iteration = model.get_best_iteration()
y_test_pred = model.predict(
    x_test,
    prediction_type='Probability',
    ntree_start=0,
    ntree_end=0 if best_iteration is None else best_iteration + 1,
    thread_count=-1,
    verbose=None,
)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


In [14]:
# Fill the prediction column (second column) of the sample submission with the
# predicted probability of class 1 and write it out without the index.
submission = pd.read_csv('./sampleSubmission/sampleSubmission.csv')
prediction_column = submission.columns[1]
submission[prediction_column] = y_test_pred[:, 1]
submission.to_csv('submission.csv', index=False)

## **Results on the Validation Set**

In [15]:
# Predict class probabilities for the validation rows using the trees up to
# and including the best validation iteration.
#
# CatBoost applies the trees in the half-open range [ntree_start, ntree_end),
# while get_best_iteration() returns a 0-based index. Passing the index
# directly therefore dropped the best tree itself (and a best iteration of 0
# would have meant "use all trees"), so the exclusive bound must be index + 1.
# get_best_iteration() is None when no validation set was used; fall back to
# ntree_end=0, which tells CatBoost to use the whole model.
best_iteration = model.get_best_iteration()
y_val_pred = model.predict(
    x_val,
    prediction_type='Probability',
    ntree_start=0,
    ntree_end=0 if best_iteration is None else best_iteration + 1,
    thread_count=-1,
    verbose=None,
)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


In [16]:
# Turn the two-column probability matrix into hard labels: predict class 1
# only when its probability is strictly greater than that of class 0.
prob_class_0 = y_val_pred[:, 0]
prob_class_1 = y_val_pred[:, 1]
result = np.greater(prob_class_1, prob_class_0).astype(int)

In [17]:
# Per-class precision / recall / F1 of the hard predictions on the validation split.
print(classification_report(y_val, result))

              precision    recall  f1-score   support

         0.0       0.84      0.99      0.91    671216
         1.0       0.61      0.08      0.14    137365

    accuracy                           0.84    808581
   macro avg       0.73      0.54      0.53    808581
weighted avg       0.80      0.84      0.78    808581

