In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read train.csv and test.csv from the working directory into DataFrames.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Read train_fe_vals.csv; later cells join it onto train/test through its node_id column.
train_fe=pd.read_csv('train_fe_vals.csv')

In [4]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Integer columns are tried against int8, int16, int32 and int64 in that
    order; every other numeric column is tried against float16 and float32 and
    falls back to float64.  Bounds are inclusive, so a column whose extreme
    equals a dtype's limit (e.g. 127 for int8) still gets that dtype.  Columns
    whose dtype is not in the ``numerics`` list below are left untouched.

    Note: float16 carries only about three significant decimal digits, so
    float columns that fit its range lose precision when downcast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.  Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the resulting memory usage and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Pick the first (narrowest) integer type whose range covers the column.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max are NaN): keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df
# Downcast the three loaded frames in order: train, test, then train_fe.
train, test, train_fe = map(reduce_mem_usage, (train, test, train_fe))

Memory usage after optimization is: 606.50 MB
Decreased by 62.5%
Memory usage after optimization is: 134.78 MB
Decreased by 50.0%
Memory usage after optimization is: 76.27 MB
Decreased by 68.7%


In [6]:
# Left-join the train_fe columns for the first node of each pair (node1_id -> node_id).
train = pd.merge(
    train,
    train_fe,
    how='left',
    left_on='node1_id',
    right_on='node_id',
)

In [7]:
# Drop the node_id join key that the preceding merge carried over from train_fe.
del train['node_id']

In [8]:
# Left-join the train_fe columns for the second node of each pair (node2_id -> node_id).
train = pd.merge(
    train,
    train_fe,
    how='left',
    left_on='node2_id',
    right_on='node_id',
)

In [9]:
# Drop the node_id join key that the preceding merge carried over from train_fe.
del train['node_id']

In [10]:
# Preview the first rows of train after the two train_fe joins.
train.head()

Unnamed: 0,node1_id,node2_id,is_chat,full_chat_mean_x,full_chat_sum_x,full_chat_count_x,full_chat_mean_y,full_chat_sum_y,full_chat_count_y
0,8446602,6636127,0,0.0,0.0,0.0,0.0,0.0,0.0
1,1430102,7433949,0,0.0,0.0,0.0,0.0,0.0,0.0
2,2803017,8372333,0,0.043488,1.0,23.0,0.208374,5.0,24.0
3,4529348,894645,0,0.0,0.0,0.0,0.0,0.0,0.0
4,5096572,4211638,0,0.117676,2.0,17.0,0.0,0.0,0.0


In [11]:
# Attach the train_fe columns to test twice, once per node of the pair, dropping
# the right-hand join key (node_id) after each left join.
test = (
    test
    .merge(train_fe, how='left', left_on='node1_id', right_on='node_id')
    .drop(columns=['node_id'])
    .merge(train_fe, how='left', left_on='node2_id', right_on='node_id')
    .drop(columns=['node_id'])
)

In [12]:
# Replace every NaN with 0 (the left joins above leave NaN for nodes without a match).
train=train.fillna(0)
test=test.fillna(0)

In [13]:
# Preview the first rows of test after the train_fe joins and the NaN fill.
test.head()

Unnamed: 0,id,node1_id,node2_id,full_chat_mean_x,full_chat_sum_x,full_chat_count_x,full_chat_mean_y,full_chat_sum_y,full_chat_count_y
0,1,7107094,8010772,0.0,0.0,0.0,0.099976,1.0,10.0
1,2,7995251,2805801,0.0,0.0,0.0,0.098389,6.0,61.0
2,3,2804693,8059549,0.0,0.0,0.0,0.0,0.0,0.0
3,4,4812472,7332370,0.0,0.0,0.0,0.0,0.0,0.0
4,5,5009985,4511909,0.027023,1.0,37.0,0.009903,1.0,101.0


In [14]:
# Read user_features.csv, then downcast its numeric columns to save memory.
user_features = pd.read_csv('user_features.csv')
user_features = reduce_mem_usage(user_features)

Memory usage after optimization is: 133.98 MB
Decreased by 84.8%


In [15]:
# Left-join user_features onto train for the first node of each pair.
train_1 = train.merge(
    user_features,
    how='left',
    left_on='node1_id',
    right_on='node_id',
)

In [16]:
# Remove the right-hand join key left behind by the user_features merge.
train_1 = train_1.drop(columns=['node_id'])

In [17]:
# Left-join user_features onto train_1 for the second node of each pair.
train_2 = train_1.merge(
    user_features,
    how='left',
    left_on='node2_id',
    right_on='node_id',
)

In [18]:
# Drop the join key and both node identifier columns from the training frame.
train_2 = train_2.drop(columns=['node_id', 'node1_id', 'node2_id'])

In [20]:
# Cast both f13 columns and the is_chat column to pandas' category dtype.
for _col in ('f13_x', 'f13_y', 'is_chat'):
    train_2[_col] = train_2[_col].astype('category')

In [21]:
# Keep the is_chat column aside as the training target.
y_train=train_2['is_chat']

In [22]:
# Remove the is_chat column from the training frame.
train_2 = train_2.drop(columns=['is_chat'])

In [23]:
# Keep the id column of test aside.
ids = test['id']

In [24]:
# Left-join user_features onto test for the first node of each pair.
test_1 = test.merge(
    user_features,
    how='left',
    left_on='node1_id',
    right_on='node_id',
)

In [25]:
# Remove the right-hand join key left behind by the user_features merge.
test_1 = test_1.drop(columns=['node_id'])

In [26]:
# Left-join user_features onto test_1 for the second node of each pair.
test_2 = test_1.merge(
    user_features,
    how='left',
    left_on='node2_id',
    right_on='node_id',
)

In [27]:
# Drop both node identifier columns from the test frame.
test_2 = test_2.drop(columns=['node1_id', 'node2_id'])

In [28]:
# Drop the row id and the leftover join key from the test frame.
test_2 = test_2.drop(columns=['id', 'node_id'])

In [29]:
# Unbind the intermediate test frames so their memory can be reclaimed; test_2 holds the merged result.
del test,test_1

In [30]:
# Unbind the intermediate training frames and both lookup tables so their memory can be reclaimed.
del train,train_1,user_features,train_fe

In [31]:
# Cast both f13 columns of the test frame to pandas' category dtype.
for _col in ('f13_x', 'f13_y'):
    test_2[_col] = test_2[_col].astype('category')

In [32]:
import time
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split

In [33]:
# Hold out 20% of the rows for validation, stratified on the target.
# random_state is fixed so the split (and the validation AUC computed on it)
# is the same on every run instead of changing with each kernel restart.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    train_2,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42,
)

In [34]:
# Wrap the training and validation splits as LightGBM datasets, both created
# with free_raw_data=False.
train_data, valid_data = (
    lgb.Dataset(features, label=target, free_raw_data=False)
    for features, target in ((X_train, Y_train), (X_valid, Y_valid))
)

In [35]:
# Run a full garbage-collection pass; the value displayed below the cell is gc.collect()'s return value.
import gc
gc.collect()

0

In [36]:
# Unbind the raw training split.
# NOTE(review): train_data was created with free_raw_data=False, so it may still
# reference this data — confirm that this del actually releases memory.
del X_train,Y_train

In [41]:
# callback
def save_model(folder='./final_folder', every=100):
    """Build a LightGBM training callback that periodically saves the model.

    Whenever ``env.iteration`` is a multiple of ``every`` the callback writes
    the current model to ``<folder>/model_<iteration>.txt``.  The folder is
    created on demand, so training does not depend on it already existing.

    Parameters
    ----------
    folder : str, default './final_folder'
        Directory that receives the checkpoint files.
    every : int, default 100
        Save interval, in boosting iterations.  Must be positive.

    Returns
    -------
    callable
        A callback taking the LightGBM callback environment ``env``; it reads
        ``env.model`` and ``env.iteration``.

    Raises
    ------
    ValueError
        If ``every`` is not a positive number.
    """
    import os  # local import so this cell does not rely on another cell's imports

    if every <= 0:
        raise ValueError('every must be a positive integer, got {!r}'.format(every))

    def callback(env):
        if env.iteration % every != 0:
            return
        # Create the target directory lazily, right before the first write.
        os.makedirs(folder, exist_ok=True)
        checkpoint = os.path.join(folder, 'model_' + str(env.iteration) + '.txt')
        env.model.save_model(checkpoint)

    # Attributes LightGBM reads from a callback: before_iteration=False schedules
    # it after each boosting iteration, and order is its sort key among callbacks.
    callback.before_iteration = False
    callback.order = 0
    return callback

In [39]:
# LightGBM parameters passed to lgb.train: gbdt boosting, binary objective,
# AUC metric, GPU device 0 on platform 0.
# Commented-out alternatives kept for reference: bagging_freq=5,
# bagging_fraction=0.9, boost_from_average='false', feature_fraction=0.05,
# min_data_in_leaf=80, min_sum_hessian_in_leaf=10.0, num_leaves=13.
params_tuned = dict(
    boost='gbdt',
    learning_rate=0.15,
    max_depth=-1,
    metric='auc',
    num_threads=-1,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
    num_leaves=100,
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
)

In [42]:
# Train for up to 10000 rounds, evaluating on valid_data: log every 100 rounds,
# stop after 300 rounds without improvement, and run the save_model() callback.
lgb_model = lgb.train(
    params_tuned,
    train_data,
    num_boost_round=10000,
    valid_sets=[valid_data],
    early_stopping_rounds=300,
    verbose_eval=100,
    callbacks=[save_model()],
)

Training until validation scores don't improve for 300 rounds.
[100]	valid_0's auc: 0.972164
[200]	valid_0's auc: 0.972776
[300]	valid_0's auc: 0.972775
[400]	valid_0's auc: 0.972853
[500]	valid_0's auc: 0.972988
[600]	valid_0's auc: 0.973103
[700]	valid_0's auc: 0.973163
[800]	valid_0's auc: 0.973282
[900]	valid_0's auc: 0.97326
[1000]	valid_0's auc: 0.973252
Early stopping, best iteration is:
[792]	valid_0's auc: 0.973291
