In [1]:
import os, time
start = time.time()
very_start = time.time()

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [3]:
import dask
from dask.distributed import Client, wait, LocalCluster
import dask.dataframe as dd

In [4]:

client = Client(n_workers=12, 
                       threads_per_worker=8,
                       memory_limit='480GB')
dask.config.set(shuffle='disk')
dask.config.set({'temporary_directory': '/mnt/DP_disk1/dask_tmp'})

<dask.config.set at 0x7f50e16652d0>

# Load Train

In [5]:
train = dd.read_parquet(f'/path/to/train_with_features/*.parquet')
valid = dd.read_parquet(f'/path/to/validate_with_features/*.parquet')

In [6]:
valid.columns

Index(['mentioned_bucket_id', 'engaging_user_id', 'mentioned_count',
       'has_mention', 'second_used_word_bucket_id', 'most_used_word_bucket_id',
       'mention', 'engaged_with_user_id', 'language', 'has_rt',
       ...
       'TE_has_mention_engaging_user_id_retweet_with_comment_timestamp',
       'TE_has_mention_engaging_user_id_like_timestamp',
       'TE_mentioned_count_engaging_user_id_reply_timestamp',
       'TE_mentioned_count_engaging_user_id_retweet_timestamp',
       'TE_mentioned_count_engaging_user_id_retweet_with_comment_timestamp',
       'TE_mentioned_count_engaging_user_id_like_timestamp',
       'TE_mentioned_bucket_id_engaging_user_id_reply_timestamp',
       'TE_mentioned_bucket_id_engaging_user_id_retweet_timestamp',
       'TE_mentioned_bucket_id_engaging_user_id_retweet_with_comment_timestamp',
       'TE_mentioned_bucket_id_engaging_user_id_like_timestamp'],
      dtype='object', length=179)

In [7]:
train['a_ff_rate'] = (train['engaged_with_user_following_count'] / train['engaged_with_user_follower_count']).astype('float32')
train['b_ff_rate'] = (train['engaging_user_follower_count']  / train['engaging_user_following_count']).astype('float32')

In [8]:
list(valid.columns)

['mentioned_bucket_id',
 'engaging_user_id',
 'mentioned_count',
 'has_mention',
 'second_used_word_bucket_id',
 'most_used_word_bucket_id',
 'mention',
 'engaged_with_user_id',
 'language',
 'has_rt',
 'tweet_type',
 'tweet_hour',
 'tweet_dow',
 'tw_last_word',
 'hashtags',
 'present_media',
 'present_links',
 'present_domains',
 'engaged_with_user_is_verified',
 'engaging_user_is_verified',
 'engagee_follows_engager',
 'tw_first_word',
 'tweet_id',
 'tweet_timestamp',
 'engaged_with_user_follower_count',
 'engaged_with_user_following_count',
 'engaged_with_user_account_creation',
 'engaging_user_follower_count',
 'engaging_user_following_count',
 'engaging_user_account_creation',
 'reply_timestamp',
 'retweet_timestamp',
 'retweet_with_comment_timestamp',
 'like_timestamp',
 'tweet',
 'TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_reply_timestamp',
 'TE_present_domains_language_engagee_follows_engager_tweet_type_present_med

In [9]:
label_names = ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']

In [10]:
cols =['has_photo',
'has_video',
'has_gif',
'a_ff_rate',
'b_ff_rate',
'dt_dow',
'dt_hour',
'reply_timestamp',
'retweet_timestamp',
'retweet_with_comment_timestamp',
'like_timestamp',
'engaged_with_user_follower_count',
'engaged_with_user_following_count',
'engaging_user_follower_count',
'engaging_user_following_count',
'engaged_with_user_is_verified',
'engaging_user_is_verified',
'GTE_language_reply_timestamp_indicator_engaged_with_user_id',
'GTE_language_retweet_timestamp_indicator_engaged_with_user_id',
'GTE_language_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
'GTE_language_like_timestamp_indicator_engaged_with_user_id',
'GTE_language_reply_timestamp_indicator_engaging_user_id',
'GTE_language_retweet_timestamp_indicator_engaging_user_id',
'GTE_language_retweet_with_comment_timestamp_indicator_engaging_user_id',
'GTE_language_like_timestamp_indicator_engaging_user_id',
'GTE_tweet_dow_reply_timestamp_indicator_engaged_with_user_id',
'GTE_tweet_dow_retweet_timestamp_indicator_engaged_with_user_id',
'GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
'GTE_tweet_dow_like_timestamp_indicator_engaged_with_user_id',
'GTE_tweet_dow_reply_timestamp_indicator_engaging_user_id',
'GTE_tweet_dow_retweet_timestamp_indicator_engaging_user_id',
'GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaging_user_id',
'GTE_tweet_dow_like_timestamp_indicator_engaging_user_id',
'GTE_tweet_hour_reply_timestamp_indicator_engaged_with_user_id',
'GTE_tweet_hour_retweet_timestamp_indicator_engaged_with_user_id',
'GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
'GTE_tweet_hour_like_timestamp_indicator_engaged_with_user_id',
'GTE_tweet_hour_reply_timestamp_indicator_engaging_user_id',
'GTE_tweet_hour_retweet_timestamp_indicator_engaging_user_id',
'GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaging_user_id',
'GTE_tweet_hour_like_timestamp_indicator_engaging_user_id',
'GTE_tweet_type_reply_timestamp_indicator_engaged_with_user_id',
'GTE_tweet_type_retweet_timestamp_indicator_engaged_with_user_id',
'GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
'GTE_tweet_type_like_timestamp_indicator_engaged_with_user_id',
'GTE_tweet_type_reply_timestamp_indicator_engaging_user_id',
'GTE_tweet_type_retweet_timestamp_indicator_engaging_user_id',
'GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaging_user_id',
'GTE_tweet_type_like_timestamp_indicator_engaging_user_id',
'TE_engaged_with_user_id_reply_timestamp_indicator',
'TE_engaged_with_user_id_retweet_timestamp_indicator',
'TE_engaged_with_user_id_retweet_with_comment_timestamp_indicator',
'TE_engaged_with_user_id_like_timestamp_indicator',
'TE_language_reply_timestamp_indicator',
'TE_language_retweet_timestamp_indicator',
'TE_language_retweet_with_comment_timestamp_indicator',
'TE_language_like_timestamp_indicator',
'TE_tweet_dow_retweet_timestamp_indicator',
'TE_tweet_type_reply_timestamp_indicator',
'TE_tweet_type_retweet_timestamp_indicator',
'mentioned_bucket_id',
'mentioned_count',
'second_used_word_bucket_id',
'most_used_word_bucket_id',
'has_mention',
'TE_most_used_word_bucket_id_reply_timestamp',
'TE_most_used_word_bucket_id_retweet_timestamp',
'TE_most_used_word_bucket_id_retweet_with_comment_timestamp',
'TE_most_used_word_bucket_id_like_timestamp',
'TE_second_used_word_bucket_id_reply_timestamp',
'TE_second_used_word_bucket_id_retweet_timestamp',
'TE_second_used_word_bucket_id_retweet_with_comment_timestamp',
'TE_second_used_word_bucket_id_like_timestamp',
'TE_mentioned_count_reply_timestamp',
'TE_mentioned_count_retweet_timestamp',
'TE_mentioned_count_retweet_with_comment_timestamp',
'TE_mentioned_count_like_timestamp',
'TE_mentioned_bucket_id_reply_timestamp',
'TE_mentioned_bucket_id_retweet_timestamp',
'TE_mentioned_bucket_id_retweet_with_comment_timestamp',
'TE_mentioned_bucket_id_like_timestamp',
'TE_has_mention_engaging_user_id_reply_timestamp',
'TE_has_mention_engaging_user_id_retweet_timestamp',
'TE_has_mention_engaging_user_id_retweet_with_comment_timestamp',
'TE_has_mention_engaging_user_id_like_timestamp',
'TE_mentioned_count_engaging_user_id_reply_timestamp',
'TE_mentioned_count_engaging_user_id_retweet_timestamp',
'TE_mentioned_count_engaging_user_id_retweet_with_comment_timestamp',
'TE_mentioned_count_engaging_user_id_like_timestamp',
'TE_mentioned_bucket_id_engaging_user_id_reply_timestamp',
'TE_mentioned_bucket_id_engaging_user_id_retweet_timestamp',
'TE_mentioned_bucket_id_engaging_user_id_retweet_with_comment_timestamp',
'TE_mentioned_bucket_id_engaging_user_id_like_timestamp'
]


In [11]:
len(cols)

93

In [12]:
list(cols)

['has_photo',
 'has_video',
 'has_gif',
 'a_ff_rate',
 'b_ff_rate',
 'dt_dow',
 'dt_hour',
 'reply_timestamp',
 'retweet_timestamp',
 'retweet_with_comment_timestamp',
 'like_timestamp',
 'engaged_with_user_follower_count',
 'engaged_with_user_following_count',
 'engaging_user_follower_count',
 'engaging_user_following_count',
 'engaged_with_user_is_verified',
 'engaging_user_is_verified',
 'GTE_language_reply_timestamp_indicator_engaged_with_user_id',
 'GTE_language_retweet_timestamp_indicator_engaged_with_user_id',
 'GTE_language_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
 'GTE_language_like_timestamp_indicator_engaged_with_user_id',
 'GTE_language_reply_timestamp_indicator_engaging_user_id',
 'GTE_language_retweet_timestamp_indicator_engaging_user_id',
 'GTE_language_retweet_with_comment_timestamp_indicator_engaging_user_id',
 'GTE_language_like_timestamp_indicator_engaging_user_id',
 'GTE_tweet_dow_reply_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_do

In [14]:
feature_list = []

In [15]:

reply =['mentioned_bucket_id',                                                             
'mentioned_count',                                                                   
'has_mention',                                                                      
'second_used_word_bucket_id',                                                      
'most_used_word_bucket_id',                                                        
'engaged_with_user_follower_count',                                                  
'engaged_with_user_following_count',                                                 
'engaged_with_user_is_verified',                                                      
'engaging_user_follower_count',                                                      
'engaging_user_following_count',                                                     
'engaging_user_is_verified',                                                                                                                         
'dt_dow',                                                                            
'dt_hour',                                                                           
'TE_language_reply_timestamp_indicator',                                           
'TE_language_retweet_timestamp_indicator',                                         
'TE_language_retweet_with_comment_timestamp_indicator',                            
'TE_language_like_timestamp_indicator',                                            
'TE_tweet_dow_retweet_timestamp_indicator',                                        
'TE_tweet_type_reply_timestamp_indicator',                                         
'TE_tweet_type_retweet_timestamp_indicator',                                       
'TE_engaged_with_user_id_reply_timestamp_indicator',                               
'TE_engaged_with_user_id_retweet_timestamp_indicator',                             
'TE_engaged_with_user_id_retweet_with_comment_timestamp_indicator',                
'TE_engaged_with_user_id_like_timestamp_indicator',                                
'GTE_language_reply_timestamp_indicator_engaging_user_id',                         
'GTE_language_retweet_timestamp_indicator_engaging_user_id',                       
'GTE_language_retweet_with_comment_timestamp_indicator_engaging_user_id',          
'GTE_language_like_timestamp_indicator_engaging_user_id',                          
'GTE_tweet_dow_reply_timestamp_indicator_engaging_user_id',                        
'GTE_tweet_dow_retweet_timestamp_indicator_engaging_user_id',                      
'GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaging_user_id',         
'GTE_tweet_dow_like_timestamp_indicator_engaging_user_id',                         
'GTE_tweet_hour_reply_timestamp_indicator_engaging_user_id',                       
'GTE_tweet_hour_retweet_timestamp_indicator_engaging_user_id',                     
'GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaging_user_id',        
'GTE_tweet_hour_like_timestamp_indicator_engaging_user_id',                        
'GTE_tweet_dow_reply_timestamp_indicator_engaged_with_user_id',                    
'GTE_tweet_dow_retweet_timestamp_indicator_engaged_with_user_id',                  
'GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaged_with_user_id',     
'GTE_tweet_dow_like_timestamp_indicator_engaged_with_user_id',                     
'GTE_tweet_hour_reply_timestamp_indicator_engaged_with_user_id',                   
'GTE_tweet_hour_retweet_timestamp_indicator_engaged_with_user_id',                 
'GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaged_with_user_id',    
'GTE_tweet_hour_like_timestamp_indicator_engaged_with_user_id',                    
'GTE_tweet_type_reply_timestamp_indicator_engaged_with_user_id',                   
'GTE_tweet_type_retweet_timestamp_indicator_engaged_with_user_id',                 
'GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaged_with_user_id',    
'GTE_tweet_type_like_timestamp_indicator_engaged_with_user_id',                    
'GTE_tweet_type_reply_timestamp_indicator_engaging_user_id',                       
'GTE_tweet_type_retweet_timestamp_indicator_engaging_user_id',                     
'GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaging_user_id',        
'GTE_tweet_type_like_timestamp_indicator_engaging_user_id',                        
'GTE_language_reply_timestamp_indicator_engaged_with_user_id',                     
'GTE_language_retweet_timestamp_indicator_engaged_with_user_id',                   
'GTE_language_retweet_with_comment_timestamp_indicator_engaged_with_user_id',      
'GTE_language_like_timestamp_indicator_engaged_with_user_id',                      
'has_photo',                                                                         
'has_video',                                                                         
'has_gif',                                                                           
'TE_most_used_word_bucket_id_reply_timestamp',                                     
'TE_most_used_word_bucket_id_retweet_timestamp',                                   
'TE_most_used_word_bucket_id_retweet_with_comment_timestamp',                      
'TE_most_used_word_bucket_id_like_timestamp',                                      
'TE_second_used_word_bucket_id_reply_timestamp',                                   
'TE_second_used_word_bucket_id_retweet_timestamp',                                 
'TE_second_used_word_bucket_id_retweet_with_comment_timestamp',                    
'TE_second_used_word_bucket_id_like_timestamp',                                    
'TE_mentioned_count_reply_timestamp',                                              
'TE_mentioned_count_retweet_timestamp',                                            
'TE_mentioned_count_retweet_with_comment_timestamp',                               
'TE_mentioned_count_like_timestamp',                                               
'TE_mentioned_bucket_id_reply_timestamp',                                          
'TE_mentioned_bucket_id_retweet_timestamp',                                        
'TE_mentioned_bucket_id_retweet_with_comment_timestamp',                           
'TE_mentioned_bucket_id_like_timestamp',                                           
'TE_has_mention_engaging_user_id_reply_timestamp',                                 
'TE_has_mention_engaging_user_id_retweet_timestamp',                               
'TE_has_mention_engaging_user_id_retweet_with_comment_timestamp',                  
'TE_has_mention_engaging_user_id_like_timestamp',                                  
'TE_mentioned_count_engaging_user_id_reply_timestamp',                             
'TE_mentioned_count_engaging_user_id_retweet_timestamp',                           
'TE_mentioned_count_engaging_user_id_retweet_with_comment_timestamp',              
'TE_mentioned_count_engaging_user_id_like_timestamp',                              
'TE_mentioned_bucket_id_engaging_user_id_reply_timestamp',                         
'TE_mentioned_bucket_id_engaging_user_id_retweet_timestamp',                       
'TE_mentioned_bucket_id_engaging_user_id_retweet_with_comment_timestamp',          
'TE_mentioned_bucket_id_engaging_user_id_like_timestamp',                          
'a_ff_rate',                                                                       
'b_ff_rate']                                                                       
feature_list.append(reply)

In [16]:
retweet =['second_used_word_bucket_id',
 'most_used_word_bucket_id',
 'TE_language_reply_timestamp_indicator',
 'TE_language_retweet_timestamp_indicator',
 'TE_language_retweet_with_comment_timestamp_indicator',
 'TE_language_like_timestamp_indicator',
 'TE_tweet_dow_retweet_timestamp_indicator',
 'TE_tweet_type_reply_timestamp_indicator',
 'TE_tweet_type_retweet_timestamp_indicator',
 'TE_engaged_with_user_id_reply_timestamp_indicator',
 'TE_engaged_with_user_id_retweet_timestamp_indicator',
 'TE_engaged_with_user_id_retweet_with_comment_timestamp_indicator',
 'TE_engaged_with_user_id_like_timestamp_indicator',
 'GTE_language_reply_timestamp_indicator_engaging_user_id',
 'GTE_language_retweet_timestamp_indicator_engaging_user_id',
 'GTE_language_retweet_with_comment_timestamp_indicator_engaging_user_id',
 'GTE_language_like_timestamp_indicator_engaging_user_id',
 'GTE_tweet_dow_reply_timestamp_indicator_engaging_user_id',
 'GTE_tweet_dow_retweet_timestamp_indicator_engaging_user_id',
 'GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaging_user_id',
 'GTE_tweet_dow_like_timestamp_indicator_engaging_user_id',
 'GTE_tweet_hour_reply_timestamp_indicator_engaging_user_id',
 'GTE_tweet_hour_retweet_timestamp_indicator_engaging_user_id',
 'GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaging_user_id',
 'GTE_tweet_hour_like_timestamp_indicator_engaging_user_id',
 'GTE_tweet_dow_reply_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_dow_retweet_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_dow_like_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_hour_reply_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_hour_retweet_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_hour_like_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_type_reply_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_type_retweet_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_type_like_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_type_reply_timestamp_indicator_engaging_user_id',
 'GTE_tweet_type_retweet_timestamp_indicator_engaging_user_id',
 'GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaging_user_id',
 'GTE_tweet_type_like_timestamp_indicator_engaging_user_id',
 'GTE_language_reply_timestamp_indicator_engaged_with_user_id',
 'GTE_language_retweet_timestamp_indicator_engaged_with_user_id',
 'GTE_language_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
 'GTE_language_like_timestamp_indicator_engaged_with_user_id',
 'has_photo',
 'has_video',
 'has_gif',
 'TE_most_used_word_bucket_id_reply_timestamp',
 'TE_most_used_word_bucket_id_retweet_timestamp',
 'TE_most_used_word_bucket_id_retweet_with_comment_timestamp',
 'TE_most_used_word_bucket_id_like_timestamp',
 'TE_second_used_word_bucket_id_reply_timestamp',
 'TE_second_used_word_bucket_id_retweet_timestamp',
 'TE_second_used_word_bucket_id_retweet_with_comment_timestamp',
 'TE_second_used_word_bucket_id_like_timestamp']
feature_list.append(retweet)

In [17]:
retweet_with_comment=['mentioned_bucket_id',                                                             
'mentioned_count',                                                                   
'has_mention',                                                                                                                                        
'TE_language_reply_timestamp_indicator',                                           
'TE_language_retweet_timestamp_indicator',                                         
'TE_language_retweet_with_comment_timestamp_indicator',                            
'TE_language_like_timestamp_indicator',                                            
'TE_tweet_dow_retweet_timestamp_indicator',                                        
'TE_tweet_type_reply_timestamp_indicator',                                         
'TE_tweet_type_retweet_timestamp_indicator',                                       
'TE_engaged_with_user_id_reply_timestamp_indicator',                               
'TE_engaged_with_user_id_retweet_timestamp_indicator',                             
'TE_engaged_with_user_id_retweet_with_comment_timestamp_indicator',                
'TE_engaged_with_user_id_like_timestamp_indicator',                                
'GTE_language_reply_timestamp_indicator_engaging_user_id',                         
'GTE_language_retweet_timestamp_indicator_engaging_user_id',                       
'GTE_language_retweet_with_comment_timestamp_indicator_engaging_user_id',          
'GTE_language_like_timestamp_indicator_engaging_user_id',                          
'GTE_tweet_dow_reply_timestamp_indicator_engaging_user_id',                        
'GTE_tweet_dow_retweet_timestamp_indicator_engaging_user_id',                      
'GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaging_user_id',         
'GTE_tweet_dow_like_timestamp_indicator_engaging_user_id',                         
'GTE_tweet_hour_reply_timestamp_indicator_engaging_user_id',                       
'GTE_tweet_hour_retweet_timestamp_indicator_engaging_user_id',                     
'GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaging_user_id',        
'GTE_tweet_hour_like_timestamp_indicator_engaging_user_id',                        
'GTE_tweet_dow_reply_timestamp_indicator_engaged_with_user_id',                    
'GTE_tweet_dow_retweet_timestamp_indicator_engaged_with_user_id',                  
'GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaged_with_user_id',     
'GTE_tweet_dow_like_timestamp_indicator_engaged_with_user_id',                     
'GTE_tweet_hour_reply_timestamp_indicator_engaged_with_user_id',                   
'GTE_tweet_hour_retweet_timestamp_indicator_engaged_with_user_id',                 
'GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaged_with_user_id',    
'GTE_tweet_hour_like_timestamp_indicator_engaged_with_user_id',                    
'GTE_tweet_type_reply_timestamp_indicator_engaged_with_user_id',                   
'GTE_tweet_type_retweet_timestamp_indicator_engaged_with_user_id',                 
'GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaged_with_user_id',    
'GTE_tweet_type_like_timestamp_indicator_engaged_with_user_id',                    
'GTE_tweet_type_reply_timestamp_indicator_engaging_user_id',                       
'GTE_tweet_type_retweet_timestamp_indicator_engaging_user_id',                     
'GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaging_user_id',        
'GTE_tweet_type_like_timestamp_indicator_engaging_user_id',                        
'GTE_language_reply_timestamp_indicator_engaged_with_user_id',                     
'GTE_language_retweet_timestamp_indicator_engaged_with_user_id',                   
'GTE_language_retweet_with_comment_timestamp_indicator_engaged_with_user_id',      
'GTE_language_like_timestamp_indicator_engaged_with_user_id',                      
'has_photo',                                                                         
'has_video',                                                                         
'has_gif',                                                                           
'TE_mentioned_count_reply_timestamp',                                              
'TE_mentioned_count_retweet_timestamp',                                            
'TE_mentioned_count_retweet_with_comment_timestamp',                               
'TE_mentioned_count_like_timestamp',                                               
'TE_mentioned_bucket_id_reply_timestamp',                                          
'TE_mentioned_bucket_id_retweet_timestamp',                                        
'TE_mentioned_bucket_id_retweet_with_comment_timestamp',                           
'TE_mentioned_bucket_id_like_timestamp',                                           
'TE_has_mention_engaging_user_id_reply_timestamp',                                 
'TE_has_mention_engaging_user_id_retweet_timestamp',                               
'TE_has_mention_engaging_user_id_retweet_with_comment_timestamp',                  
'TE_has_mention_engaging_user_id_like_timestamp',                                  
'TE_mentioned_count_engaging_user_id_reply_timestamp',                             
'TE_mentioned_count_engaging_user_id_retweet_timestamp',                           
'TE_mentioned_count_engaging_user_id_retweet_with_comment_timestamp',              
'TE_mentioned_count_engaging_user_id_like_timestamp',                              
'TE_mentioned_bucket_id_engaging_user_id_reply_timestamp',                         
'TE_mentioned_bucket_id_engaging_user_id_retweet_timestamp',                       
'TE_mentioned_bucket_id_engaging_user_id_retweet_with_comment_timestamp',          
'TE_mentioned_bucket_id_engaging_user_id_like_timestamp']                          
feature_list.append(retweet_with_comment)

In [18]:
like = ['second_used_word_bucket_id',
 'most_used_word_bucket_id',
 'engaged_with_user_follower_count',
 'engaged_with_user_following_count',
 'engaged_with_user_is_verified',
 'engaging_user_follower_count',
 'engaging_user_following_count',
 'engaging_user_is_verified',
 'dt_dow',
 'dt_hour',
 'TE_language_reply_timestamp_indicator',
 'TE_language_retweet_timestamp_indicator',
 'TE_language_retweet_with_comment_timestamp_indicator',
 'TE_language_like_timestamp_indicator',
 'TE_tweet_dow_retweet_timestamp_indicator',
 'TE_tweet_type_reply_timestamp_indicator',
 'TE_tweet_type_retweet_timestamp_indicator',
 'TE_engaged_with_user_id_reply_timestamp_indicator',
 'TE_engaged_with_user_id_retweet_timestamp_indicator',
 'TE_engaged_with_user_id_retweet_with_comment_timestamp_indicator',
 'TE_engaged_with_user_id_like_timestamp_indicator',
 'GTE_language_reply_timestamp_indicator_engaging_user_id',
 'GTE_language_retweet_timestamp_indicator_engaging_user_id',
 'GTE_language_retweet_with_comment_timestamp_indicator_engaging_user_id',
 'GTE_language_like_timestamp_indicator_engaging_user_id',
 'GTE_tweet_dow_reply_timestamp_indicator_engaging_user_id',
 'GTE_tweet_dow_retweet_timestamp_indicator_engaging_user_id',
 'GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaging_user_id',
 'GTE_tweet_dow_like_timestamp_indicator_engaging_user_id',
 'GTE_tweet_hour_reply_timestamp_indicator_engaging_user_id',
 'GTE_tweet_hour_retweet_timestamp_indicator_engaging_user_id',
 'GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaging_user_id',
 'GTE_tweet_hour_like_timestamp_indicator_engaging_user_id',
 'GTE_tweet_dow_reply_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_dow_retweet_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_dow_like_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_hour_reply_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_hour_retweet_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_hour_like_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_type_reply_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_type_retweet_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_type_like_timestamp_indicator_engaged_with_user_id',
 'GTE_tweet_type_reply_timestamp_indicator_engaging_user_id',
 'GTE_tweet_type_retweet_timestamp_indicator_engaging_user_id',
 'GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaging_user_id',
 'GTE_tweet_type_like_timestamp_indicator_engaging_user_id',
 'GTE_language_reply_timestamp_indicator_engaged_with_user_id',
 'GTE_language_retweet_timestamp_indicator_engaged_with_user_id',
 'GTE_language_retweet_with_comment_timestamp_indicator_engaged_with_user_id',
 'GTE_language_like_timestamp_indicator_engaged_with_user_id',
 'has_photo',
 'has_video',
 'has_gif',
 'TE_most_used_word_bucket_id_reply_timestamp',
 'TE_most_used_word_bucket_id_retweet_timestamp',
 'TE_most_used_word_bucket_id_retweet_with_comment_timestamp',
 'TE_most_used_word_bucket_id_like_timestamp',
 'TE_second_used_word_bucket_id_reply_timestamp',
 'TE_second_used_word_bucket_id_retweet_timestamp',
 'TE_second_used_word_bucket_id_retweet_with_comment_timestamp',
 'TE_second_used_word_bucket_id_like_timestamp',
 'a_ff_rate',
 'b_ff_rate']
feature_list.append(like)

In [19]:
list(train.columns)

['fold',
 'mentioned_bucket_id',
 'engaging_user_id',
 'mentioned_count',
 'has_mention',
 'second_used_word_bucket_id',
 'most_used_word_bucket_id',
 'mention',
 'engaged_with_user_id',
 'language',
 'has_rt',
 'tweet_type',
 'tweet_hour',
 'tweet_dow',
 'tw_last_word',
 'tw_first_word',
 'categorified_hashtags',
 'categorified_tweet_id',
 'categorified_present_media',
 'categorified_present_links',
 'categorified_present_domains',
 'categorified_tweet_type',
 'categorified_language',
 'tweet_timestamp',
 'categorified_engaged_with_user_id',
 'engaged_with_user_follower_count',
 'engaged_with_user_following_count',
 'engaged_with_user_is_verified',
 'engaged_with_user_account_creation',
 'categorified_engaging_user_id',
 'engaging_user_follower_count',
 'engaging_user_following_count',
 'engaging_user_is_verified',
 'engaging_user_account_creation',
 'engagee_follows_engager',
 'reply_timestamp',
 'retweet_timestamp',
 'retweet_with_comment_timestamp',
 'like_timestamp',
 'present_dom

In [20]:
list(valid.columns)

['mentioned_bucket_id',
 'engaging_user_id',
 'mentioned_count',
 'has_mention',
 'second_used_word_bucket_id',
 'most_used_word_bucket_id',
 'mention',
 'engaged_with_user_id',
 'language',
 'has_rt',
 'tweet_type',
 'tweet_hour',
 'tweet_dow',
 'tw_last_word',
 'hashtags',
 'present_media',
 'present_links',
 'present_domains',
 'engaged_with_user_is_verified',
 'engaging_user_is_verified',
 'engagee_follows_engager',
 'tw_first_word',
 'tweet_id',
 'tweet_timestamp',
 'engaged_with_user_follower_count',
 'engaged_with_user_following_count',
 'engaged_with_user_account_creation',
 'engaging_user_follower_count',
 'engaging_user_following_count',
 'engaging_user_account_creation',
 'reply_timestamp',
 'retweet_timestamp',
 'retweet_with_comment_timestamp',
 'like_timestamp',
 'tweet',
 'TE_present_domains_language_engagee_follows_engager_tweet_type_present_media_engaged_with_user_is_verified_reply_timestamp',
 'TE_present_domains_language_engagee_follows_engager_tweet_type_present_med

In [21]:
drop_cols_train = []
for i in range(len(train.columns)):
    if list(train.columns)[i] not in cols:
        drop_cols_train.append(list(train.columns)[i])
drop_cols_valid = []
for i in range(len(valid.columns)):
    if list(valid.columns)[i] not in cols:
        drop_cols_valid.append(list(valid.columns)[i])

In [22]:
train = train.drop(drop_cols_train,axis=1)
train, = dask.persist(train)
valid = valid.drop(drop_cols_valid,axis=1)
valid, = dask.persist(valid)

In [23]:
len(train.columns)

93

In [24]:
len(valid.columns)

93

In [25]:
test = valid[list(train.columns)]
test, = dask.persist(test)

In [26]:
test.dtypes

mentioned_bucket_id                                                             float64
mentioned_count                                                                   int64
has_mention                                                                       int32
second_used_word_bucket_id                                                      float64
most_used_word_bucket_id                                                        float64
engaged_with_user_follower_count                                                  int32
engaged_with_user_following_count                                                 int32
engaged_with_user_is_verified                                                      bool
engaging_user_follower_count                                                      int32
engaging_user_following_count                                                     int32
engaging_user_is_verified                                                          bool
reply_timestamp                 

In [27]:
test = test.compute()
Y_valid = test[label_names]
test = test.drop(label_names,axis=1)

train = train.compute()
# train = train.compute()
Y_train = train[label_names]
train = train.drop(label_names,axis=1)


# features = [c for c in train.columns if c not in DONT_USE]
# print('Using %i features:'%(len(features)),train.shape[1])

In [28]:
if train.columns.duplicated().sum()>0:
    raise Exception(f'duplicated!: { train.columns[train.columns.duplicated()] }')
print('no dup :) ')
print(f'X_train.shape {train.shape}')
print(f'X_valid.shape {test.shape}')

no dup :) 
X_train.shape (17941349, 89)
X_valid.shape (14461760, 89)


In [29]:
%%time

for col in train.columns:
    if train[col].dtype=='bool':
        train[col] = train[col].astype('int8')
        test[col] = test[col].astype('int8')

CPU times: user 68.1 ms, sys: 15 ms, total: 83.1 ms
Wall time: 53.7 ms


In [30]:
train.head()

Unnamed: 0,mentioned_bucket_id,mentioned_count,has_mention,second_used_word_bucket_id,most_used_word_bucket_id,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_is_verified,engaging_user_follower_count,engaging_user_following_count,engaging_user_is_verified,dt_dow,dt_hour,TE_language_reply_timestamp_indicator,TE_language_retweet_timestamp_indicator,TE_language_retweet_with_comment_timestamp_indicator,TE_language_like_timestamp_indicator,TE_tweet_dow_retweet_timestamp_indicator,TE_tweet_type_reply_timestamp_indicator,TE_tweet_type_retweet_timestamp_indicator,TE_engaged_with_user_id_reply_timestamp_indicator,TE_engaged_with_user_id_retweet_timestamp_indicator,TE_engaged_with_user_id_retweet_with_comment_timestamp_indicator,TE_engaged_with_user_id_like_timestamp_indicator,GTE_language_reply_timestamp_indicator_engaging_user_id,GTE_language_retweet_timestamp_indicator_engaging_user_id,GTE_language_retweet_with_comment_timestamp_indicator_engaging_user_id,GTE_language_like_timestamp_indicator_engaging_user_id,GTE_tweet_dow_reply_timestamp_indicator_engaging_user_id,GTE_tweet_dow_retweet_timestamp_indicator_engaging_user_id,GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaging_user_id,GTE_tweet_dow_like_timestamp_indicator_engaging_user_id,GTE_tweet_hour_reply_timestamp_indicator_engaging_user_id,GTE_tweet_hour_retweet_timestamp_indicator_engaging_user_id,GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaging_user_id,GTE_tweet_hour_like_timestamp_indicator_engaging_user_id,GTE_tweet_dow_reply_timestamp_indicator_engaged_with_user_id,GTE_tweet_dow_retweet_timestamp_indicator_engaged_with_user_id,GTE_tweet_dow_retweet_with_comment_timestamp_indicator_engaged_with_user_id,GTE_tweet_dow_like_timestamp_indicator_engaged_with_user_id,GTE_tweet_hour_reply_timestamp_indicator_engaged_with_user_id,GTE_tweet_hour_retweet_timestamp_indicator_engaged_with_user_id,GTE_tweet_hour_retweet_with_comment_timestamp_indicator_engaged_with_user_id,GTE_tweet_hour_like_timestamp_indicator_engaged_with_user_id,GTE_tweet_type_reply_timestamp_indicator_engaged_with_user_id,GTE_tweet_type_retweet_timestamp_indicator_engaged_with_user_id,GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaged_with_user_id,GTE_tweet_type_like_timestamp_indicator_engaged_with_user_id,GTE_tweet_type_reply_timestamp_indicator_engaging_user_id,GTE_tweet_type_retweet_timestamp_indicator_engaging_user_id,GTE_tweet_type_retweet_with_comment_timestamp_indicator_engaging_user_id,GTE_tweet_type_like_timestamp_indicator_engaging_user_id,GTE_language_reply_timestamp_indicator_engaged_with_user_id,GTE_language_retweet_timestamp_indicator_engaged_with_user_id,GTE_language_retweet_with_comment_timestamp_indicator_engaged_with_user_id,GTE_language_like_timestamp_indicator_engaged_with_user_id,has_photo,has_video,has_gif,TE_most_used_word_bucket_id_reply_timestamp,TE_most_used_word_bucket_id_retweet_timestamp,TE_most_used_word_bucket_id_retweet_with_comment_timestamp,TE_most_used_word_bucket_id_like_timestamp,TE_second_used_word_bucket_id_reply_timestamp,TE_second_used_word_bucket_id_retweet_timestamp,TE_second_used_word_bucket_id_retweet_with_comment_timestamp,TE_second_used_word_bucket_id_like_timestamp,TE_mentioned_count_reply_timestamp,TE_mentioned_count_retweet_timestamp,TE_mentioned_count_retweet_with_comment_timestamp,TE_mentioned_count_like_timestamp,TE_mentioned_bucket_id_reply_timestamp,TE_mentioned_bucket_id_retweet_timestamp,TE_mentioned_bucket_id_retweet_with_comment_timestamp,TE_mentioned_bucket_id_like_timestamp,TE_has_mention_engaging_user_id_reply_timestamp,TE_has_mention_engaging_user_id_retweet_timestamp,TE_has_mention_engaging_user_id_retweet_with_comment_timestamp,TE_has_mention_engaging_user_id_like_timestamp,TE_mentioned_count_engaging_user_id_reply_timestamp,TE_mentioned_count_engaging_user_id_retweet_timestamp,TE_mentioned_count_engaging_user_id_retweet_with_comment_timestamp,TE_mentioned_count_engaging_user_id_like_timestamp,TE_mentioned_bucket_id_engaging_user_id_reply_timestamp,TE_mentioned_bucket_id_engaging_user_id_retweet_timestamp,TE_mentioned_bucket_id_engaging_user_id_retweet_with_comment_timestamp,TE_mentioned_bucket_id_engaging_user_id_like_timestamp,a_ff_rate,b_ff_rate
0,1.0,1,1,4.0,20.0,12672,8414,1,9682,947,0,2,13,0.033017,0.093115,0.008658,0.377179,0.086652,0.031326,0.056429,0.014953,0.084848,0.011013,0.251729,0.015432,0.049697,0.003838,0.369375,0.027007,0.086969,0.006716,0.396406,0.027007,0.086969,0.006716,0.396406,0.021605,0.269575,0.005373,0.517125,0.00831,0.180606,0.07899,0.275817,0.005401,0.167394,0.001343,0.329281,0.027007,0.086969,0.006716,0.396406,0.012044,0.101607,0.011162,0.202018,0,0,0,0.026982,0.08633,0.006726,0.39646,0.031679,0.079484,0.006499,0.411269,0.074506,0.046827,0.006425,0.3792,0.074506,0.046827,0.006425,0.3792,0.025699,0.082729,0.006403,0.37743,0.026984,0.086866,0.006723,0.396301,0.026984,0.086866,0.006723,0.396301,0.663984,10.223865
1,1.0,1,1,1.0,20.0,397,344,0,66,142,0,3,22,0.033017,0.093115,0.008658,0.377179,0.08752,0.037969,0.078495,0.142267,0.089422,0.021402,0.583549,0.210803,0.034788,0.002686,0.258562,0.027007,0.086969,0.006716,0.396406,0.027007,0.086969,0.006716,0.396406,0.021605,0.069575,0.005373,0.517125,0.027007,0.086969,0.006716,0.396406,0.277638,0.015813,0.046676,0.572074,0.027007,0.086969,0.006716,0.396406,0.219136,0.009402,0.027753,0.691503,0,0,0,0.026982,0.08633,0.006726,0.39646,0.046066,0.05875,0.006687,0.414817,0.074506,0.046827,0.006425,0.3792,0.074506,0.046827,0.006425,0.3792,0.026984,0.086866,0.006723,0.396301,0.026984,0.086866,0.006723,0.396301,0.026984,0.086866,0.006723,0.396301,0.866499,0.464789
2,1.0,1,1,19.0,20.0,103,53,0,347,263,0,4,1,0.031388,0.069589,0.010858,0.420874,0.086362,0.005755,0.110378,0.051338,0.124646,0.037811,0.56427,0.013503,0.043484,0.003358,0.323203,0.027007,0.086969,0.006716,0.396406,0.027007,0.086969,0.006716,0.396406,0.027007,0.086969,0.006716,0.396406,0.027007,0.086969,0.006716,0.396406,0.027007,0.086969,0.006716,0.396406,0.027007,0.086969,0.006716,0.396406,0.079145,0.167705,0.073347,0.756116,0,0,0,0.026982,0.08633,0.006726,0.39646,0.021866,0.092883,0.006697,0.376858,0.074506,0.046827,0.006425,0.3792,0.074506,0.046827,0.006425,0.3792,0.026984,0.086866,0.006723,0.396301,0.026984,0.086866,0.006723,0.396301,0.026984,0.086866,0.006723,0.396301,0.514563,1.319392
3,1.0,1,1,1.0,20.0,8197,5598,0,4110,2097,0,3,9,0.027108,0.078669,0.007454,0.384642,0.08752,0.037969,0.078495,0.010591,0.132145,0.002634,0.214277,0.060229,0.067082,0.029339,0.273875,0.027007,0.086969,0.006716,0.396406,0.018004,0.057979,0.004477,0.26427,0.010803,0.134788,0.002686,0.158562,0.027007,0.086969,0.006716,0.396406,0.004321,0.173915,0.001075,0.183425,0.058317,0.018309,0.001414,0.346612,0.003601,0.178263,0.000895,0.086187,1,0,0,0.026982,0.08633,0.006726,0.39646,0.046066,0.05875,0.006687,0.414817,0.074506,0.046827,0.006425,0.3792,0.074506,0.046827,0.006425,0.3792,0.066942,0.075535,0.005846,0.388088,0.073318,0.082729,0.006403,0.37743,0.073318,0.082729,0.006403,0.37743,0.682933,1.959943
4,1.0,1,1,11.0,20.0,1798,85,0,791,993,0,1,7,0.03099,0.092389,0.00757,0.383898,0.089205,0.037969,0.078495,0.027007,0.086969,0.006716,0.396406,0.016058,0.034027,0.000389,0.298342,0.003858,0.012424,0.000959,0.306629,0.006355,0.079287,0.00158,0.740331,0.027007,0.086969,0.006716,0.396406,0.027007,0.086969,0.006716,0.396406,0.027007,0.086969,0.006716,0.396406,0.023165,0.014812,0.011284,0.566875,0.027007,0.086969,0.006716,0.396406,0,0,0,0.026982,0.08633,0.006726,0.39646,0.025691,0.087658,0.006486,0.390644,0.074506,0.046827,0.006425,0.3792,0.074506,0.046827,0.006425,0.3792,0.042769,0.048259,0.031513,0.331278,0.026984,0.086866,0.006723,0.396301,0.026984,0.086866,0.006723,0.396301,0.047275,0.796576


In [31]:
#label_names = ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']

In [69]:
params_rely = {
        'num_leaves': 68,
        'learning_rate': 0.014288282921897474,
        'max_depth': 22,
        'lambda_l1': 50.0,
        'lambda_l2': 50.0,
        'colsample_bynode': 0.8,
        'colsample_bytree': 0.20327140473758715,
        'bagging_fraction': 0.8,
        'bagging_freq': 7,
        'metric':'average_precision',
        'max_bin': 106,
        'min_data_in_leaf': int(1282.3912530172006),
        'early_stopping_rounds':20,
        }

params_retweet = {
        'num_leaves': 67,
        'learning_rate': 0.03377589059708651,
        'max_depth': 50,
        'lambda_l1': 50.0,
        'lambda_l2': 50.0,
        'colsample_bynode': 0.8,
        'colsample_bytree': 0.3291900831690586,
        'bagging_fraction': 0.8,
        'bagging_freq': 7,
        'metric':'average_precision',
        'max_bin': 200,
        'min_data_in_leaf': int(1282.3912530172006),
        'early_stopping_rounds':20,
        }

params_comment = {
        'num_leaves': 189,
        'learning_rate': 0.01102516875432016,
        'max_depth': 30,
        'lambda_l1': 50.0,
        'lambda_l2': 50.0,
        'colsample_bynode': 0.8,
        'colsample_bytree':  0.24030161016241586,
        'bagging_fraction': 0.8,
        'bagging_freq': 7,
        'metric':'average_precision',
        'max_bin': 107,
        'min_data_in_leaf': int(1282.3912530172006),
        'early_stopping_rounds':20,
        }

params_like = {
        'num_leaves': 114,
        'learning_rate': 0.04520504200705504,
        'max_depth': 36,
        'lambda_l1': 50.0,
        'lambda_l2': 50.0,
        'colsample_bynode': 0.8,
        'colsample_bytree':  0.291807900318132,
        'bagging_fraction': 0.8,
        'bagging_freq': 7,
        'metric':'average_precision',
        'max_bin': 139,
        'min_data_in_leaf': int(1282.3912530172006),
        'early_stopping_rounds':20,
        }
paramss = [params_rely,params_retweet,params_comment,params_like]

In [70]:
import lightgbm as lgb
oof = np.zeros((len(test),len(label_names)))
preds = []
models = []

numlabel = 0
name = label_names[numlabel]
X_train = train[feature_list[numlabel]]
X_valid = test[feature_list[numlabel]]
print('#'*25);print('###',name);print('#'*25)

trainD = lgb.Dataset(data=X_train,label=Y_train.iloc[:, numlabel],categorical_feature=set([]))
validationD = lgb.Dataset(data=X_valid, label=Y_valid.iloc[:, numlabel], categorical_feature=set([]))
model = lgb.train(paramss[numlabel],train_set=trainD,valid_sets=validationD,categorical_feature=set([]))

start = time.time(); print('Predicting...')
#Y_valid[f'pred_{name}'] = xgb.dask.predict(client,model,valid)
oof[:, numlabel] += model.predict(X_valid)
#preds.append(xgb.dask.predict(client,model,valid))
print('Took %.1f seconds'%(time.time()-start))
models.append(model)

del trainD,validationD

for numlabel in range(4):
    name = label_names[numlabel]
    X_train = train[feature_list[numlabel]]
    X_valid = test[feature_list[numlabel]]
    print('#'*25);print('###',name);print('#'*25)
    
    trainD = lgb.Dataset(data=X_train,label=Y_train.iloc[:, numlabel],categorical_feature=set([]))
    validationD = lgb.Dataset(data=X_valid, label=Y_valid.iloc[:, numlabel], categorical_feature=set([]))
    model = lgb.train(paramss[numlabel],train_set=trainD,valid_sets=validationD,categorical_feature=set([]))
    
    start = time.time(); print('Predicting...')
    #Y_valid[f'pred_{name}'] = xgb.dask.predict(client,model,valid)
    oof[:, numlabel] += model.predict(X_valid)
    #preds.append(xgb.dask.predict(client,model,valid))
    print('Took %.1f seconds'%(time.time()-start))
    models.append(model)

    del trainD,validationD


#########################
### reply_timestamp
#########################




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7779
[LightGBM] [Info] Number of data points in the train set: 17941349, number of used features: 88
[LightGBM] [Info] Start training from score 0.026990
[1]	valid_0's average_precision: 0.0287312
Training until validation scores don't improve for 20 rounds
[2]	valid_0's average_precision: 0.0297187
[3]	valid_0's average_precision: 0.0317293
[4]	valid_0's average_precision: 0.0361131
[5]	valid_0's average_precision: 0.0590128
[6]	valid_0's average_precision: 0.0767537
[7]	valid_0's average_precision: 0.0864527
[8]	valid_0's average_precision: 0.0832833
[9]	valid_0's average_precision: 0.0847012
[10]	valid_0's average_precision: 0.100369
[11]	valid_0's average_precision: 0.0887989
[12]	valid_0's average_precision: 0.0751699
[13]	valid_0's average_precision: 0.082481
[14]	valid_0's average_precision: 0.0859022
[15]	valid_0's average_precis

In [52]:
yvalid = Y_valid[label_names].values

# Compute Validation Metrics

In [48]:
from sklearn.metrics import precision_recall_curve, auc, log_loss
from sklearn.metrics import average_precision_score

def compute_prauc(pred, gt):
  prec, recall, thresh = precision_recall_curve(gt, pred)
  prauc = auc(recall, prec)
  return prauc

def compute_AP(pred, gt):
  return average_precision_score(gt, pred)

def calculate_ctr(gt):
  positive = len([x for x in gt if x == 1])
  ctr = positive/float(len(gt))
  return ctr

def compute_rce(pred, gt):
    cross_entropy = log_loss(gt, pred)
    data_ctr = calculate_ctr(gt)
    strawman_cross_entropy = log_loss(gt, [data_ctr for _ in range(len(gt))])
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

# FAST METRIC FROM GIBA
def compute_rce_fast(pred, gt):
    cross_entropy = log_loss(gt, pred)
    yt = np.mean(gt)     
    strawman_cross_entropy = -(yt*np.log(yt) + (1 - yt)*np.log(1 - yt))
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

In [71]:
%%time
txt = ''
for i in range(4):
    ap = compute_AP(oof[:,i], yvalid[:, i])
    rce   = compute_rce_fast(oof[:,i], yvalid[:, i])
    txt_ = f"{label_names[i]:20} AP:{ap:.5f} RCE:{rce:.5f}"
    print(txt_)
    txt += txt_ + '\n'

reply_timestamp      AP:0.13698 RCE:11.27361
retweet_timestamp    AP:0.08055 RCE:-893.21991
retweet_with_comment_timestamp AP:0.00554 RCE:-457.69205
like_timestamp       AP:0.40010 RCE:-1953.18056
CPU times: user 19 s, sys: 588 ms, total: 19.6 s
Wall time: 19.1 s


In [37]:
# list(test.columns)

In [38]:
print('This notebook took %.1f minutes'%((time.time()-very_start)/60.))

This notebook took 16.8 minutes


In [39]:
print(f'X_valid.shape {oof.shape}')

X_valid.shape (14461760, 4)


In [72]:
for i in range(4):
    name = label_names[i]
    models[i].save_model(filename = "models/"+name+".txt")

<lightgbm.basic.Booster at 0x7f4bd0369390>

In [73]:
import lightgbm as lgb
label_names = ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']

In [74]:
oof = np.zeros((len(test),len(label_names)))
i=0
X_valid = test[feature_list[i]]
name = label_names[i]
filename = "models1/"+name+".txt"
lgbm = lgb.Booster(model_file=filename)
oof[:, i] += lgbm.predict(X_valid)
del lgbm

In [61]:
txt = ''
for i in range(4):
    ap = compute_AP(oof[:,i], yvalid[:, i])
    rce   = compute_rce_fast(oof[:,i], yvalid[:, i])
    txt_ = f"{label_names[i]:20} AP:{ap:.5f} RCE:{rce:.5f}"
    print(txt_)
    txt += txt_ + '\n'

reply_timestamp      AP:0.14159 RCE:6.23012
retweet_timestamp    AP:0.08055 RCE:-893.21991
retweet_with_comment_timestamp AP:0.00554 RCE:-457.69205
like_timestamp       AP:0.40010 RCE:-1953.18056
