In [79]:

import gc
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd
from dask.distributed import Client
from sklearn.metrics import auc, log_loss, precision_recall_curve
from sklearn.model_selection import KFold

FILE_NAME_train = "/mnt/DP_disk3/Recsys/train-1.parquet"
FILE_NAME_test = "/mnt/DP_disk3/Recsys/test-0.parquet"
FILE_NAME_text = "/mnt/DP_disk3/Recsys/text-processings-1.parquet"
FILE_NAME_user = "/mnt/DP_disk3/Recsys/a_count_combined-final.parquet"



In [80]:
label_names = ["reply", "retweet", "retweet_comment", "like"]
train = pd.read_parquet(FILE_NAME_train)

In [81]:
train.dtypes

hashtags              object
tweet_id               int64
media                  uint8
links                 object
domains               object
tweet_type             uint8
language               uint8
timestamp              int64
a_user_id              int64
a_follower_count       int64
a_following_count      int64
a_is_verified           bool
a_account_creation     int64
b_user_id              int64
b_follower_count       int64
b_following_count      int64
b_is_verified           bool
b_account_creation     int64
b_follows_a             bool
reply                  int64
retweet                int64
retweet_comment        int64
like                   int64
id                     int64
len_hashtags           int64
len_domains            int64
len_links              int64
dtype: object

In [69]:
train.head()['timestamp']

0    1581131751
1    1581527100
2    1580979604
3    1580993048
4    1581143484
Name: timestamp, dtype: int64

In [70]:
train['timestamp2'] = train['timestamp']

In [71]:

train['timestamp'] = pd.to_datetime(train['timestamp'], unit="s")

In [72]:
train.head()['timestamp']

0   2020-02-08 03:15:51
1   2020-02-12 17:05:00
2   2020-02-06 09:00:04
3   2020-02-06 12:44:08
4   2020-02-08 06:31:24
Name: timestamp, dtype: datetime64[ns]

In [73]:
train['timestamp'] = train['timestamp'].astype("int64") / 1e9

In [74]:
train.head()['timestamp']

0    1.581132e+09
1    1.581527e+09
2    1.580980e+09
3    1.580993e+09
4    1.581143e+09
Name: timestamp, dtype: float64

In [75]:
train.head()['timestamp2']

0    1581131751
1    1581527100
2    1580979604
3    1580993048
4    1581143484
Name: timestamp2, dtype: int64

In [77]:
train["timestamp2"] = train["timestamp2"].astype("int64") / 1e9
#test["timestamp"] = test["timestamp"].astype("int64") / 1e9


In [78]:
train.head()['timestamp2']

0    1.581132
1    1.581527
2    1.580980
3    1.580993
4    1.581143
Name: timestamp2, dtype: float64

In [82]:
train["timestamp"] = train["timestamp"].map(datetime.utcfromtimestamp)
##test["timestamp"] = test["timestamp"].map(datetime.utcfromtimestamp)
#split_time(train)
#split_time(test)


In [83]:
train.dtypes

hashtags                      object
tweet_id                       int64
media                          uint8
links                         object
domains                       object
tweet_type                     uint8
language                       uint8
timestamp             datetime64[ns]
a_user_id                      int64
a_follower_count               int64
a_following_count              int64
a_is_verified                   bool
a_account_creation             int64
b_user_id                      int64
b_follower_count               int64
b_following_count              int64
b_is_verified                   bool
b_account_creation             int64
b_follows_a                     bool
reply                          int64
retweet                        int64
retweet_comment                int64
like                           int64
id                             int64
len_hashtags                   int64
len_domains                    int64
len_links                      int64
d

In [84]:
train.head()['timestamp']

0   2020-02-08 03:15:51
1   2020-02-12 17:05:00
2   2020-02-06 09:00:04
3   2020-02-06 12:44:08
4   2020-02-08 06:31:24
Name: timestamp, dtype: datetime64[ns]

In [85]:
train["timestamp"] = train["timestamp"].astype("int64") / 1e9
#test["timestamp"] = test["timestamp"].astype("int64") / 1e9



In [86]:
train.head()['timestamp']

0    1.581132e+09
1    1.581527e+09
2    1.580980e+09
3    1.580993e+09
4    1.581143e+09
Name: timestamp, dtype: float64

In [None]:
for c in label_names:
    train.loc[train[c] == 0, c] = np.nan


In [None]:
train["engage_time"] = train[label_names].min(1)

In [30]:
train.dtypes

hashtags               object
tweet_id                int64
media                   uint8
links                  object
domains                object
tweet_type              uint8
language                uint8
timestamp             float64
a_user_id               int64
a_follower_count        int64
a_following_count       int64
a_is_verified            bool
a_account_creation      int64
b_user_id               int64
b_follower_count        int64
b_following_count       int64
b_is_verified            bool
b_account_creation      int64
b_follows_a              bool
reply                 float64
retweet               float64
retweet_comment       float64
like                  float64
id                      int64
len_hashtags            int64
len_domains             int64
len_links               int64
engage_time           float64
dtype: object

In [42]:
train.head()[['engage_time','timestamp']]

Unnamed: 0,engage_time,timestamp
0,1581156000.0,1.581132
1,,1.581527
2,1580992000.0,1.58098
3,1580993000.0,1.580993
4,,1.581143


In [32]:
train.head()[['engage_time','timestamp']]

Unnamed: 0,engage_time,timestamp
0,1581156000.0,1581132000.0
1,,1581527000.0
2,1580992000.0,1580980000.0
3,1580993000.0,1580993000.0
4,,1581143000.0


In [43]:
train["elapsed_time"] = train["engage_time"] - train["timestamp"]


In [44]:
train.head()[['elapsed_time','engage_time','timestamp']]

Unnamed: 0,elapsed_time,engage_time,timestamp
0,1581156000.0,1581156000.0,1.581132
1,,,1.581527
2,1580992000.0,1580992000.0,1.58098
3,1580993000.0,1580993000.0,1.580993
4,,,1.581143


In [22]:
train.head()

Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,...,b_follows_a,reply,retweet,retweet_comment,like,id,len_hashtags,len_domains,len_links,engage_time
0,7406650BAE78F56FBD8068FC460A6A1E,0,0,81DE6DAEA33235BD3AEE98A381D79172,DBB05735DF475AB5DABEE791299AE3D1,2,54,1581132000.0,0,314265,...,False,,,,1581156000.0,0,1,1,1,1581156000.0
1,,1,5,,,2,9,1581527000.0,1,43089,...,False,,,,,1,0,0,0,
2,,2,0,,,2,38,1580980000.0,2,60258,...,False,,,,1580992000.0,2,0,0,0,1580992000.0
3,,3,9,,,1,38,1580993000.0,3,119321,...,False,,,,1580993000.0,3,0,0,0,1580993000.0
4,,4,7,,,1,47,1581143000.0,4,3106126,...,False,,,,,4,0,0,0,


In [None]:
time = "2020-02-08 03:15:51"
time

In [57]:
dft = pd.DataFrame([{'datetime':'2020-02-08 03:15:51', 'col2':'1'}, {'datetime':'2020-02-10 03:15:51', 'col2':'2'}])


In [65]:
dft['datetime'] = dft['datetime'].astype(np.uint32)

ValueError: invalid literal for int() with base 10: '2020-02-08 03:15:51'

In [62]:
dft['datetime'] = dft['datetime'].astype("int64") / 1

ValueError: invalid literal for int() with base 10: '2020-02-08 03:15:51'