In [1]:
# import stuff
#  %%time

%config IPCompleter.greedy=True

import os
os.environ['http_proxy']=''
os.environ['https_proxy']=''

#import modin.config
#modin.config.Backend.put('omnisci')

#import modin.pandas as pd
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

import json
from pathlib import Path
import timeit
# Open the HDFS client used by every later cell that reads parquet data.
# NOTE(review): later cells call `hdfs.read_parquet(...)` / `hdfs.read_table(...)` and pass this
# object as `filesystem=`; switching to the commented-out `pa.fs.HadoopFileSystem` alternative
# would presumably require updating those call sites as well - confirm before changing.
hdfs = pa.hdfs.connect()
# hdfs = pa.fs.HadoopFileSystem(host="localhost")



In [2]:
# df = hdfs.read_parquet("/updated_parquet/updated_parquet/part-00000.parquet").to_pandas()
#df = pq.read_table("/updated_parquet/updated_parquet/part-00000.parquet", filesystem = hdfs).to_pandas()

In [None]:
from datetime import datetime, timedelta
import numpy as np

# Time-based split of the raw parquet parts into validation / train / test frames.
# The three windows are half-open and disjoint, so no row can land in two sets and
# no row sitting exactly on a boundary is dropped:
#   validation: tweet_datetime <  valend
#   train:      valend  <= tweet_datetime < train_stop
#   test:       testend <= tweet_datetime
dstart = datetime.strptime("2021-02-03 16:00:00", "%Y-%m-%d %H:%M:%S")
print(dstart)
valend = dstart + timedelta(days=2)
testend = dstart + timedelta(days=18)   # despite its name, this is where the test window starts
trainend = valend + timedelta(days=18)

# `trainend` (dstart + 20 days) lies two days after the start of the test window
# (dstart + 18 days). Using it directly as the train upper bound put those two days
# in both train and test, so the train window is clipped at the test start.
train_stop = min(trainend, testend)

trains = []
vals = []
tests = []

# Only a subset of the part files is read: parts 0-19 and 180-198.
file_idxs = list(range(20)) + list(range(180, 199))
for idx in file_idxs:
    idxs = str(idx).zfill(3)
    df = hdfs.read_parquet(f"/updated_parquet/updated_parquet/part-00{idxs}.parquet").to_pandas()
    df['tweet_datetime'] = pd.to_datetime(df['tweet_timestamp'], unit='s')

    tweet_time = df['tweet_datetime']
    val_mask = tweet_time < valend
    train_mask = (tweet_time >= valend) & (tweet_time < train_stop)
    test_mask = tweet_time >= testend

    trains.append(df[train_mask])
    vals.append(df[val_mask])
    tests.append(df[test_mask])


train = pd.concat(trains)
test = pd.concat(tests)
val = pd.concat(vals)

# "/updated_parquet/updated_parquet/part-00000.parquet"


2021-02-03 16:00:00


In [None]:
# Display the (rows, columns) shape of each split as a quick sanity check.
test.shape, train.shape, val.shape

In [None]:
# non-estimator functions.  Thanks to Visilij for porting.
def prep_tsv_columns(df):
    """Add int8 flag columns telling which media kinds appear in `present_media`.

    Missing `present_media` values are treated as the empty string, so they
    produce 0 in every flag. The frame is modified in place and returned.
    """
    present = df['present_media'].fillna('')
    flag_patterns = (('has_photo', 'Photo'), ('has_video', 'Video'), ('has_gif', 'GIF'))
    for flag_column, pattern in flag_patterns:
        df[flag_column] = present.str.contains(pattern).astype('int8')
    return df

def get_rates(df):
    """Add following/follower ratio columns for the engaged-with and engaging users.

    `engaged_user_rate` and `engaging_user_rate` are each the user's following
    count divided by their follower count. The frame is modified in place and
    returned.
    """
    ratio_sources = {
        'engaged_user_rate': ("engaged_with_user_following_count", "engaged_with_user_follower_count"),
        'engaging_user_rate': ("engaging_user_following_count", "engaging_user_follower_count"),
    }
    for rate_column, (following, follower) in ratio_sources.items():
        df[rate_column] = df[following] / df[follower]
    return df

def prep_bool_cols(df):
    """Add an int8 `<column>_indicator` copy of each boolean relationship/verification column.

    The frame is modified in place and returned.
    """
    for source_column in ('engagee_follows_engager', "engaged_with_user_is_verified", 'engaging_user_is_verified'):
        df[f'{source_column}_indicator'] = df[source_column].astype('int8')

    return df
        
def prep_datetime_columns(df):
    """Derive datetime, hour-of-day and day-of-week columns from `tweet_timestamp`.

    `tweet_timestamp` is parsed as a count of seconds (`unit='s'`). The frame is
    modified in place and returned.
    """
    tweet_time = pd.to_datetime(df['tweet_timestamp'], unit='s')
    df['tweet_datetime'] = tweet_time
    df['tweet_hour'] = tweet_time.dt.hour
    df['tweet_dow'] = tweet_time.dt.dayofweek
    return df
        
def feature_generation(df):
    """Add `both_verified`: 1 when the engaging and the engaged-with user are both verified.

    The flag is stored as int8. The frame is modified in place and returned.
    """
    engaging_verified = df["engaging_user_is_verified"]
    engaged_verified = df["engaged_with_user_is_verified"]
    df['both_verified'] = (engaging_verified & engaged_verified).astype('int8')
    return df

def prep_response(df):
    """Add an int8 `<column>_indicator` for each engagement timestamp column.

    The indicator is 1 where the timestamp is present and 0 where it is null.
    The frame is modified in place and returned.
    """
    for timestamp_column in ('reply_timestamp', 'retweet_timestamp',
                             'retweet_with_comment_timestamp', 'like_timestamp'):
        df[f'{timestamp_column}_indicator'] = df[timestamp_column].notna().astype('int8')
    return df
        
def most_common_value(df, index, cat_column):
    """Keep, for every `index` value, one row carrying its most frequent `cat_column` value.

    This helper operates on a Spark-style DataFrame (`withColumn`, window
    functions) and expects `Window` and `F` to already be in scope - presumably
    `pyspark.sql.Window` and `pyspark.sql.functions`; neither is imported in the
    visible part of this notebook, so confirm before calling it.

    The result keeps the helper columns `count` (rows sharing the same
    `(index, cat_column)` pair) and `row_number` (always 1 after filtering).
    Ties between equally frequent values are broken arbitrarily.
    """
    # Frequency of each (index, cat_column) combination.
    count_window = Window.partitionBy(index, cat_column)
    # Rank rows *within each index* by that frequency. Ranking inside the
    # (index, cat_column) partition, as before, compared rows whose counts are all
    # equal and therefore kept one row per pair instead of the most common value.
    rank_window = Window.partitionBy(index).orderBy(F.desc("count"))
    df_wrn = df.withColumn("count", F.count("*").over(count_window)).withColumn('row_number', F.row_number().over(rank_window))
    df_wrn = df_wrn.filter("row_number = 1")
    return df_wrn

def language_indicator(df):
    """Flag rows whose tweet language equals the first language seen for the engaging user.

    For each `engaging_user_id` the first `language` value in the frame is taken
    as `engaging_user_language`, inner-joined back onto the rows, and compared
    with each row's own `language` to produce the int8 `same_languages` column.
    Returns the joined frame.
    """
    first_language = (
        df.groupby('engaging_user_id')['language']
        .first()
        .rename('engaging_user_language')
    )

    joined = df.join(first_language, on='engaging_user_id', how='inner')
    languages_match = joined['engaging_user_language'] == joined['language']
    joined['same_languages'] = languages_match.astype('int8')
    return joined

def all_prep(df):
    """Run every non-estimator preparation step on `df` and return the result.

    Two input columns arrive under misspelled names (`enaging_...`); they are
    first copied to correctly spelled names because the later steps read the
    corrected spelling. `language_indicator` is deliberately left out of the
    pipeline.
    """
    corrected_names = (
        ('engaging_user_is_verified', 'enaging_user_is_verified'),
        ('engaging_user_following_count', 'enaging_user_following_count'),
    )
    for corrected, misspelled in corrected_names:
        df[corrected] = df[misspelled]

    # Order matters: later steps read columns that were created or copied earlier.
    steps = (
        prep_tsv_columns,
        prep_bool_cols,
        feature_generation,
        prep_datetime_columns,
        get_rates,
        prep_response,
    )
    for step in steps:
        df = step(df)
    return df

In [None]:
#Model object stub to make joins easier


def _token_set(value):
    """Return the tokens held by `value` as a set, or None when the value is missing.

    Strings are split on tabs, other iterables are used element-wise, and any
    remaining scalar becomes a single token via `str`.
    """
    if value is None:
        return None
    if isinstance(value, float) and value != value:
        # NaN: how pandas marks a missing value, e.g. after a left merge.
        return None
    if isinstance(value, str):
        return set(value.split("\t"))
    if hasattr(value, "__iter__"):
        return set(value)
    return {str(value)}


def intersect(x1, x2):
    """Count, row by row, how many tokens two aligned sequences have in common.

    Each element of `x1` / `x2` may be a tab-separated string or an iterable of
    tokens. The result is a list of ints with one entry per row; a row where
    either side is missing (None or NaN) contributes 0. This matches the
    commented Spark reference, which takes the size of the array intersection
    and fills nulls with 0.

    Previously the list mixed `0` with `set` objects, the second operand was
    intersected character by character when it was a string, and NaN was
    treated as the token "nan".
    """
    values = []
    # zip walks the values positionally, so a pandas Series with a non-default
    # index is handled the same way as a plain list.
    for left, right in zip(x1, x2):
        left_tokens = _token_set(left)
        right_tokens = _token_set(right)
        if left_tokens is None or right_tokens is None:
            values.append(0)
        else:
            values.append(len(left_tokens & right_tokens))

    return values
    
class Model():
    """Lookup table loaded from parquet that is left-joined onto new data.

    Attributes:
        df_agg: pandas DataFrame read from "/training/<name>.parquet".
        join_cols: column name(s) passed as `on=` when merging; None lets
            `pd.merge` choose its default join keys.
    """

    def __init__(self, name, join_cols=None):
        """Read "/training/<name>.parquet" through the notebook's `hdfs` client.

        Args:
            name: base name of the parquet dataset under "/training".
            join_cols: column name(s) to join on in `transform`.
        """
        # The JSON parameter file that accompanies each table is not loaded here.
        self.df_agg = pq.ParquetDataset(f"/training/{name}.parquet", filesystem = hdfs).read().to_pandas()
        self.join_cols = join_cols

    def set_join_cols(self, cols):
        """Replace the join columns used by `transform`.

        `self` was missing from the signature, so calling this on an instance
        raised TypeError.
        """
        self.join_cols = cols

    def transform(self, data):
        """Return `data` left-merged with the stored table on `join_cols`."""
        return pd.merge(data, self.df_agg, on=self.join_cols, how='left')
    
#Model object stub to make joins easier
class CPD_Model():
    """Lookup table plus JSON parameters, joined onto new data with an intersection feature.

    Attributes:
        params: dict loaded from the first JSON file found under "training/<name>.json".
        df_agg: pandas DataFrame read from "training/<name>.parquet".
        join_cols: column name(s) passed as `on=` when merging.
        inputCol / outputCol: names used to build the source and result column names.
    """

    def __init__(self, name, inputCol = None, outputCol=None, join_cols=None):
        """Load the JSON parameters and the parquet table for `name`.

        Raises:
            FileNotFoundError: if no JSON file exists under "training/<name>.json".
        """
        json_dir = Path(f"training/{name}.json")
        jsons = list(json_dir.glob(f"**/*.json"))
        if not jsons:
            raise FileNotFoundError(f"no json parameter file found under {json_dir}")

        #there should only be one json file... coalesced in spark from one row
        with open(jsons[0]) as json_file:
            self.params = json.load(json_file)

        # NOTE(review): `Model` reads its table with pq.ParquetDataset(..., filesystem=hdfs) from an
        # absolute "/training/..." path; confirm that `hdfs.read_table` and this relative path are
        # what is intended here.
        self.df_agg = hdfs.read_table(f"training/{name}.parquet").to_pandas()

        self.join_cols = join_cols
        self.inputCol = inputCol
        self.outputCol = outputCol

    def set_join_cols(self, cols):
        """Replace the join columns used by `transform`.

        `self` was missing from the signature, so calling this on an instance
        raised TypeError.
        """
        self.join_cols = cols

    def transform(self, data):
        """Left-merge the stored table onto `data` and add the intersection column.

        The new column `CPD_<inputCol>_<outputCol>` holds, per row, the result of
        `intersect` between the stored
        `<inputCol>_<inputCol>_<outputCol>_intersection_unique` column and the
        row's `<inputCol>` value. Returns the merged frame.

        `intersect` is now called once on the two columns; it used to be wrapped
        in `DataFrame.apply`, which re-ran it per column, and the merged frame
        was never returned.
        """
        alldf = pd.merge(data, self.df_agg, on=self.join_cols, how='left')
        unique_col = f"{self.inputCol}_{self.inputCol}_{self.outputCol}_intersection_unique"
        alldf[f'CPD_{self.inputCol}_{self.outputCol}'] = intersect(alldf[unique_col], alldf[self.inputCol])
        return alldf
    
    
    
    #merge the stored data with new data and calculate the intersection
#         dataset = dataset.join(self.agg_all.select(self.indexCol, f"{self.inputCol}_{self.outputCol}_unique"), on=[self.indexCol], how='left')
#         dataset = dataset.withColumn(self.outputCol, F.size(F.array_intersect(f"{self.inputCol}_{self.outputCol}_unique", F.split(F.col(self.inputCol), '\t'))))
#         dataset = dataset.fillna({self.outputCol: 0.0})
#         return dataset

In [None]:
# Load a single table on its own so it can be inspected in the next cell.
# The join columns now follow the table name (tweet_type + engaged_with_user_id), matching the
# later load of the same table; they used to name `engaging_user_id`.
gte_tweet_type = Model("GMTE_tweet_type_engaged_with_user_id", join_cols=['tweet_type', 'engaged_with_user_id'])


In [None]:
# Preview the first rows of the table loaded into `gte_tweet_type`.
gte_tweet_type.df_agg.head()

In [None]:
# Instantiate the Model objects. Each constructor reads "/training/<name>.parquet" through `hdfs`.
# NOTE(review): `te_engaging_user` is created twice below and also appears twice in `model_list`,
# so its table is merged twice. A later cell looks up a column ending in `_x`, which presumably
# comes from pandas suffixing the duplicated columns of that second merge - confirm and update
# that lookup before removing the duplicate.
te_engaging_user = Model("TE_engaging_user_id", join_cols=['engaging_user_id'])
te_language = Model("TE_language", join_cols=['language'])
te_tweet_type = Model("TE_tweet_type", join_cols=['tweet_type'])
te_engaged_user = Model("TE_engaged_with_user_id", join_cols=['engaged_with_user_id'])
te_tweet_dow = Model("TE_tweet_dow", join_cols=['tweet_dow'])
te_tweet_hour = Model("TE_tweet_hour", join_cols=['tweet_hour'])
te_engaging_user = Model("TE_engaging_user_id", join_cols = ['engaging_user_id'])



model_list = [te_engaging_user, te_language, te_tweet_type, te_engaged_user, te_tweet_dow, te_tweet_hour, te_engaging_user] 
#              gte_engaged_with_user_id, gte_language, gte_tweet_dow, gte_tweet_hour, gte_tweet_type]

In [None]:
# gte_engaged_with_user_id = Model("GMTE_engaged_with_user_id", join_cols=['engaging_user_id', 'engaged_with_user_id'])
# gte_language = Model("GMTE_language", join_cols = ['engaging_user_id', 'language'])
# gte_tweet_dow = Model("GMTE_tweet_dow", join_cols=['engaging_user_id', 'tweet_dow'])
# gte_tweet_hour = Model("GMTE_tweet_hour", join_cols=['engaging_user_id', 'tweet_hour'])
# gte_tweet_type = Model("GMTE_tweet_type", join_cols=['engaging_user_id', 'tweet_type'])

# Two-key tables: every table name has the form "GMTE_<first key>_<second key>" and is joined on
# exactly those two columns, in that order.
gte_engaged_with_user_id_euid = Model(
    "GMTE_engaged_with_user_id_engaging_user_id",
    join_cols=['engaged_with_user_id', 'engaging_user_id'],
)
# gte_has_rt_engaged_with_user_id = Model("GMTE_has_rt_engaged_with_user_id", join_cols = ['has_rt', 'engaged_with_user_id'])
gte_language_engaged_with_user_id = Model(
    "GMTE_language_engaged_with_user_id",
    join_cols=['language', 'engaged_with_user_id'],
)
gte_language_engaging_user_id = Model(
    "GMTE_language_engaging_user_id",
    join_cols=['language', 'engaging_user_id'],
)
gte_tweet_dow_ewuid = Model(
    "GMTE_tweet_dow_engaged_with_user_id",
    join_cols=['tweet_dow', 'engaged_with_user_id'],
)
gte_tweet_dow_euid = Model(
    "GMTE_tweet_dow_engaging_user_id",
    join_cols=['tweet_dow', 'engaging_user_id'],
)
gte_tweet_hour_engaged_with_user_id = Model(
    "GMTE_tweet_hour_engaged_with_user_id",
    join_cols=['tweet_hour', 'engaged_with_user_id'],
)
gte_tweet_hour_euid = Model(
    "GMTE_tweet_hour_engaging_user_id",
    join_cols=['tweet_hour', 'engaging_user_id'],
)
gte_tweet_type_ewuid = Model(
    "GMTE_tweet_type_engaged_with_user_id",
    join_cols=['tweet_type', 'engaged_with_user_id'],
)
gte_tweet_type_euid = Model(
    "GMTE_tweet_type_engaging_user_id",
    join_cols=['tweet_type', 'engaging_user_id'],
)




In [None]:
# Append the two-key tables after the single-key ones; a new list is bound to `model_list`.
model_list = [
    *model_list,
    gte_engaged_with_user_id_euid,
    gte_language_engaged_with_user_id,
    gte_language_engaging_user_id,
    gte_tweet_dow_ewuid,
    gte_tweet_dow_euid,
    gte_tweet_hour_engaged_with_user_id,
    gte_tweet_hour_euid,
    gte_tweet_type_ewuid,
    gte_tweet_type_euid,
]

In [None]:
#call transformer code
prep_start = timeit.default_timer()
train = all_prep(train)
test = all_prep(test)
val = all_prep(val)
prep_end = timeit.default_timer()

# The timed section prepares all three frames, so the rate counts the rows of all three;
# dividing only the train rows by the total time under-reported the throughput.
prep_rows = train.shape[0] + test.shape[0] + val.shape[0]
print(f"prep rate {prep_rows/(prep_end - prep_start)} rows/second")


In [None]:
#now perform target embedding lookup
# Every model in `model_list` is left-merged onto each of the three splits in turn;
# the train shape is printed after each merge to show the columns being added.
start = timeit.default_timer()
print(train.shape)
for m in model_list:
    train = m.transform(train)
    test = m.transform(test)
    val = m.transform(val)
    print(train.shape)

end = timeit.default_timer()
print(f"merge took {end - start} seconds")

# `feature_engineered_df` is never assigned in this notebook (its assignment is commented out),
# so printing its columns raised NameError; the merged train frame is what this cell produces.
print(train.columns)

In [None]:
# List every column of the merged train frame, one name per line, ahead of the xgboost modelling.
for column_name in train.columns:
    print(column_name)

In [None]:
# Build the feature list `fcols` from two positional slices of the train columns:
#   1. "engaging_user_following_count" through "engaging_user_rate" (inclusive), and
#   2. everything from "TE_engaging_user_id_reply_timestamp_indicator_x" to the last column.
cols = train.columns
idx_start = cols.get_loc("engaging_user_following_count")
idx_end = cols.get_loc("engaging_user_rate")
idx_final = cols.get_loc("TE_engaging_user_id_reply_timestamp_indicator_x")

fcols = cols[idx_start:idx_end + 1].to_list() + cols[idx_final:].to_list()

#fcols.remove('engaging_user_language')
#fcols.remove("tweet_datetime")
print(fcols)


In [None]:
# I am going to fit an XGBoost model in this stage.  In the real inference case we will load a serialized model from disk.
# *NOTE* disregard the scores... they are only to evaluate whether optimization contaminates the features and are not relevant to the whole model.
#
# One binary classifier is fitted per response column: trained on `train`, early-stopped on
# `val`, then timed and scored (average precision) on `test`.
response_columns = ['retweet_timestamp_indicator', 'reply_timestamp_indicator',
       'retweet_with_comment_timestamp_indicator', 'like_timestamp_indicator']
from xgboost import XGBClassifier
import timeit
import numpy as np
from sklearn.metrics import average_precision_score, log_loss
model_dict = {}
X = train.loc[:, fcols]
Xv = val.loc[:, fcols]
Xt = test.loc[:, fcols]
for response in response_columns:
    xgb = XGBClassifier(max_depth=6, n_estimators=250, learning_rate = 0.1, n_jobs=8, num_parallel_tree  = 1, 
                        tree_method='hist', subsample = 0.8, reg_alpha = 0.1, reg_lambda = 0.01, colsample_bytree=0.7)

    y = train[response]

    Xtrain = X
    ytrain = y

    # Xtest must come from the same split as ytest; it used to be bound to the validation
    # features (Xv) while ytest held the test labels.
    Xtest = Xt
    ytest = test[response]

    # NOTE(review): passing eval_metric / early_stopping_rounds to fit() depends on the installed
    # xgboost version - confirm against the version used to run this notebook.
    xgb.fit(Xtrain, ytrain, eval_metric="aucpr", eval_set=[(Xv, val[response])], early_stopping_rounds=10)
    model_dict[response] = xgb

    inf_start = timeit.default_timer()
    pred = xgb.predict_proba(Xtest)
    inf_end = timeit.default_timer()
    print(f"Inference rate on {response} is {len(pred)/(inf_end - inf_start)} samples/sec")
    print(f"AP score of {response} is {average_precision_score(ytest, pred[:, 1])}")
    

In [None]:
# Three-class target built from the train frame (the same rows as X):
#   0 = neither, 1 = retweet, 2 = retweet with comment (assigned last, so it overrides 1).
# The labels are read from `train`; the previously referenced `feature_engineered_df` is never
# defined in this notebook, and the retweet column name was misspelled ("timestampe").
y_m = np.zeros(shape=X.shape[0])
retweet_mask = (train['retweet_timestamp_indicator'] == 1).to_numpy()
y_m[retweet_mask] = 1
retweetwc_mask = (train['retweet_with_comment_timestamp_indicator'] == 1).to_numpy()
y_m[retweetwc_mask] = 2

# NOTE(review): `parallel_trees` does not match the `num_parallel_tree` keyword used for the
# per-response classifiers earlier in this notebook - confirm which name is intended.
xgb = XGBClassifier(num_class = 3, max_depth=3, n_estimators=250, learning_rate = 0.1, n_jobs=8, objective = 'multi:softmax', tree_method='hist', parallel_trees=10)
# Random row split: about 20% of the rows (mask True) are used for fitting, the rest for evaluation.
# NOTE(review): no random seed is set, so the split differs between runs.
mask = np.random.choice([True, False], size=X.shape[0], p=[0.2, 0.8])

Xtrain = X.iloc[mask, :]
ytrain = y_m[mask]

Xtest = X.iloc[~mask, :]
ytest = y_m[~mask]

xgb.fit(Xtrain, ytrain.reshape(-1,1))

In [None]:
# Predict on the evaluation rows and show which distinct values were predicted.
pred = xgb.predict(Xtest)
print(np.unique(pred))
# print(f"{average_precision_score(ytest, pred)}")

In [None]:
# Collapse the multi-class prediction to a binary "predicted class 1" flag and score it against
# the retweet indicator on the evaluation rows (~mask).
# Fixes: `np.int` no longer exists in current NumPy (the builtin `int` is used instead), and the
# labels are read from `train` with the correctly spelled column name, because
# `feature_engineered_df` is never defined in this notebook.
y_f = pred.astype(int) == 1
y_f = y_f.astype(int)
print(np.unique(y_f))
average_precision_score(train['retweet_timestamp_indicator'].to_numpy()[~mask], y_f)