# Training a classifier for weights of author vision components

The features are the distance of the author to any tweet in the conversation
indicated by the following structures:
- subtree to viewed tweet from a tweet the author wrote
- root closeness of viewed tweet
- time delta to viewed tweet from tweets the author wrote

#### Loading the data from the pickled version
1. importing libraries
2. checking gpu support


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from keras import backend as K
import pickle5 as pickle

is_cuda_gpu_available = tf.test.is_gpu_available(cuda_only=True)
print("cuda gpu is available: {}".format(is_cuda_gpu_available))
with open("data/vision_graph_data.pkl", 'rb') as f:
    df = pickle.load(f)

cuda gpu is available: True


2022-08-03 11:48:24.885057: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-03 11:48:24.885415: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-03 11:48:24.885647: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-03 11:48:24.885948: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-03 11:48:24.886182: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

In [23]:
def is_not_reddit_or_twitter(text):
    if text == "reddit" or text == "twitter":
        return False
    else:
        return True

# remove non-features
current = df.current
beam_node = df.beam_node
platform = df.platform
platform.value_counts()

# filtering data that is not twitter or reddit
to_delete_rows = platform.apply(lambda x: is_not_reddit_or_twitter(x))
df = df.drop(df[to_delete_rows].index)
df.describe()

Unnamed: 0,timedelta,root_distance_0,y,current,beam_node,has_followed_path,has_follow_path,conversation_id,reply_distance_2,reply_distance_3,...,reply_distance_40,root_distance_32,root_distance_33,root_distance_34,root_distance_35,root_distance_36,root_distance_37,root_distance_38,root_distance_39,root_distance_40
count,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,...,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0
mean,42742.85,0.03969987,0.05875656,1.296209e+18,1.296008e+18,0.0,0.0,1.295209e+18,0.006031525,0.003544929,...,8.773128e-07,5.848752e-06,5.263877e-06,4.679001e-06,4.094126e-06,2.047063e-06,1.754626e-06,1.462188e-06,1.16975e-06,8.773128e-07
std,1113152.0,0.1952532,0.2351685,5.464027e+17,5.464532e+17,0.0,0.0,5.464034e+17,0.07742834,0.05943369,...,0.0009366495,0.002418413,0.002294308,0.002163095,0.002023391,0.001430755,0.001324622,0.001209209,0.00108155,0.0009366495
min,1.0,0.0,0.0,388.0,388.0,0.0,0.0,174503.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2417.0,0.0,0.0,1.493867e+18,1.493522e+18,0.0,0.0,1.490282e+18,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,9486.0,0.0,0.0,1.53178e+18,1.531699e+18,0.0,0.0,1.531571e+18,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,32393.0,0.0,0.0,1.539142e+18,1.539004e+18,0.0,0.0,1.53889e+18,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,214871500.0,1.0,1.0,1.552563e+18,1.552563e+18,0.0,0.0,1.552559e+18,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [24]:
from random import sample

# limit the twitter data and the reddit data to the same amount and prevent gpu problem
df_conversations = df[df["platform"]=="twitter"].conversation_id.tolist()
reddit_data_count = df.loc[df.platform == "reddit", 'platform'].count()
twitter_data_count = df.loc[df.platform == "twitter", 'platform'].count()
assert twitter_data_count > reddit_data_count

current_count = 0
n = 1
while current_count < reddit_data_count:
    chosen_conversation_ids = sample(df_conversations, n)
    df_candidate = df[df["conversation_id"].isin(chosen_conversation_ids)]
    n = n + 1
    current_count = df_candidate.shape[0]

print("chosen {} conversations and gotten {} from twitter compared to {} from reddit".format(n, current_count, reddit_data_count))
not_chosen_conversation_ids  = set(df_conversations) - set(chosen_conversation_ids)
df = df[~df["conversation_id"].isin(not_chosen_conversation_ids)]
df.describe()

chosen 242 conversations and gotten 522223 from twitter compared to 514793 from reddit


Unnamed: 0,timedelta,root_distance_0,y,current,beam_node,has_followed_path,has_follow_path,conversation_id,reply_distance_2,reply_distance_3,...,reply_distance_40,root_distance_32,root_distance_33,root_distance_34,root_distance_35,root_distance_36,root_distance_37,root_distance_38,root_distance_39,root_distance_40
count,1037016.0,1037016.0,1037016.0,1037016.0,1037016.0,1037016.0,1037016.0,1037016.0,1037016.0,1037016.0,...,1037016.0,1037016.0,1037016.0,1037016.0,1037016.0,1037016.0,1037016.0,1037016.0,1037016.0,1037016.0
mean,69951.13,0.037065,0.07632669,7.692708e+17,7.691746e+17,0.0,0.0,7.670094e+17,0.01332766,0.007574618,...,2.892916e-06,1.060736e-05,9.643053e-06,8.678747e-06,7.714442e-06,6.750137e-06,5.785832e-06,4.821526e-06,3.857221e-06,2.892916e-06
std,1574565.0,0.1889212,0.2655202,7.639554e+17,7.638775e+17,0.0,0.0,7.624943e+17,0.1146737,0.08670208,...,0.001700856,0.003256878,0.003105313,0.00294596,0.002777479,0.002598095,0.00240537,0.002195793,0.001963978,0.001700856
min,1.0,0.0,0.0,388.0,388.0,0.0,0.0,174503.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3532.0,0.0,0.0,50044660.0,50301670.0,0.0,0.0,51128140.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,13002.0,0.0,0.0,1.455657e+18,1.455632e+18,0.0,0.0,9.625765e+17,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,37992.0,0.0,0.0,1.533731e+18,1.533575e+18,0.0,0.0,1.533426e+18,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,150516400.0,1.0,1.0,1.552559e+18,1.552544e+18,0.0,0.0,1.552169e+18,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:

# some utility functions to take the columns that are used as features
non_feature_list = ["current", "beam_node", "conversation_id", "platform", "has_followed_path", "has_follow_path"]
def take_features(df):
    df = df.drop(non_feature_list, axis=1)
    return df

def take_non_features(df):
    column_names = df.columns.values
    feature_list = [column_name for column_name in column_names if column_name not in non_feature_list]
    df = df.drop(feature_list, axis=1)
    return df

In [26]:

# training functions

def train_model(df):
    # dropping non-reddit non-twitter data
    df = take_features(df)
    # df = df[df["root_distance_0"] == 0]

    # normalize timedelta (put between 0 and 1)
    dt = df.timedelta
    timedelta_normalized = (dt - dt.min()) / (dt.max() - dt.min())
    df.timedelta = timedelta_normalized

    # selecting train and test datasets
    train, test = train_test_split(df, test_size=0.2)
    train.describe()

    # train the model
    y = train.y
    x = train.drop("y", axis=1)
    print(x.shape)
    print(y.shape)

    # import tensorflow and train the model

    print(tf.__version__)
    input_shape = (x.shape[1],)
    model = Sequential([
        Dense(1, activation='sigmoid', input_shape=input_shape)
    ])

    # stochastic gradient descend as a classifier seem appropriate
    model.compile(
        optimizer='sgd',
        loss='binary_crossentropy',
        metrics=['accuracy', 'mae']
    )

    # model.fit(x, y, epochs=3)
    model.fit(x, y)
    # evaluate the model on the test set
    test_y = test.y
    test_x = test.drop("y", axis=1)

    loss, accuracy, mae = model.evaluate(test_x, test_y)
    print("the accuracy on the training set is {} and the mae is {}".format(accuracy, mae))

    return x, y, test_x, test_y, model


def inspect_model(x, y, test_x, test_y, model):
    # have a look at some prediction
    # reply_distance_2 = test_x[test_x["reply_distance_2"] == 1]
    # first_rows = reply_distance_2.head(2)
    # print(first_rows)
    # model.predict(first_rows)

    # let's have a look at the weights and biases of the hidden layer
    first_layer_weights = model.layers[0].get_weights()[0]
    first_layer_biases = model.layers[0].get_weights()[1]
    # print(first_layer_weights)
    column_names = x.columns.values
    for i in range(len(column_names[:5])):
        print("feature {} has weight {} \n".format(column_names[i], first_layer_weights[i]))


In [27]:
# have a look for reddit
tw_df = df[df["platform"] == "twitter"]
tw_x, tw_y, tw_test_x, tw_test_y, tw_model = train_model(tw_df)

(417778, 81)
(417778,)
2.6.0
the accuracy on the training set is 0.9656087160110474 and the mae is 0.05326465517282486


In [28]:
rd_df = df[df["platform"] == "reddit"]
rd_x, rd_y, rd_test_x, rd_test_y, rd_model = train_model(rd_df)

(411834, 81)
(411834,)
2.6.0
the accuracy on the training set is 0.8809525966644287 and the mae is 0.18972541391849518


In [29]:
# inspect_model(tw_x, tw_y, tw_test_x, tw_test_y, tw_model)
# inspect_model(rd_x, rd_y, rd_test_x, rd_test_y, rd_model)

In [30]:
# rd_sample_size = rd_df.shape[0]
# tw_df = tw_df.sample(n=rd_sample_size)


tw_non_features = take_non_features(tw_df)
rd_non_features = take_non_features(rd_df)
tw_non_features

Unnamed: 0,current,beam_node,has_followed_path,has_follow_path,platform,conversation_id
105,1527614293239382016,1527522295354368005,0,0,twitter,1527522295354368005
106,1527614293239382016,1527613879110574081,0,0,twitter,1527522295354368005
107,1527614293239382016,1527612414765801473,0,0,twitter,1527522295354368005
108,1527614293239382016,1527612113010802689,0,0,twitter,1527522295354368005
109,1527614293239382016,1527573751172415488,0,0,twitter,1527522295354368005
...,...,...,...,...,...,...
3419612,1543869722831331329,1543865562958155776,0,0,twitter,1543814358718992388
3419613,1543869722831331329,1543858541068521478,0,0,twitter,1543814358718992388
3419614,1543869722831331329,1543863860259536898,0,0,twitter,1543814358718992388
3419615,1543869722831331329,1543859236878577666,0,0,twitter,1543814358718992388


In [31]:

from util.abusing_lists import batch

tw_features_y = take_features(tw_df)
tw_features = tw_features_y.drop("y", axis=1)
rd_features_y = take_features(rd_df)
rd_features = rd_features_y.drop("y", axis=1)
rd_predictions = rd_model.predict(rd_features)
tw_predictions = tw_model.predict(tw_features)

tw_vision = tw_non_features.assign(predictions=tw_predictions)
rd_vision = rd_non_features.assign(predictions=rd_predictions)
#tw_vision


In [32]:
combined_vision = tw_vision.append(rd_vision)
combined_vision


not_needed_list = ["beam_node", "has_followed_path", "has_follow_path"]
combined_vision = combined_vision.drop(not_needed_list, axis=1)
combined_vision

Unnamed: 0,current,platform,conversation_id,predictions
105,1527614293239382016,twitter,1527522295354368005,1.0
106,1527614293239382016,twitter,1527522295354368005,1.0
107,1527614293239382016,twitter,1527522295354368005,1.0
108,1527614293239382016,twitter,1527522295354368005,1.0
109,1527614293239382016,twitter,1527522295354368005,1.0
...,...,...,...,...
3418060,68851107,reddit,12994552,1.0
3418061,68851107,reddit,12994552,1.0
3418062,68851107,reddit,12994552,1.0
3418063,68851107,reddit,12994552,1.0


In [33]:
gpm = combined_vision.groupby(["platform", "conversation_id", "current"]).mean()
gpm

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,predictions
platform,conversation_id,current,Unnamed: 3_level_1
reddit,174503,4541493,0.987631
reddit,174503,5662402,1.000000
reddit,174503,8711684,0.926990
reddit,174503,10143803,1.000000
reddit,174503,11735997,0.979709
...,...,...,...
twitter,1552168511165894657,1552397547590795264,1.000000
twitter,1552168511165894657,1552413538924249090,1.000000
twitter,1552168511165894657,1552421341940387844,1.000000
twitter,1552168511165894657,1552544216697430017,1.000000


In [34]:
gpm_per_conversation = gpm.groupby(by=["platform", "conversation_id"]).mean()
gpm_per_conversation

Unnamed: 0_level_0,Unnamed: 1_level_0,predictions
platform,conversation_id,Unnamed: 2_level_1
reddit,174503,0.984993
reddit,203904,0.977508
reddit,209098,0.990180
reddit,313699,0.973880
reddit,471878,0.957565
...,...,...
twitter,1544585020161101824,0.986020
twitter,1545721821647060993,0.998108
twitter,1551481579784085504,0.999732
twitter,1552024342661251072,0.995596


In [35]:
gpm_per_platform = gpm.groupby(by=["platform"]).mean()
gpm_per_platform



Unnamed: 0_level_0,predictions
platform,Unnamed: 1_level_1
reddit,0.968935
twitter,0.990445


In [36]:
combined_vision = combined_vision.drop(["predictions"], axis=1)


In [37]:
author_count_columns = ["current", "conversation_id", "platform", "y"]
author_df = df[author_count_columns]
author_df

Unnamed: 0,current,conversation_id,platform,y
105,1527614293239382016,1527522295354368005,twitter,1
106,1527614293239382016,1527522295354368005,twitter,0
107,1527614293239382016,1527522295354368005,twitter,0
108,1527614293239382016,1527522295354368005,twitter,0
109,1527614293239382016,1527522295354368005,twitter,0
...,...,...,...,...
3419612,1543869722831331329,1543814358718992388,twitter,0
3419613,1543869722831331329,1543814358718992388,twitter,1
3419614,1543869722831331329,1543814358718992388,twitter,0
3419615,1543869722831331329,1543814358718992388,twitter,0


In [38]:
author_df = author_df.groupby(["platform", "conversation_id", "current"]).sum()
author_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,y
platform,conversation_id,current,Unnamed: 3_level_1
reddit,174503,4541493,2
reddit,174503,5662402,5
reddit,174503,8711684,4
reddit,174503,10143803,2
reddit,174503,11735997,11
...,...,...,...
twitter,1552168511165894657,1552397547590795264,1
twitter,1552168511165894657,1552413538924249090,1
twitter,1552168511165894657,1552421341940387844,1
twitter,1552168511165894657,1552544216697430017,1


In [39]:
author_df = author_df.groupby(["platform", "conversation_id"]).mean()
author_df


Unnamed: 0_level_0,Unnamed: 1_level_0,y
platform,conversation_id,Unnamed: 2_level_1
reddit,174503,4.244444
reddit,203904,2.810811
reddit,209098,1.875000
reddit,313699,1.230769
reddit,471878,3.285714
...,...,...
twitter,1544585020161101824,4.950820
twitter,1545721821647060993,1.217949
twitter,1551481579784085504,2.360000
twitter,1552024342661251072,4.300000


In [40]:
distinct_view_columns = ["current", "conversation_id", "platform"]
distinct_view_df = df[distinct_view_columns]
distinct_view_df

Unnamed: 0,current,conversation_id,platform
105,1527614293239382016,1527522295354368005,twitter
106,1527614293239382016,1527522295354368005,twitter
107,1527614293239382016,1527522295354368005,twitter
108,1527614293239382016,1527522295354368005,twitter
109,1527614293239382016,1527522295354368005,twitter
...,...,...,...
3419612,1543869722831331329,1543814358718992388,twitter
3419613,1543869722831331329,1543814358718992388,twitter
3419614,1543869722831331329,1543814358718992388,twitter
3419615,1543869722831331329,1543814358718992388,twitter


In [41]:
distinct_views = distinct_view_df.groupby(["current", "conversation_id", "platform"]).size().to_frame('size')
distinct_views = distinct_views.groupby(["platform", "conversation_id"]).mean()
distinct_views


Unnamed: 0_level_0,Unnamed: 1_level_0,size
platform,conversation_id,Unnamed: 2_level_1
reddit,174503,23.0
reddit,203904,19.0
reddit,209098,8.5
reddit,313699,7.0
reddit,471878,11.0
...,...,...
twitter,1544585020161101824,32.0
twitter,1545721821647060993,39.5
twitter,1551481579784085504,25.5
twitter,1552024342661251072,45.5


In [42]:
joined_author_stats = author_df.join(distinct_views)
joined_author_stats = joined_author_stats["y"]/joined_author_stats["size"]
joined_author_stats

platform  conversation_id    
reddit    174503                 0.184541
          203904                 0.147937
          209098                 0.220588
          313699                 0.175824
          471878                 0.298701
                                   ...   
twitter   1544585020161101824    0.154713
          1545721821647060993    0.030834
          1551481579784085504    0.092549
          1552024342661251072    0.094505
          1552168511165894657    0.044622
Length: 1301, dtype: float64

In [43]:
repetition_probability = joined_author_stats.groupby("platform").mean()
repetition_probability

platform
reddit     0.229424
twitter    0.046573
dtype: float64