# Training a classifier for weights of author vision components

The features are the distance of the author to any tweet in the conversation
indicated by the following structures:
- subtree to viewed tweet from a tweet the author wrote
- root closeness of viewed tweet
- time delta to viewed tweet from tweets the author wrote

#### Loading the data from the pickled version
1. importing libraries
2. checking gpu support


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from keras import backend as K
import pickle5 as pickle

is_cuda_gpu_available = tf.test.is_gpu_available(cuda_only=True)
print("cuda gpu is available: {}".format(is_cuda_gpu_available))
with open("data/vision_graph_data.pkl", 'rb') as f:
    df = pickle.load(f)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
cuda gpu is available: True


2022-08-03 14:10:22.175201: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-08-03 14:10:22.216131: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-03 14:10:22.247830: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-03 14:10:22.248180: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

### Delete rows that are neither twitter or reddit data

In [2]:
def is_not_reddit_or_twitter(text):
    if text == "reddit" or text == "twitter":
        return False
    else:
        return True

# remove non-features
current = df.current
beam_node = df.beam_node
platform = df.platform
platform.value_counts()

# filtering data that is not twitter or reddit
to_delete_rows = platform.apply(lambda x: is_not_reddit_or_twitter(x))
df = df.drop(df[to_delete_rows].index)
df.platform.value_counts()

twitter    2904740
reddit      514793
Name: platform, dtype: int64

#### Equalizing the sample sizes
- chose random samples from distinct conversation_ids
- increase sample size until data size is similar between reddit and twitter

In [3]:
from random import sample

# limit the twitter data and the reddit data to the same amount and prevent gpu problem
df_conversations = set(df[df["platform"]=="twitter"].conversation_id.tolist())
reddit_data_count = df.loc[df.platform == "reddit", 'platform'].count()
twitter_data_count = df.loc[df.platform == "twitter", 'platform'].count()
assert twitter_data_count > reddit_data_count

current_count = 0
n = 1
while current_count < reddit_data_count:
    chosen_conversation_ids = sample(df_conversations, n)
    df_candidate = df[df["conversation_id"].isin(chosen_conversation_ids)]
    n = n + 1
    current_count = df_candidate.shape[0]

print("chosen {} conversations and gotten {} from twitter compared to {} from reddit".format(n, current_count, reddit_data_count))
not_chosen_conversation_ids  = set(df_conversations) - set(chosen_conversation_ids)
df = df[~df["conversation_id"].isin(not_chosen_conversation_ids)]
df.platform.value_counts()

chosen 525 conversations and gotten 541793 from twitter compared to 514793 from reddit


twitter    541793
reddit     514793
Name: platform, dtype: int64

In [4]:
# df = df[df["root_distance_0"] == 0]
# analyze the distribution of reached targets for the sample
print("reddit:")
print(df[df["platform"]=="reddit"].y.value_counts())
print("twitter:")
print(df[df["platform"]=="twitter"].y.value_counts())
# this should be higher for reddit as the unique author / posting ratio is lower for reddit





reddit:
0    453046
1     61747
Name: y, dtype: int64
twitter:
0    517251
1     24542
Name: y, dtype: int64


### Computing a nn model
1. seperate features
2. train models for reddit and twitter
3. inspect models for reddit and twitter
4. predict the likelihood based on the author has seen a posting
5. aggregate likelihoods in order to compute author vision measure

In [5]:
# some utility functions to take the columns that are used as features
non_feature_list = ["current", "beam_node", "conversation_id", "platform", "has_followed_path", "has_follow_path"]
def take_features(df):
    df = df.drop(non_feature_list, axis=1)
    return df

def take_non_features(df):
    column_names = df.columns.values
    feature_list = [column_name for column_name in column_names if column_name not in non_feature_list]
    df = df.drop(feature_list, axis=1)
    return df

In [6]:

# training functions

def train_model(df):
    # dropping non-reddit non-twitter data
    df = take_features(df)

    # normalize timedelta (put between 0 and 1)
    dt = df.timedelta
    timedelta_normalized = (dt - dt.min()) / (dt.max() - dt.min())
    df.timedelta = timedelta_normalized

    # selecting train and test datasets
    train, test = train_test_split(df, test_size=0.2)
    train.describe()

    # train the model
    y = train.y
    x = train.drop("y", axis=1)
    print(x.shape)
    print(y.shape)

    # import tensorflow and train the model

    print(tf.__version__)
    input_shape = (x.shape[1],)
    model = Sequential([
        Dense(1, activation='sigmoid', input_shape=input_shape)
    ])

    # stochastic gradient descend as a classifier seem appropriate
    model.compile(
        optimizer='sgd',
        loss='binary_crossentropy',
        metrics=['accuracy', 'mae']
    )

    # model.fit(x, y, epochs=3)
    model.fit(x, y)
    # evaluate the model on the test set
    test_y = test.y
    test_x = test.drop("y", axis=1)

    loss, accuracy, mae = model.evaluate(test_x, test_y)
    print("the accuracy on the training set is {} and the mae is {}".format(accuracy, mae))

    return x, y, test_x, test_y, model


def inspect_model(x, y, test_x, test_y, model):
    # have a look at some prediction
    reply_distance_2 = test_x[test_x["reply_distance_2"] == 1]
    first_rows = reply_distance_2.head(2)
    print(first_rows)
    model.predict(first_rows)

    # let's have a look at the weights and biases of the hidden layer
    first_layer_weights = model.layers[0].get_weights()[0]
    first_layer_biases = model.layers[0].get_weights()[1]
    # print(first_layer_weights)
    column_names = x.columns.values
    for i in range(len(column_names[:5])):
        print("feature {} has weight {} \n".format(column_names[i], first_layer_weights[i]))


In [7]:
# have a look for reddit
tw_df = df[df["platform"] == "twitter"]
tw_x, tw_y, tw_test_x, tw_test_y, tw_model = train_model(tw_df)

(433434, 81)
(433434,)
2.6.0


2022-08-03 14:11:08.860038: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-03 14:11:08.860390: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-03 14:11:08.860652: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-03 14:11:08.861093: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-03 14:11:08.861375: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

the accuracy on the training set is 0.9541524052619934 and the mae is 0.0612359456717968


In [8]:
rd_df = df[df["platform"] == "reddit"]
rd_x, rd_y, rd_test_x, rd_test_y, rd_model = train_model(rd_df)

(411834, 81)
(411834,)
2.6.0


2022-08-03 14:11:34.445550: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 266868432 exceeds 10% of free system memory.
2022-08-03 14:11:34.654839: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 266868432 exceeds 10% of free system memory.


the accuracy on the training set is 0.8810886144638062 and the mae is 0.19073688983917236


In [9]:
inspect_model(tw_x, tw_y, tw_test_x, tw_test_y, tw_model)
# inspect_model(rd_x, rd_y, rd_test_x, rd_test_y, rd_model)

         timedelta  root_distance_0  reply_distance_2  reply_distance_3  \
2046952   0.000045                0               1.0               0.0   
2047122   0.000352                0               1.0               0.0   

         reply_distance_4  root_distance_1  root_distance_2  root_distance_3  \
2046952               0.0              0.0              0.0              0.0   
2047122               0.0              0.0              0.0              0.0   

         root_distance_4  reply_distance_5  ...  reply_distance_40  \
2046952              0.0               0.0  ...                0.0   
2047122              0.0               0.0  ...                0.0   

         root_distance_32  root_distance_33  root_distance_34  \
2046952               0.0               0.0               0.0   
2047122               0.0               0.0               0.0   

         root_distance_35  root_distance_36  root_distance_37  \
2046952               0.0               0.0               0.0

In [10]:
tw_non_features = take_non_features(tw_df)
rd_non_features = take_non_features(rd_df)
tw_non_features.head(1)

Unnamed: 0,current,beam_node,has_followed_path,has_follow_path,platform,conversation_id
105,1527614293239382016,1527522295354368005,0,0,twitter,1527522295354368005


In [11]:
tw_features_y = take_features(tw_df)
tw_features = tw_features_y.drop("y", axis=1)
rd_features_y = take_features(rd_df)
rd_features = rd_features_y.drop("y", axis=1)
rd_predictions = rd_model.predict(rd_features)
tw_predictions = tw_model.predict(tw_features)

tw_vision = tw_non_features.assign(predictions=tw_predictions)
rd_vision = rd_non_features.assign(predictions=rd_predictions)
#tw_vision


2022-08-03 14:11:56.525332: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 333585864 exceeds 10% of free system memory.


In [12]:
combined_vision = tw_vision.append(rd_vision)
combined_vision


not_needed_list = ["beam_node", "has_followed_path", "has_follow_path"]
combined_vision = combined_vision.drop(not_needed_list, axis=1)
combined_vision

Unnamed: 0,current,platform,conversation_id,predictions
105,1527614293239382016,twitter,1527522295354368005,0.000000e+00
106,1527614293239382016,twitter,1527522295354368005,5.923356e-09
107,1527614293239382016,twitter,1527522295354368005,4.858867e-33
108,1527614293239382016,twitter,1527522295354368005,5.217902e-38
109,1527614293239382016,twitter,1527522295354368005,0.000000e+00
...,...,...,...,...
3418060,68851107,reddit,12994552,0.000000e+00
3418061,68851107,reddit,12994552,0.000000e+00
3418062,68851107,reddit,12994552,0.000000e+00
3418063,68851107,reddit,12994552,0.000000e+00


In [67]:
gpm = combined_vision.groupby(["platform", "conversation_id", "current"]).mean()
gpm

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,predictions
platform,conversation_id,current,Unnamed: 3_level_1
reddit,174503,4541493,4.581887e-25
reddit,174503,5662402,0.000000e+00
reddit,174503,8711684,1.636770e-11
reddit,174503,10143803,0.000000e+00
reddit,174503,11735997,3.229613e-06
...,...,...,...
twitter,1552016187491500033,1552024890798182402,1.405391e-31
twitter,1552016187491500033,1552050615181787142,0.000000e+00
twitter,1552016187491500033,1552050898234482688,7.815641e-08
twitter,1552016187491500033,1552051394617786368,2.056405e-11


In [68]:
gpm_per_conversation = gpm.groupby(by=["platform", "conversation_id"]).mean()
gpm_per_conversation

Unnamed: 0_level_0,Unnamed: 1_level_0,predictions
platform,conversation_id,Unnamed: 2_level_1
reddit,174503,6.429478e-05
reddit,203904,2.010869e-04
reddit,209098,3.360211e-09
reddit,313699,6.280348e-09
reddit,471878,1.981144e-02
...,...,...
twitter,1551870406843420675,4.214541e-04
twitter,1551886927137218561,1.100554e-04
twitter,1551945921247731715,1.375426e-14
twitter,1551947945334870016,3.450395e-05


In [69]:
gpm_per_platform = gpm.groupby(by=["platform"]).mean()
gpm_per_platform


Unnamed: 0_level_0,predictions
platform,Unnamed: 1_level_1
reddit,0.00174
twitter,0.001315


In [70]:
%run author_vision_data_analysis.ipynb
gpm_per_platform = repetition_probability.join(gpm_per_platform)
gpm_per_platform

Unnamed: 0_level_0,repetition_probs,predictions
platform,Unnamed: 1_level_1,Unnamed: 2_level_1
delab,0.219789,
reddit,0.229424,0.00174
twitter,0.096224,0.001315


In [76]:
probabilities = gpm_per_platform.drop("delab")
probabilities.corr()


Unnamed: 0,repetition_probs,predictions
repetition_probs,1.0,1.0
predictions,1.0,1.0


### Interpretation of the combined results
- This means that the neural network computes a linear function of the repetition probabilities based on the computation of the y functions
- The probabilities are very low for both reddit and twitter but in a comparable area
