# Training a classifier for weights of author vision components

The features are the distance of the author to any tweet in the conversation
indicated by the following structures:
- subtree to viewed tweet from a tweet the author wrote
- root closeness of viewed tweet
- time delta to viewed tweet from tweets the author wrote

#### Loading the data from the pickled version
1. importing libraries
2. checking gpu support


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from keras import backend as K
import pickle5 as pickle

is_cuda_gpu_available = tf.test.is_gpu_available(cuda_only=True)
print("cuda gpu is available: {}".format(is_cuda_gpu_available))
filename = "data/vision_graph_data_local_16_09_22.pkl"
# filename = "data/vision_graph_data.pkl"
with open(filename, 'rb') as f:
    df = pickle.load(f)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
cuda gpu is available: True


2022-08-16 15:36:28.470498: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-08-16 15:36:28.518277: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-16 15:36:28.565084: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-16 15:36:28.565355: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

In [2]:
#import utility functions
%run author_vision_util.ipynb
df = normalize_timedelta(df)
df.head(2)

Unnamed: 0,timedelta,root_distance_0,y,current,beam_node,has_followed_path,has_follow_path,beam_node_author,platform,conversation_id,...,root_distance_14,root_distance_15,root_distance_16,root_distance_17,root_distance_18,root_distance_19,root_distance_20,root_distance_21,root_distance_22,root_distance_23
0,2.7e-05,0,0,1524481865415110657,1524461544284712960,0,0,1496816657750179843,twitter,1524450079444279297,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2e-05,0,0,1524481865415110657,1524466876037935106,0,0,1239116711342768128,twitter,1524450079444279297,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Delete rows that are neither twitter or reddit data

In [3]:
# remove non-features
current = df.current
beam_node = df.beam_node
platform = df.platform
platform.value_counts()

# filtering data that is not twitter or reddit
to_delete_rows = platform.apply(lambda x: is_not_reddit_or_twitter(x))
df = df.drop(df[to_delete_rows].index)
df.platform.value_counts()

twitter    819079
reddit      24497
Name: platform, dtype: int64

#### Equalizing the sample sizes
- chose random samples from distinct conversation_ids
- increase sample size until data size is similar between reddit and twitter

In [4]:
# limit the twitter data and the reddit data to the same amount and prevent gpu problem
df = equalize_samples(df)
df.platform.value_counts()

chosen 17 conversations and gotten 25521 from twitter compared to 24497 from reddit


twitter    25521
reddit     24497
Name: platform, dtype: int64

In [5]:
# df = df[df["root_distance_0"] == 0]
# analyze the distribution of reached targets for the sample
print("reddit:")
print(df[df["platform"]=="reddit"].y.value_counts())
print("twitter:")
print(df[df["platform"]=="twitter"].y.value_counts())
# this should be higher for reddit as the unique author / posting ratio is lower for reddit

reddit:
0    21624
1     2873
Name: y, dtype: int64
twitter:
0    24170
1     1351
Name: y, dtype: int64


### Computing a nn model
1. seperate features
2. train models for reddit and twitter
3. inspect models for reddit and twitter
4. predict the likelihood based on the author has seen a posting
5. aggregate likelihoods in order to compute author vision measure


In [6]:

# training functions

def train_model(df):
    # dropping non-reddit non-twitter data
    df = take_features(df)


    # selecting train and test datasets
    train, test = train_test_split(df, test_size=0.2)
    train.describe()

    # train the model
    y = train.y
    x = train.drop("y", axis=1)
    print(x.shape)
    print(y.shape)

    # import tensorflow and train the model

    print(tf.__version__)
    input_shape = (x.shape[1],)
    model = Sequential([
        Dense(1, activation='sigmoid', input_shape=input_shape)
    ])

    # stochastic gradient descend as a classifier seem appropriate
    model.compile(
        optimizer='sgd',
        loss='binary_crossentropy',
        metrics=['accuracy', 'mae']
    )

    # model.fit(x, y, epochs=3)
    model.fit(x, y)
    # evaluate the model on the test set
    test_y = test.y
    test_x = test.drop("y", axis=1)

    loss, accuracy, mae = model.evaluate(test_x, test_y)
    print("the accuracy on the training set is {} and the mae is {}".format(accuracy, mae))

    return x, y, test_x, test_y, model


def inspect_model(x, y, test_x, test_y, model):
    # have a look at some prediction
    reply_distance_2 = test_x[test_x["reply_distance_2"] == 1]
    first_rows = reply_distance_2.head(2)
    print(first_rows)
    model.predict(first_rows)

    # let's have a look at the weights and biases of the hidden layer
    first_layer_weights = model.layers[0].get_weights()[0]
    first_layer_biases = model.layers[0].get_weights()[1]
    # print(first_layer_weights)
    column_names = x.columns.values
    for i in range(len(column_names[:5])):
        print("feature {} has weight {} \n".format(column_names[i], first_layer_weights[i]))


In [7]:
# have a look for reddit
tw_df = df[df["platform"] == "twitter"]
tw_x, tw_y, tw_test_x, tw_test_y, tw_model = train_model(tw_df)

(20416, 47)
(20416,)
2.6.0


2022-08-16 15:36:31.025505: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-16 15:36:31.025810: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-16 15:36:31.026012: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-16 15:36:31.026567: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-16 15:36:31.026770: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

the accuracy on the training set is 0.9492654204368591 and the mae is 0.20566444098949432


In [8]:
rd_df = df[df["platform"] == "reddit"]
rd_x, rd_y, rd_test_x, rd_test_y, rd_model = train_model(rd_df)

(19597, 47)
(19597,)
2.6.0
the accuracy on the training set is 0.881428599357605 and the mae is 0.2724708318710327


In [9]:
inspect_model(tw_x, tw_y, tw_test_x, tw_test_y, tw_model)
# inspect_model(rd_x, rd_y, rd_test_x, rd_test_y, rd_model)

        timedelta  root_distance_0  reply_distance_2  reply_distance_3  \
547230   0.000524                1               1.0               0.0   
413848   0.000117                1               1.0               0.0   

        reply_distance_4  root_distance_1  root_distance_2  root_distance_3  \
547230               0.0              0.0              0.0              0.0   
413848               0.0              0.0              0.0              0.0   

        root_distance_4  reply_distance_5  ...  root_distance_14  \
547230              0.0               0.0  ...               0.0   
413848              0.0               0.0  ...               0.0   

        root_distance_15  root_distance_16  root_distance_17  \
547230               0.0               0.0               0.0   
413848               0.0               0.0               0.0   

        root_distance_18  root_distance_19  root_distance_20  \
547230               0.0               0.0               0.0   
413848       

In [10]:
tw_non_features = take_non_features(tw_df)
rd_non_features = take_non_features(rd_df)
tw_non_features.head(1)

Unnamed: 0,current,beam_node,has_followed_path,has_follow_path,beam_node_author,platform,conversation_id
100204,1542906475164598273,1542207942316933121,0,0,1518688515076370435,twitter,1541157911363289090


In [11]:
tw_features_y = take_features(tw_df)
tw_features = tw_features_y.drop("y", axis=1)
rd_features_y = take_features(rd_df)
rd_features = rd_features_y.drop("y", axis=1)
rd_predictions = rd_model.predict(rd_features)
tw_predictions = tw_model.predict(tw_features)

tw_vision = tw_non_features.assign(predictions=tw_predictions)
rd_vision = rd_non_features.assign(predictions=rd_predictions)
#tw_vision


In [33]:
combined_vision = tw_vision.append(rd_vision)
combined_vision


not_needed_list = ["beam_node", "has_followed_path", "has_follow_path"]
combined_vision = combined_vision.drop(not_needed_list, axis=1)
combined_vision

Unnamed: 0,current,beam_node_author,platform,conversation_id,predictions
100204,1542906475164598273,1518688515076370435,twitter,1541157911363289090,0.187176
100205,1542906475164598273,1518702744693055488,twitter,1541157911363289090,0.187176
100206,1542906475164598273,371943155,twitter,1541157911363289090,0.187173
100207,1542906475164598273,263778058,twitter,1541157911363289090,0.187172
100208,1542906475164598273,1497318967823802371,twitter,1541157911363289090,0.187164
...,...,...,...,...,...
764716,27029110,24575560,reddit,57248369,0.156690
764717,27029110,72588799,reddit,57248369,0.183148
764718,27029110,62092993,reddit,57248369,0.280018
764719,27029110,72588799,reddit,57248369,0.282723


In [34]:
combined_vision= combined_vision.drop("beam_node_author", axis=1)
gpm = combined_vision.groupby(["platform", "conversation_id", "current"]).mean()
gpm

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,predictions
platform,conversation_id,current,Unnamed: 3_level_1
reddit,661614,26871177,0.228854
reddit,661614,29352234,0.248579
reddit,661614,33390443,0.249401
reddit,661614,36457165,0.245291
reddit,661614,91649333,0.247346
...,...,...,...
twitter,1544119731850027008,1544219659033092096,0.157125
twitter,1552325920060542976,1552326922344054784,0.158403
twitter,1552325920060542976,1552374355992125442,0.149655
twitter,1552325920060542976,1552376400442695692,0.158186


In [35]:
gpm_per_conversation = gpm.groupby(by=["platform", "conversation_id"]).mean()
gpm_per_conversation.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,predictions
platform,conversation_id,Unnamed: 2_level_1
reddit,661614,0.243442
reddit,10955776,0.191684
reddit,15848916,0.242252
reddit,17286490,0.217674
reddit,19851732,0.194112


In [36]:
gpm_per_platform = gpm.groupby(by=["platform"]).mean()
gpm_per_platform


Unnamed: 0_level_0,predictions
platform,Unnamed: 1_level_1
reddit,0.19884
twitter,0.169384


In [40]:
%run author_vision_data_analysis.ipynb
probabilities = repetition_probability.join(gpm_per_platform)
#probabilities = gpm_per_platform.drop("delab")
probabilities

Unnamed: 0_level_0,repetition_probs,predictions
platform,Unnamed: 1_level_1,Unnamed: 2_level_1
delab,0.219789,
reddit,0.229424,0.19884
twitter,0.096224,0.169384


In [38]:
probabilities.corr()


Unnamed: 0,repetition_probs,predictions
repetition_probs,1.0,1.0
predictions,1.0,1.0


### Interpretation of the combined results
- This means that the neural network computes a linear function of the repetition probabilities based on the computation of the y functions
- The probabilities are very low for both reddit and twitter but in a comparable area
