# Training a classifier for weights of author vision components

The features describe the distance between the author and any tweet in the conversation,
as indicated by the following structures:
- subtree to viewed tweet from a tweet the author wrote
- root closeness of viewed tweet
- time delta to viewed tweet from tweets the author wrote

#### Loading the data from the pickled version
1. importing libraries
2. checking gpu support


In [102]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from keras import backend as K
import pickle5 as pickle

# `tf.test.is_gpu_available` is deprecated; TensorFlow recommends listing the
# physical devices instead. The CUDA-build check keeps the original
# `cuda_only=True` meaning of the flag.
is_cuda_gpu_available = tf.test.is_built_with_cuda() and bool(
    tf.config.list_physical_devices("GPU")
)
print("cuda gpu is available: {}".format(is_cuda_gpu_available))
filename = "data/vision_graph_data_local_22_08_22.pkl"
# filename = "data/vision_graph_data_remote_23_08_22.pkl"
# NOTE(review): unpickling can execute arbitrary code embedded in the file --
# only load pickles that were produced by a trusted run of this project.
with open(filename, 'rb') as f:
    df = pickle.load(f)

cuda gpu is available: True


2022-08-22 17:05:46.224629: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-22 17:05:46.225605: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-22 17:05:46.225898: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-22 17:05:46.230451: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-22 17:05:46.230765: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

In [103]:
# Run the helper notebook in this kernel. The helpers called in this notebook
# but not defined in it (normalize_timedelta, take_features, ...) presumably
# come from there -- verify against author_vision_util.ipynb.
%run author_vision_util.ipynb
# NOTE(review): `df` is rebound to the normalized frame, so re-running this cell
# applies normalize_timedelta to already-normalized data -- confirm that is harmless.
df = normalize_timedelta(df)
df.head(2)

Unnamed: 0,reply_distance_2,reply_distance_3,reply_distance_4,reply_distance_5,reply_distance_6,reply_distance_7,reply_distance_8,reply_distance_9,timedelta,root_distance_0,...,root_distance_14,root_distance_15,root_distance_16,root_distance_17,root_distance_18,root_distance_19,root_distance_20,root_distance_21,root_distance_22,root_distance_23
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000208,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000198,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Delete rows that are neither twitter nor reddit data

In [104]:
# Keep only twitter and reddit rows: build a boolean flag per row with the
# helper predicate, then drop the flagged rows by their index labels.
platform = df.platform
to_delete_rows = platform.apply(is_not_reddit_or_twitter)
df = df.drop(index=df.loc[to_delete_rows].index)

# remaining number of rows per platform
df["platform"].value_counts()

twitter    819079
reddit      33116
Name: platform, dtype: int64

#### Equalizing the sample sizes
- choose random samples from distinct conversation_ids
- increase sample size until data size is similar between reddit and twitter

In [105]:
# Bring the twitter and reddit data to a similar number of rows; the smaller
# sample is also meant to prevent gpu problems.
df = equalize_samples(df)
df["platform"].value_counts()

chosen 42 conversations and gotten 33766 from twitter compared to 33116 from reddit


twitter    33766
reddit     33116
Name: platform, dtype: int64

In [106]:
# df = df[df["root_distance_0"] == 0]
# Per platform, show how often the target `y` takes each value in the sample.
for _header, _platform_name in (("reddit:", "reddit"), ("twitter:", "twitter")):
    print(_header)
    _platform_rows = df[df["platform"] == _platform_name]
    print(_platform_rows.y.value_counts())
# the share of y == 1 should be higher for reddit, as the unique author / posting
# ratio is lower for reddit

reddit:
0    28961
1     4155
Name: y, dtype: int64
twitter:
0    31245
1     2521
Name: y, dtype: int64


### Computing a nn model
1. separate features
2. train models for reddit and twitter
3. inspect models for reddit and twitter
4. predict the likelihood that the author has seen a posting
5. aggregate likelihoods in order to compute author vision measure


In [107]:

# training functions

def train_model(df, test_size=0.2, random_state=None, epochs=1):
    """Train a single-unit sigmoid classifier on the feature columns of ``df``.

    The frame is reduced with ``take_features`` (expected to keep the feature
    columns plus the target column ``y``), split into a train and a test part,
    and a one-layer Keras model (``Dense(1, activation='sigmoid')``) is fitted
    with SGD on binary cross-entropy. The model is then evaluated on the
    held-out test part and the metrics are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one platform, as accepted by ``take_features``.
    test_size : float, default 0.2
        Share of the rows held out for evaluation.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. It only makes the split
        reproducible; weight initialisation and batch shuffling are not seeded
        here.
    epochs : int, default 1
        Number of training epochs passed to ``model.fit``.

    Returns
    -------
    tuple
        ``(x, y, test_x, test_y, model)``: training features, training target,
        test features, test target and the fitted Keras model.
    """
    # reduce the frame to the columns used for training (features and target `y`)
    df = take_features(df)

    # selecting train and test datasets
    train, test = train_test_split(df, test_size=test_size, random_state=random_state)

    # split the training part into target and features
    y = train.y
    x = train.drop("y", axis=1)
    print(x.shape)
    print(y.shape)

    print(tf.__version__)
    # one input per feature column, one sigmoid output: a logistic regression
    input_shape = (x.shape[1],)
    model = Sequential([
        Dense(1, activation='sigmoid', input_shape=input_shape)
    ])

    # stochastic gradient descent on binary cross-entropy
    model.compile(
        optimizer='sgd',
        loss='binary_crossentropy',
        metrics=['accuracy', 'mae']
    )

    model.fit(x, y, epochs=epochs)

    # evaluate the model on the held-out test set
    test_y = test.y
    test_x = test.drop("y", axis=1)

    loss, accuracy, mae = model.evaluate(test_x, test_y)
    # the metrics come from `model.evaluate` on the test split, not the training data
    print("the accuracy on the test set is {} and the mae is {}".format(accuracy, mae))

    return x, y, test_x, test_y, model


def inspect_model(x, y, test_x, test_y, model):
    """Print a few sample predictions and the first learned feature weights.

    Parameters
    ----------
    x : pandas.DataFrame
        Training features; only its column names are used, to label the weights.
    y, test_y
        Unused; kept so the function accepts the tuple returned by
        ``train_model`` unchanged.
    test_x : pandas.DataFrame
        Test features; must contain a ``reply_distance_2`` column.
    model
        Fitted model exposing ``predict`` and ``layers[0].get_weights()``.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    # have a look at some predictions for rows where reply_distance_2 is set
    reply_distance_2 = test_x[test_x["reply_distance_2"] == 1]
    first_rows = reply_distance_2.head(2)
    print(first_rows)
    # Show the predictions (they used to be computed and thrown away). Skip the
    # call when no row matched, since there is nothing to predict then.
    if not first_rows.empty:
        print(model.predict(first_rows))

    # weights of the model's first layer, one row per input feature
    first_layer_weights = model.layers[0].get_weights()[0]
    # report the first five features alongside their weights
    for column_name, weight in zip(x.columns.values[:5], first_layer_weights):
        print("feature {} has weight {} \n".format(column_name, weight))


In [108]:
# train and evaluate the model on the twitter rows only
tw_df = df[df["platform"] == "twitter"]
tw_x, tw_y, tw_test_x, tw_test_y, tw_model = train_model(tw_df)

(27012, 47)
(27012,)
2.6.0
the accuracy on the training set is 0.9212318658828735 and the mae is 0.20631590485572815


In [109]:
# train and evaluate a separate model on the reddit rows only
rd_df = df[df["platform"] == "reddit"]
rd_x, rd_y, rd_test_x, rd_test_y, rd_model = train_model(rd_df)

(26492, 47)
(26492,)
2.6.0
the accuracy on the training set is 0.8796799778938293 and the mae is 0.25726357102394104


In [110]:
# print sample rows and the first feature weights of the twitter model;
# the reddit variant below is kept for switching manually
inspect_model(tw_x, tw_y, tw_test_x, tw_test_y, tw_model)
# inspect_model(rd_x, rd_y, rd_test_x, rd_test_y, rd_model)

        reply_distance_2  reply_distance_3  reply_distance_4  \
191447               1.0               0.0               0.0   
120442               1.0               0.0               0.0   

        reply_distance_5  reply_distance_6  reply_distance_7  \
191447               0.0               0.0               0.0   
120442               0.0               0.0               0.0   

        reply_distance_8  reply_distance_9     timedelta  root_distance_0  \
191447               0.0               0.0  6.389792e-07                0   
120442               0.0               0.0  3.082234e-05                1   

        ...  root_distance_14  root_distance_15  root_distance_16  \
191447  ...               0.0               0.0               0.0   
120442  ...               0.0               0.0               0.0   

        root_distance_17  root_distance_18  root_distance_19  \
191447               0.0               0.0               0.0   
120442               0.0               0.0    

In [111]:
# Collect the non-feature columns of each platform subset; the predictions are
# attached to these frames in a later cell.
# take_non_features is not defined in this notebook -- presumably it comes from
# author_vision_util.ipynb (confirm).
tw_non_features = take_non_features(tw_df)
rd_non_features = take_non_features(rd_df)
tw_non_features.head(1)

Unnamed: 0,current,beam_node,has_followed_path,has_follow_path,beam_node_author,platform,conversation_id,author
68382,1543366942911860742,1543366820748439552,0,0,1405333368330330120,twitter,1543366488710692864,3452233032


In [112]:
# Feature matrices for scoring: the take_features output without the target column.
tw_features_y = take_features(tw_df)
rd_features_y = take_features(rd_df)
tw_features = tw_features_y.drop(columns="y")
rd_features = rd_features_y.drop(columns="y")

# Score every row with the model that was trained for its platform.
rd_predictions = rd_model.predict(rd_features)
tw_predictions = tw_model.predict(tw_features)

# Attach the scores to the non-feature columns.
# assumes take_features and take_non_features keep the row order of
# tw_df / rd_df, so the predictions line up positionally -- TODO confirm
tw_vision = tw_non_features.assign(predictions=tw_predictions)
rd_vision = rd_non_features.assign(predictions=rd_predictions)


In [113]:
# Stack the twitter and reddit rows into one frame. `DataFrame.append` is
# deprecated (and removed in pandas 2.0); `pd.concat` stacks the rows the same way.
combined_vision = pd.concat([tw_vision, rd_vision])

# drop the columns that are not needed for the aggregation
not_needed_list = ["beam_node_author", "beam_node", "has_followed_path", "has_follow_path"]
combined_vision = combined_vision.drop(not_needed_list, axis=1)
# keep a handle on the frame that still contains the `author` column
combined_vision_with_author = combined_vision
combined_vision

Unnamed: 0,current,platform,conversation_id,author,predictions
68382,1543366942911860742,twitter,1543366488710692864,3452233032,0.129005
68383,1543366942911860742,twitter,1543366488710692864,3452233032,0.129005
68384,1543366942911860742,twitter,1543366488710692864,3452233032,0.226200
68385,1543367781328060417,twitter,1543366488710692864,1440338610457231367,0.129005
68386,1543367781328060417,twitter,1543366488710692864,1440338610457231367,0.129005
...,...,...,...,...,...
841449,8688551,reddit,37562638,79294672,0.156682
841450,8688551,reddit,37562638,79294672,0.156683
841451,8688551,reddit,37562638,79294672,0.156683
841452,8688551,reddit,37562638,79294672,0.156693


In [114]:
# Count the rows per (platform, conversation, author, prediction value). After
# this count the only remaining data column is `current`, which now holds the
# number of rows sharing that exact prediction value.
combined_vision_with_author2 = combined_vision_with_author.groupby(["platform", "conversation_id", "author", "predictions"]).count()
combined_vision_with_author2 = combined_vision_with_author2.reset_index()
# (A groupby(...).sum() whose result was discarded used to sit here; it had no
# effect on any variable and was removed.)
# NOTE(review): this divides each prediction value by the number of rows that
# share it, which is not a mean of the predictions. If a per-author average was
# intended, a plain groupby(...)["predictions"].mean() would be needed -- confirm
# the intent before relying on `avg_predictions`.
combined_vision_with_author2["avg_predictions"] = combined_vision_with_author2.predictions /  combined_vision_with_author2.current
combined_vision_with_author2 = combined_vision_with_author2.drop(["current","predictions"], axis=1)
# average step by step: per author, then per conversation, then per platform
combined_vision_with_author2 = combined_vision_with_author2.groupby(["platform", "conversation_id", "author"]).mean()
combined_vision_with_author2 = combined_vision_with_author2.groupby(["platform", "conversation_id"]).mean()
combined_vision_with_author2 = combined_vision_with_author2.groupby(["platform"]).mean()
combined_vision_with_author2

Unnamed: 0_level_0,avg_predictions
platform,Unnamed: 1_level_1
reddit,0.182177
twitter,0.145531


In [115]:
# Drop the author so the mean is taken per viewed posting (`current`).
# errors="ignore" keeps the cell re-runnable: on a second run `author` is
# already gone and a plain drop would raise a KeyError.
combined_vision = combined_vision.drop(columns="author", errors="ignore")
# mean prediction per (platform, conversation, posting)
gpm = combined_vision.groupby(["platform", "conversation_id", "current"]).mean()
gpm

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,predictions
platform,conversation_id,current,Unnamed: 3_level_1
reddit,661614,26871177,0.248612
reddit,661614,29352234,0.226013
reddit,661614,33390443,0.225070
reddit,661614,36457165,0.229780
reddit,661614,91649333,0.227426
...,...,...,...
twitter,1552736204034310149,1552936199614091267,0.163158
twitter,1552736204034310149,1552938080516071429,0.163515
twitter,1552736204034310149,1552940031647555585,0.163022
twitter,1552736204034310149,1552940657089695747,0.162220


In [116]:
# average the per-posting means within each conversation
gpm_per_conversation = gpm.groupby(by=["platform", "conversation_id"]).mean()
gpm_per_conversation.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,predictions
platform,conversation_id,Unnamed: 2_level_1
reddit,661614,0.231898
reddit,10955776,0.178053
reddit,14940435,0.184992
reddit,15848916,0.23326
reddit,17286490,0.195049


In [117]:
# Average the per-posting means per platform. This is computed from `gpm`
# directly (not from `gpm_per_conversation`), so every posting carries the same
# weight and conversations with more postings contribute more.
gpm_per_platform = gpm.groupby(by=["platform"]).mean()
gpm_per_platform


Unnamed: 0_level_0,predictions
platform,Unnamed: 1_level_1
reddit,0.185593
twitter,0.161793


In [118]:
# Run the data-analysis notebook in this kernel; `repetition_probability` is not
# defined in this notebook and presumably comes from there (confirm).
%run author_vision_data_analysis.ipynb
# Index-based left join: platforms without a prediction (see `delab` in the
# output) keep a NaN in the `predictions` column.
probabilities = repetition_probability.join(gpm_per_platform)
#probabilities = gpm_per_platform.drop("delab")
probabilities

Unnamed: 0_level_0,repetition_probs,predictions
platform,Unnamed: 1_level_1,Unnamed: 2_level_1
delab,0.219789,
reddit,0.229424,0.185593
twitter,0.096224,0.161793


In [119]:
# NOTE(review): only two rows (reddit, twitter) have both values, and a
# correlation computed from two points is always +1 or -1, so this number alone
# carries no information about the strength of the relationship.
probabilities.corr()


Unnamed: 0,repetition_probs,predictions
repetition_probs,1.0,1.0
predictions,1.0,1.0


### Interpretation of the combined results
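- Taken at face value, the correlation of 1.0 means that the neural network computes a linear function of the repetition probabilities based on the computation of the y functions. Note, however, that only two platforms (reddit and twitter) have both values, and the correlation between two data points is always ±1, so more platforms are needed before this conclusion can be drawn.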
- The probabilities are very low for both reddit and twitter, but they lie in a comparable range
