# Training a classifier for weights of author vision components

The features are the distance of the author to any tweet in the conversation
indicated by the following structures:
- subtree to viewed tweet from a tweet the author wrote
- root closeness of viewed tweet
- time delta to viewed tweet from tweets the author wrote


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from keras import backend as K
import pickle5 as pickle

is_cuda_gpu_available = tf.test.is_gpu_available(cuda_only=True)


print("cuda gpu is available: {}".format(is_cuda_gpu_available))

with open("data/vision_graph_data.pkl", 'rb') as f:
    df = pickle.load(f)

# df = pd.read_pickle("data/vision_graph_data.pkl")

# df.platform

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
cuda gpu is available: False


In [2]:
def is_not_reddit_or_twitter(text):
    if text == "reddit" or text == "twitter":
        return False
    else:
        return True

# remove non-features
current = df.current
beam_node = df.beam_node
platform = df.platform
platform.value_counts()

to_delete_rows = platform.apply(lambda x: is_not_reddit_or_twitter(x))
df = df.drop(df[to_delete_rows].index)

df.describe()

Unnamed: 0,timedelta,root_distance_0,y,current,beam_node,has_followed_path,has_follow_path,conversation_id,reply_distance_2,reply_distance_3,...,reply_distance_40,root_distance_32,root_distance_33,root_distance_34,root_distance_35,root_distance_36,root_distance_37,root_distance_38,root_distance_39,root_distance_40
count,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,...,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0,3419533.0
mean,42742.85,0.03969987,0.05875656,1.296209e+18,1.296008e+18,0.0,0.0,1.295209e+18,0.006031525,0.003544929,...,8.773128e-07,5.848752e-06,5.263877e-06,4.679001e-06,4.094126e-06,2.047063e-06,1.754626e-06,1.462188e-06,1.16975e-06,8.773128e-07
std,1113152.0,0.1952532,0.2351685,5.464027e+17,5.464532e+17,0.0,0.0,5.464034e+17,0.07742834,0.05943369,...,0.0009366495,0.002418413,0.002294308,0.002163095,0.002023391,0.001430755,0.001324622,0.001209209,0.00108155,0.0009366495
min,1.0,0.0,0.0,388.0,388.0,0.0,0.0,174503.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2417.0,0.0,0.0,1.493867e+18,1.493522e+18,0.0,0.0,1.490282e+18,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,9486.0,0.0,0.0,1.53178e+18,1.531699e+18,0.0,0.0,1.531571e+18,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,32393.0,0.0,0.0,1.539142e+18,1.539004e+18,0.0,0.0,1.53889e+18,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,214871500.0,1.0,1.0,1.552563e+18,1.552563e+18,0.0,0.0,1.552559e+18,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
non_feature_list = ["current", "beam_node", "conversation_id", "platform", "has_followed_path", "has_follow_path"]
def take_features(df):
    df = df.drop(non_feature_list, axis=1)
    return df

def take_non_features(df):
    column_names = df.columns.values
    feature_list = [column_name for column_name in column_names if column_name not in non_feature_list]
    df = df.drop(feature_list, axis=1)
    return df

In [4]:

# training functions

def train_model(df):
    # dropping non-reddit non-twitter data
    df = take_features(df)
    # df = df[df["root_distance_0"] == 0]

    # normalize timedelta (put between 0 and 1)
    dt = df.timedelta
    timedelta_normalized = (dt - dt.min()) / (dt.max() - dt.min())
    df.timedelta = timedelta_normalized

    # selecting train and test datasets
    train, test = train_test_split(df, test_size=0.2)
    train.describe()

    # train the model
    y = train.y
    x = train.drop("y", axis=1)
    print(x.shape)
    print(y.shape)

    # import tensorflow and train the model

    print(tf.__version__)
    input_shape = (x.shape[1],)
    model = Sequential([
        Dense(1, activation='sigmoid', input_shape=input_shape)
    ])

    # stochastic gradient descend as a classifier seem appropriate
    model.compile(
        optimizer='sgd',
        loss='binary_crossentropy',
        metrics=['accuracy', 'mae']
    )

    model.fit(x, y, epochs=3)
    # model.fit(x, y)
    # evaluate the model on the test set
    test_y = test.y
    test_x = test.drop("y", axis=1)

    loss, accuracy, mae = model.evaluate(test_x, test_y)
    print("the accuracy on the training set is {} and the mae is {}".format(accuracy, mae))

    return x, y, test_x, test_y, model


def inspect_model(x, y, test_x, test_y, model):
    # have a look at some prediction
    # reply_distance_2 = test_x[test_x["reply_distance_2"] == 1]
    # first_rows = reply_distance_2.head(2)
    # print(first_rows)
    # model.predict(first_rows)

    # let's have a look at the weights and biases of the hidden layer
    first_layer_weights = model.layers[0].get_weights()[0]
    first_layer_biases = model.layers[0].get_weights()[1]
    # print(first_layer_weights)
    column_names = x.columns.values
    for i in range(len(column_names[:5])):
        print("feature {} has weight {} \n".format(column_names[i], first_layer_weights[i]))


In [5]:
# have a look for reddit
tw_df = df[df["platform"] == "twitter"]
tw_x, tw_y, tw_test_x, tw_test_y, tw_model = train_model(tw_df)

(2323792, 81)
(2323792,)
2.9.1
Epoch 1/3
Epoch 2/3
Epoch 3/3
the accuracy on the training set is 0.9862655401229858 and the mae is 0.026737768203020096


In [6]:
rd_df = df[df["platform"] == "reddit"]
rd_x, rd_y, rd_test_x, rd_test_y, rd_model = train_model(rd_df)

(411834, 81)
(411834,)
2.9.1
Epoch 1/3
Epoch 2/3
Epoch 3/3
the accuracy on the training set is 0.8972212076187134 and the mae is 0.1823420524597168


In [7]:
# inspect_model(tw_x, tw_y, tw_test_x, tw_test_y, tw_model)
# inspect_model(rd_x, rd_y, rd_test_x, rd_test_y, rd_model)

In [8]:
# rd_sample_size = rd_df.shape[0]
# tw_df = tw_df.sample(n=rd_sample_size)


tw_non_features = take_non_features(tw_df)
rd_non_features = take_non_features(rd_df)
tw_non_features

Unnamed: 0,current,beam_node,has_followed_path,has_follow_path,platform,conversation_id
0,1540384238922616835,1540323979793137664,0,0,twitter,1540323979793137664
1,1540384238922616835,1540379281045524481,0,0,twitter,1540323979793137664
2,1540384238922616835,1540337291478138882,0,0,twitter,1540323979793137664
3,1540384238922616835,1540332248561483781,0,0,twitter,1540323979793137664
4,1540384238922616835,1540328476959449089,0,0,twitter,1540323979793137664
...,...,...,...,...,...,...
3419843,1533627567378518016,1533623769449005058,0,0,twitter,1533622269360087046
3419844,1533627567378518016,1533623229482340352,0,0,twitter,1533622269360087046
3419845,1533623769449005058,1533622269360087046,0,0,twitter,1533622269360087046
3419846,1533623769449005058,1533623229482340352,0,0,twitter,1533622269360087046


In [16]:

tw_features_y = take_features(tw_df)
tw_features = tw_features_y.drop("y", axis=1)
rd_features_y = take_features(rd_df)
rd_features = rd_features_y.drop("y", axis=1)
rd_predictions = rd_model.predict(rd_features, batch_size=16)
tw_predictions = tw_model.predict(tw_features, batch_size=16)
tw_vision = tw_non_features.assign(predictions=tw_predictions)
rd_vision = rd_non_features.assign(predictions=rd_predictions)
#tw_vision




In [10]:
combined_vision = tw_vision.append(rd_vision)
combined_vision


not_needed_list = ["beam_node", "has_followed_path", "has_follow_path"]
combined_vision = combined_vision.drop(not_needed_list, axis=1)
combined_vision

Unnamed: 0,current,platform,conversation_id,predictions
0,1540384238922616835,twitter,1540323979793137664,1.0
1,1540384238922616835,twitter,1540323979793137664,1.0
2,1540384238922616835,twitter,1540323979793137664,1.0
3,1540384238922616835,twitter,1540323979793137664,1.0
4,1540384238922616835,twitter,1540323979793137664,1.0
...,...,...,...,...
3418060,68851107,reddit,12994552,0.0
3418061,68851107,reddit,12994552,0.0
3418062,68851107,reddit,12994552,0.0
3418063,68851107,reddit,12994552,0.0


In [11]:
gpm = combined_vision.groupby(["platform", "conversation_id", "current"]).mean()
gpm

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,predictions
platform,conversation_id,current,Unnamed: 3_level_1
reddit,174503,4541493,2.245510e-25
reddit,174503,5662402,0.000000e+00
reddit,174503,8711684,1.263091e-11
reddit,174503,10143803,0.000000e+00
reddit,174503,11735997,2.345734e-06
...,...,...,...
twitter,1552558905275858946,1552563036443099137,9.688193e-01
twitter,1552558905275858946,1552563089635250176,9.107236e-01
twitter,1552558905275858946,1552563118403993600,8.551736e-01
twitter,1552558905275858946,1552563228034715649,8.164635e-01


In [12]:
gpm_per_conversation = gpm.groupby(by=["platform", "conversation_id"]).mean()
gpm_per_conversation

Unnamed: 0_level_0,Unnamed: 1_level_0,predictions
platform,conversation_id,Unnamed: 2_level_1
reddit,174503,5.167051e-05
reddit,203904,1.832193e-04
reddit,209098,2.287960e-09
reddit,313699,5.453641e-09
reddit,471878,2.682920e-02
...,...,...
twitter,1552396506501713925,9.034465e-01
twitter,1552551318866120704,8.295437e-01
twitter,1552558203963154434,7.849278e-01
twitter,1552558701432774659,8.330246e-01


In [14]:
gpm_per_platform = gpm.groupby(by=["platform"]).mean()
gpm_per_platform



Unnamed: 0_level_0,predictions
platform,Unnamed: 1_level_1
reddit,0.00233
twitter,0.94471


In [23]:
combined_vision = combined_vision.drop(["predictions"], axis=1)


KeyError: "['predictions'] not found in axis"

In [31]:
author_count_columns = ["current", "conversation_id", "platform", "y"]
author_df = df[author_count_columns]
author_df

Unnamed: 0,current,conversation_id,platform,y
0,1540384238922616835,1540323979793137664,twitter,1
1,1540384238922616835,1540323979793137664,twitter,0
2,1540384238922616835,1540323979793137664,twitter,0
3,1540384238922616835,1540323979793137664,twitter,0
4,1540384238922616835,1540323979793137664,twitter,0
...,...,...,...,...
3419843,1533627567378518016,1533622269360087046,twitter,0
3419844,1533627567378518016,1533622269360087046,twitter,0
3419845,1533623769449005058,1533622269360087046,twitter,1
3419846,1533623769449005058,1533622269360087046,twitter,0


In [34]:
author_df = author_df.groupby(["platform", "conversation_id", "current"]).sum()
author_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,y
platform,conversation_id,current,Unnamed: 3_level_1
reddit,174503,4541493,2
reddit,174503,5662402,5
reddit,174503,8711684,4
reddit,174503,10143803,2
reddit,174503,11735997,11
...,...,...,...
twitter,1552558905275858946,1552563036443099137,1
twitter,1552558905275858946,1552563089635250176,1
twitter,1552558905275858946,1552563118403993600,1
twitter,1552558905275858946,1552563228034715649,2


In [36]:
author_df = author_df.groupby(["platform", "conversation_id"]).mean()
author_df


Unnamed: 0_level_0,Unnamed: 1_level_0,y
platform,conversation_id,Unnamed: 2_level_1
reddit,174503,4.244444
reddit,203904,2.810811
reddit,209098,1.875000
reddit,313699,1.230769
reddit,471878,3.285714
...,...,...
twitter,1552396506501713925,1.000000
twitter,1552551318866120704,1.869565
twitter,1552558203963154434,1.000000
twitter,1552558701432774659,2.555556


In [39]:
distinct_view_columns = ["current", "conversation_id", "platform"]
distinct_view_df = df[distinct_view_columns]
distinct_view_df

Unnamed: 0,current,conversation_id,platform
0,1540384238922616835,1540323979793137664,twitter
1,1540384238922616835,1540323979793137664,twitter
2,1540384238922616835,1540323979793137664,twitter
3,1540384238922616835,1540323979793137664,twitter
4,1540384238922616835,1540323979793137664,twitter
...,...,...,...
3419843,1533627567378518016,1533622269360087046,twitter
3419844,1533627567378518016,1533622269360087046,twitter
3419845,1533623769449005058,1533622269360087046,twitter
3419846,1533623769449005058,1533622269360087046,twitter


In [50]:
distinct_views = distinct_view_df.groupby(["current", "conversation_id", "platform"]).size().to_frame('size')
distinct_views = distinct_views.groupby(["platform", "conversation_id"]).mean()
distinct_views


Unnamed: 0_level_0,Unnamed: 1_level_0,size
platform,conversation_id,Unnamed: 2_level_1
reddit,174503,23.000000
reddit,203904,19.000000
reddit,209098,8.500000
reddit,313699,7.000000
reddit,471878,11.000000
...,...,...
twitter,1552396506501713925,3.000000
twitter,1552551318866120704,11.956522
twitter,1552558203963154434,3.000000
twitter,1552558701432774659,5.000000


In [54]:
joined_author_stats = author_df.join(distinct_views)
joined_author_stats = joined_author_stats["y"]/joined_author_stats["size"]
joined_author_stats

platform  conversation_id    
reddit    174503                 0.184541
          203904                 0.147937
          209098                 0.220588
          313699                 0.175824
          471878                 0.298701
                                   ...   
twitter   1552396506501713925    0.333333
          1552551318866120704    0.156364
          1552558203963154434    0.333333
          1552558701432774659    0.511111
          1552558905275858946    0.137255
Length: 4287, dtype: float64

In [55]:
repetition_probability = joined_author_stats.groupby("platform").mean()
repetition_probability

platform
reddit     0.229424
twitter    0.096224
dtype: float64