# Training a classifier for weights of author vision components

The features describe how far the author is from each tweet in the conversation,
measured along the following structures:
- subtree to viewed tweet from a tweet the author wrote
- root closeness of viewed tweet
- time delta to viewed tweet from tweets the author wrote


In [25]:
import pandas as pd

# Load the pickled feature table into a DataFrame.
data_path = "data/vision_graph_data.pkl"
df = pd.read_pickle(data_path)

# Last expression of the cell: summary statistics for the numeric columns.
df.describe()


Unnamed: 0,timedelta,root_distance_0,y,current,beam_node,has_followed_path,has_follow_path,reply_distance_2,reply_distance_3,root_distance_1,...,root_distance_31,root_distance_32,root_distance_33,root_distance_34,root_distance_35,root_distance_36,root_distance_37,root_distance_38,root_distance_39,root_distance_40
count,3419848.0,3419848.0,3419848.0,3419848.0,3419848.0,3419848.0,3419848.0,3419848.0,3419848.0,3419848.0,...,3419848.0,3419848.0,3419848.0,3419848.0,3419848.0,3419848.0,3419848.0,3419848.0,3419848.0,3419848.0
mean,42739.64,0.03969972,0.05875466,1.29609e+18,1.296024e+18,0.0,0.0,0.006033017,0.003545772,0.1295455,...,8.77232e-06,5.848213e-06,5.263392e-06,4.678571e-06,4.093749e-06,2.046875e-06,1.754464e-06,1.462053e-06,1.169643e-06,8.77232e-07
std,1113101.0,0.1952528,0.235165,5.465191e+17,5.464306e+17,0.0,0.0,0.07743785,0.05944073,0.3358028,...,0.002961798,0.002418301,0.002294203,0.002162996,0.002023298,0.001430689,0.001324561,0.001209153,0.0010815,0.0009366064
min,0.001,0.0,0.0,388.0,388.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2417.0,0.0,0.0,1.493856e+18,1.493481e+18,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,9485.0,0.0,0.0,1.531777e+18,1.531699e+18,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,32391.0,0.0,0.0,1.539141e+18,1.539003e+18,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,214871500.0,1.0,1.0,1.552563e+18,1.552563e+18,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:

# Keep separate handles on the non-feature columns. Nothing is removed from
# df in this cell; the columns are only referenced here.
current, beam_node, platform = df["current"], df["beam_node"], df["platform"]

# Last expression of the cell: number of rows per source platform.
platform.value_counts()

twitter    2904740
reddit      514793
delab          315
Name: platform, dtype: int64

In [27]:
# Restrict the data to rows whose platform is "reddit".
is_reddit = df["platform"] == "reddit"
df = df.loc[is_reddit]
# Optional experiment (disabled): additionally keep only rows where
# root_distance_0 is 0.
# df = df[df["root_distance_0"] == 0]

# Drop the columns that are not used as model inputs.
df = df.drop(
    columns=["current", "beam_node", "platform", "has_followed_path", "has_follow_path"]
)

# Min-max scale timedelta into [0, 1] using the extremes of the filtered data.
dt = df["timedelta"]
dt_min, dt_max = dt.min(), dt.max()
timedelta_normalized = (dt - dt_min) / (dt_max - dt_min)
df["timedelta"] = timedelta_normalized

# Last expression of the cell: display the resulting frame.
df

Unnamed: 0,timedelta,root_distance_0,y,reply_distance_2,reply_distance_3,root_distance_1,root_distance_2,root_distance_3,reply_distance_4,root_distance_4,...,root_distance_31,root_distance_32,root_distance_33,root_distance_34,root_distance_35,root_distance_36,root_distance_37,root_distance_38,root_distance_39,root_distance_40
1815,0.000009,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1816,0.000030,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1817,0.000022,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1818,0.000076,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1819,0.000067,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3418447,0.000106,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3418448,0.000105,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3418449,0.000104,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3418450,0.000102,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. The seed is fixed so that the same
# rows land in train and test on every re-run of the notebook; without it
# each run would train and evaluate on a different partition.
RANDOM_SEED = 42
train, test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)

# Last expression of the cell: summary statistics of the training portion.
train.describe()


Unnamed: 0,timedelta,root_distance_0,y,reply_distance_2,reply_distance_3,root_distance_1,root_distance_2,root_distance_3,reply_distance_4,root_distance_4,...,root_distance_31,root_distance_32,root_distance_33,root_distance_34,root_distance_35,root_distance_36,root_distance_37,root_distance_38,root_distance_39,root_distance_40
count,411834.0,411834.0,411834.0,411834.0,411834.0,411834.0,411834.0,411834.0,411834.0,411834.0,...,411834.0,411834.0,411834.0,411834.0,411834.0,411834.0,411834.0,411834.0,411834.0,411834.0
mean,0.000626,0.047274,0.120095,0.024775,0.014115,0.396383,0.220931,0.115945,0.008401,0.058516,...,1.9e-05,1.7e-05,2.4e-05,1.7e-05,1.5e-05,1.2e-05,7e-06,7e-06,7e-06,5e-06
std,0.008741,0.212224,0.325072,0.155438,0.117965,0.489146,0.414875,0.320159,0.091274,0.234717,...,0.004407,0.004123,0.004928,0.004123,0.003817,0.003484,0.002699,0.002699,0.002699,0.002204
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.3e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.000117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.000317,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
# Separate the training frame into the feature matrix (x) and the label
# column (y), then print both shapes as a sanity check.
x = train.drop(columns="y")
y = train["y"]
print(x.shape, y.shape, sep="\n")


(411834, 81)
(411834,)


In [30]:
# convert the training set to numpy arrays (not sure this is needed)
#x = x.to_numpy()
#y = y.to_numpy()
#print(x[:5])
#print(y[:5])

In [31]:
# Build, compile and fit the classifier with TensorFlow/Keras.
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

print(tf.__version__)

# One input per feature column of x.
input_shape = (x.shape[1],)

# The whole model is a single sigmoid unit applied directly to the features.
model = Sequential()
model.add(Dense(1, activation='sigmoid', input_shape=input_shape))

# Stochastic gradient descent is used as the optimizer. The metrics order
# (accuracy, then mae) determines the order of the values returned by
# model.evaluate() after the loss.
model.compile(
    optimizer='sgd',
    loss='binary_crossentropy',
    metrics=['accuracy', 'mae'],
)

# Last expression of the cell: fit for three epochs (returns the History object).
model.fit(x, y, epochs=3)


2.6.0
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f82f01ca700>

In [32]:
# Separate the held-out frame into features (test_x) and labels (test_y)
# so the model can be evaluated on them.
test_x = test.drop(columns="y")
test_y = test["y"]

In [33]:
# convert the test set to numpy arrays (not sure this is needed)
# test_x = test_x.to_numpy()
# test_y = test_y.to_numpy()

In [34]:
# Score the model on the held-out rows. The three returned values are
# unpacked as the loss followed by accuracy and mae.
loss, accuracy, mae = model.evaluate(test_x, test_y)
# The numbers come from test_x/test_y, so the message reports them as
# test-set results (it previously said "training set").
print("the accuracy on the test set is {} and the mae is {}".format(accuracy, mae))

the accuracy on the training set is 0.8986101150512695 and the mae is 0.18156251311302185


In [35]:
# Spot-check the model on two test rows whose reply_distance_2 value is 1.
has_reply_distance_2 = test["reply_distance_2"] == 1
reply_distance_2 = test.loc[has_reply_distance_2].drop(columns="y")
first_rows = reply_distance_2.head(2)
print(first_rows)

# Last expression of the cell: the model output for each of the two rows.
model.predict(first_rows)

         timedelta  root_distance_0  reply_distance_2  reply_distance_3  \
733826    0.000167                0               1.0               0.0   
1996962   0.000031                0               1.0               0.0   

         root_distance_1  root_distance_2  root_distance_3  reply_distance_4  \
733826               1.0              0.0              0.0               0.0   
1996962              1.0              0.0              0.0               0.0   

         root_distance_4  reply_distance_5  ...  root_distance_31  \
733826               0.0               0.0  ...               0.0   
1996962              0.0               0.0  ...               0.0   

         root_distance_32  root_distance_33  root_distance_34  \
733826                0.0               0.0               0.0   
1996962               0.0               0.0               0.0   

         root_distance_35  root_distance_36  root_distance_37  \
733826                0.0               0.0               0.0   

array([[0.04649165],
       [0.04649243]], dtype=float32)

In [36]:
# Inspect the learned parameters of the first layer (model.layers[0]).
# get_weights() is read once and unpacked into its first and second entries.
first_layer_weights, first_layer_biases = model.layers[0].get_weights()
print(first_layer_weights)

# Pair every feature column with its learned weight, in column order.
column_names = x.columns.values
for column_name, weight in zip(column_names, first_layer_weights):
    print("feature {} has weight {} \n".format(column_name, weight))



[[-1.30677640e-01]
 [ 2.28805804e+00]
 [-8.17832470e-01]
 [ 1.75102639e+00]
 [-3.48071933e-01]
 [-8.52820277e-01]
 [-3.75360668e-01]
 [-2.20548511e-01]
 [-4.99415219e-01]
 [ 7.12498069e-01]
 [ 2.46124957e-02]
 [ 3.55230629e-01]
 [-3.06165874e-01]
 [ 3.23293597e-01]
 [-4.90460247e-02]
 [-6.18301705e-02]
 [-2.22053051e-01]
 [-5.44616431e-02]
 [-1.05244607e-01]
 [-2.05770984e-01]
 [ 1.37624115e-01]
 [ 1.08357072e-01]
 [ 7.39288181e-02]
 [ 1.42271906e-01]
 [ 2.40855590e-01]
 [ 1.89153805e-01]
 [ 1.54219568e-01]
 [ 3.55165899e-02]
 [ 2.46617526e-01]
 [-2.18688861e-01]
 [ 3.05153906e-01]
 [ 2.08589792e-01]
 [-1.57899633e-01]
 [ 1.16328463e-01]
 [-8.16077143e-02]
 [ 3.67255025e-02]
 [-5.92536367e-02]
 [ 1.20465748e-01]
 [ 1.56796187e-01]
 [ 1.99125886e-01]
 [ 2.12041065e-01]
 [ 2.50672549e-01]
 [ 1.05474249e-01]
 [ 1.06483497e-01]
 [-2.41165981e-01]
 [ 1.34024948e-01]
 [ 2.44135305e-01]
 [-9.08605661e-03]
 [-1.88152909e-01]
 [-1.66622818e-01]
 [ 2.15231627e-01]
 [-1.11456305e-01]
 [ 1.1887212