# Training a classifier for weights of author vision components

The features are the distance of the author to any tweet in the conversation
indicated by the following structures:
- subtree to viewed tweet from a tweet the author wrote
- root closeness of viewed tweet
- time delta to viewed tweet from tweets the author wrote


In [24]:
import pandas as pd

df = pd.read_pickle("data/vision_graph_data.pkl")

df.describe()


Unnamed: 0,reply_distance_2,reply_distance_3,timedelta,root_distance_0,root_distance_1,root_distance_2,y,current,beam_node,has_followed_path,has_follow_path
count,7418172.0,7418172.0,7418172.0,7418172.0,7418172.0,7418172.0,7418172.0,7418172.0,7418172.0,7418172.0,7418172.0
mean,0.01136304,0.0006845352,31759.86,0.01017137,0.9018925,0.03807852,0.0154929,1.448474e+18,1.448347e+18,0.0,0.0
std,0.1059902,0.02615467,302337.2,0.100339,0.2974599,0.1913859,0.1235025,3.204302e+17,3.203389e+17,0.0,0.0
min,0.0,0.0,1e-06,0.0,0.0,0.0,0.0,48819.0,48819.0,0.0,0.0
25%,0.0,0.0,3462.0,0.0,1.0,0.0,0.0,1.509801e+18,1.509693e+18,0.0,0.0
50%,0.0,0.0,11970.0,0.0,1.0,0.0,0.0,1.518589e+18,1.518371e+18,0.0,0.0
75%,0.0,0.0,34454.0,0.0,1.0,0.0,0.0,1.524345e+18,1.524051e+18,0.0,0.0
max,1.0,1.0,110303300.0,1.0,1.0,1.0,1.0,7.69458e+18,7.69458e+18,0.0,0.0


In [25]:

# remove non-features
current = df.current
beam_node = df.beam_node
platform = df.platform

platform.value_counts()

twitter    7073061
reddit      345030
delab           81
Name: platform, dtype: int64

In [13]:
df = df[df["platform"] == "reddit"]

df = df.drop(["current", "beam_node", "platform"], axis=1)

# normalize timedelta (put between 0 and 1)
dt = df.timedelta
timedelta_normalized = (dt - dt.min()) / (dt.max() - dt.min())
df.timedelta = timedelta_normalized

df

Unnamed: 0,reply_distance_2,reply_distance_3,timedelta,root_distance_0,root_distance_1,root_distance_2,y,has_followed_path,has_follow_path
81,1,0,0.000067,1,0,0,1,0,0
82,0,0,0.000063,0,1,0,0,0,0
83,0,0,0.000062,0,1,0,0,0,0
84,0,0,0.000057,0,1,0,0,0,0
85,0,0,0.000049,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...
345106,0,0,0.004892,0,0,1,0,0,0
345107,0,0,0.004982,0,1,0,0,0,0
345108,0,0,0.001560,0,1,0,0,0,0
345109,0,0,0.000668,0,1,0,0,0,0


In [14]:
from sklearn.model_selection import train_test_split

# selecting train and test datasets
train, test = train_test_split(df, test_size=0.2)
train.describe()


Unnamed: 0,reply_distance_2,reply_distance_3,timedelta,root_distance_0,root_distance_1,root_distance_2,y,has_followed_path,has_follow_path
count,276024.0,276024.0,276024.0,276024.0,276024.0,276024.0,276024.0,276024.0,276024.0
mean,0.014897,0.003764,0.005571,0.008869,0.369283,0.236095,0.047862,0.0,0.0
std,0.121142,0.061237,0.017903,0.093756,0.482612,0.424682,0.213474,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.000823,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.00227,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.006829,0.0,1.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0


In [15]:
# train the model
y = train.y
x = train.drop("y", axis=1)
print(x.shape)
print(y.shape)


(276024, 8)
(276024,)


In [16]:
# convert train dataset to numpy arrays not sure this is neded
#x = x.to_numpy()
#y = y.to_numpy()
#print(x[:5])
#print(y[:5])

In [17]:
# import tensorflow and train the model
import tensorflow as tf

print(tf.__version__)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

input_shape = (x.shape[1],)
model = Sequential([
    Dense(1, activation='sigmoid', input_shape=input_shape)
])

# stachistical gradient descend as a classifier seem appropriate
model.compile(
    optimizer='sgd',
    loss='binary_crossentropy',
    metrics=['accuracy', 'mae']
)

model.fit(x, y)


2.6.0


<keras.callbacks.History at 0x7fee881e29a0>

In [18]:
# evaluate the model on the test set
test_y = test.y
test_x = test.drop("y", axis=1)

In [19]:
# convert train dataset to numpy arrays not sure this is neded
# test_x = test_x.to_numpy()
# test_y = test_y.to_numpy()

In [20]:
loss, accuracy, mae = model.evaluate(test_x, test_y)
print("the accuracy on the training set is {} and the mae is {}".format(accuracy, mae))

the accuracy on the training set is 0.9524968862533569 and the mae is 0.09138570725917816


In [21]:
# have a look at some prediction
reply_distance_2 = test[test["reply_distance_2"] == 1]
reply_distance_2 = reply_distance_2.drop("y", axis=1)
first_rows = reply_distance_2.head(2)
print(first_rows)
model.predict(first_rows)

        reply_distance_2  reply_distance_3  timedelta  root_distance_0  \
207641                 1                 0   0.000020                0   
304599                 1                 0   0.011906                1   

        root_distance_1  root_distance_2  has_followed_path  has_follow_path  
207641                1                0                  0                0  
304599                0                0                  0                0  


array([[0.04834182],
       [0.07979932]], dtype=float32)

By intuition, this result makes sense:
- In the first example the beam node is two removed from the root where in the second
example the it is the root node itself.
- Also the time elapsed is shorted (the time delta is normalized so how much time that is I am not sure

For these reasons ...
 - the second sample has a probability of 96% of having been seen
 - the first sample only has a probability of 56% of having been seen (which is still high)

In [22]:
# let's have a look at the weights and biases of the hidden layer

first_layer_weights = model.layers[0].get_weights()[0]
first_layer_biases  = model.layers[0].get_weights()[1]
print(first_layer_weights)
column_names = x.columns.values
for i in range(len(column_names)):
    print("feature {} has weight {} \n" .format(column_names[i], first_layer_weights[i]))



[[ 0.18435887]
 [-0.4370069 ]
 [-0.73888505]
 [ 0.00917221]
 [-0.53444296]
 [-0.796597  ]
 [ 0.15518367]
 [ 0.29669797]]
feature reply_distance_2 has weight [0.18435887] 

feature reply_distance_3 has weight [-0.4370069] 

feature timedelta has weight [-0.73888505] 

feature root_distance_0 has weight [0.00917221] 

feature root_distance_1 has weight [-0.53444296] 

feature root_distance_2 has weight [-0.796597] 

feature has_followed_path has weight [0.15518367] 

feature has_follow_path has weight [0.29669797] 

