# Training a classifier for weights of author vision components

The features measure the distance from the author to any tweet in the conversation,
as indicated by the following structures:
- subtree to viewed tweet from a tweet the author wrote
- root closeness of viewed tweet
- time delta to viewed tweet from tweets the author wrote


In [157]:
import pandas as pd

# Load the data frame stored in the local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
df = pd.read_pickle("data/vision_graph_data.pkl")

# Summary statistics of the loaded frame.
df.describe()


Unnamed: 0,reply_distance_2,reply_distance_3,reply_distance_4,timedelta,root_distance_0,root_distance_1,root_distance_2,root_distance_3,y,current,beam_node,has_followed_path,has_follow_path
count,3157800.0,3157800.0,3157800.0,3157800.0,3157800.0,3157800.0,3157800.0,3157800.0,3157800.0,3157800.0,3157800.0,3157800.0,3157800.0
mean,0.005017101,0.003026791,0.00200931,38500.14,0.0389575,0.8457949,0.04832637,0.02347457,0.05293559,1.39598e+18,1.39591e+18,0.0,0.0
std,0.0706536,0.05493296,0.04478028,1040748.0,0.1934937,0.3611453,0.214455,0.1514052,0.223905,4.266853e+17,4.265792e+17,0.0,0.0
min,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,388.0,388.0,0.0,0.0
25%,0.0,0.0,0.0,2324.0,0.0,1.0,0.0,0.0,0.0,1.526622e+18,1.526541e+18,0.0,0.0
50%,0.0,0.0,0.0,9048.0,0.0,1.0,0.0,0.0,0.0,1.532639e+18,1.532475e+18,0.0,0.0
75%,0.0,0.0,0.0,31291.0,0.0,1.0,0.0,0.0,0.0,1.539539e+18,1.539503e+18,0.0,0.0
max,1.0,1.0,1.0,214871500.0,1.0,1.0,1.0,1.0,1.0,1.55226e+18,1.552259e+18,0.0,0.0


In [158]:

# keep references to the columns that are not model features
platform = df["platform"]
beam_node = df["beam_node"]
current = df["current"]

# number of rows per platform
platform.value_counts()

twitter    2889148
reddit      268337
delab          315
Name: platform, dtype: int64

In [159]:
# restrict the data to rows that come from twitter
is_twitter = df["platform"] == "twitter"
df = df[is_twitter]
# df = df[df["root_distance_0"] == 0]

# drop the columns that are not used as features
non_feature_columns = ["current", "beam_node", "platform", "has_followed_path", "has_follow_path"]
df = df.drop(columns=non_feature_columns)

# min-max scale timedelta into [0, 1]
# note: the extrema are taken over the whole frame, i.e. before the train/test split
dt = df["timedelta"]
dt_min, dt_max = dt.min(), dt.max()
timedelta_normalized = (dt - dt_min) / (dt_max - dt_min)
df["timedelta"] = timedelta_normalized
df

Unnamed: 0,reply_distance_2,reply_distance_3,reply_distance_4,timedelta,root_distance_0,root_distance_1,root_distance_2,root_distance_3,y
0,0,0,0,6.114817e-05,1,0,0,0,1
1,0,0,0,5.165878e-07,0,1,0,0,0
2,0,0,0,1.903463e-06,0,1,0,0,0
3,0,0,0,6.599292e-06,0,1,0,0,0
4,0,0,0,1.188617e-05,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
3157795,0,0,0,1.912771e-06,0,1,0,0,0
3157796,0,0,0,2.452628e-06,0,1,0,0,0
3157797,0,0,0,1.433415e-06,1,0,0,0,1
3157798,0,0,0,5.352035e-07,0,1,0,0,0


In [160]:
from sklearn.model_selection import train_test_split

# Split into train (80%) and test (20%) sets.
# random_state pins the shuffle so the split is identical on every run;
# stratify keeps the share of positive labels (only about 5% of the rows)
# the same in both parts.
train, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df["y"])
train.describe()


Unnamed: 0,reply_distance_2,reply_distance_3,reply_distance_4,timedelta,root_distance_0,root_distance_1,root_distance_2,root_distance_3,y
count,2311318.0,2311318.0,2311318.0,2311318.0,2311318.0,2311318.0,2311318.0,2311318.0,2311318.0
mean,0.003042853,0.001868198,0.0012945,0.0001552376,0.03847026,0.8854636,0.03169014,0.01394269,0.04787788
std,0.05507808,0.04318227,0.03595587,0.004860642,0.1923287,0.3184617,0.1751739,0.1172532,0.2135079
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.018283e-05,0.0,1.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,3.930255e-05,0.0,1.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.000139502,0.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [161]:
# separate the training frame into the feature matrix and the target column
x = train.drop(columns="y")
y = train["y"]
print(x.shape, y.shape, sep="\n")


(2311318, 8)
(2311318,)


In [162]:
# convert train dataset to numpy arrays (not sure this is needed)
#x = x.to_numpy()
#y = y.to_numpy()
#print(x[:5])
#print(y[:5])

In [163]:
# import tensorflow and train the model
import tensorflow as tf

print(tf.__version__)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Fix TensorFlow's global seed so that the random ops it governs (e.g. the
# weight initialisation) start from the same state on every run.
tf.random.set_seed(42)

# A single sigmoid unit on top of the raw features, i.e. a logistic regression:
# one weight per feature plus a bias.
input_shape = (x.shape[1],)
model = Sequential([
    Dense(1, activation='sigmoid', input_shape=input_shape)
])

# stochastic gradient descent as the optimizer; binary cross-entropy fits the 0/1 label
# NOTE(review): only about 5% of the labels are positive, so accuracy alone is a weak signal
model.compile(
    optimizer='sgd',
    loss='binary_crossentropy',
    metrics=['accuracy', 'mae']
)

model.fit(x, y, epochs=3)


2.6.0
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f2db20bf0d0>

In [164]:
# split the held-out test frame into features and labels for the evaluation
test_x = test.drop(columns="y")
test_y = test["y"]

In [165]:
# convert test dataset to numpy arrays (not sure this is needed)
# test_x = test_x.to_numpy()
# test_y = test_y.to_numpy()

In [166]:
# model.evaluate returns the loss followed by the compiled metrics (accuracy, mae);
# the data passed in is the held-out test set, so the message says so.
loss, accuracy, mae = model.evaluate(test_x, test_y)
print("the accuracy on the test set is {} and the mae is {}".format(accuracy, mae))

the accuracy on the training set is 0.9868577122688293 and the mae is 0.02454851195216179


In [167]:
# look at the predictions for the first two test rows where reply_distance_2 is set
has_reply_distance_2 = test["reply_distance_2"] == 1
reply_distance_2 = test[has_reply_distance_2].drop(columns="y")
first_rows = reply_distance_2.iloc[:2]
print(first_rows)
model.predict(first_rows)

        reply_distance_2  reply_distance_3  reply_distance_4  timedelta  \
900139                 1                 0                 0   0.000197   
695903                 1                 0                 0   0.000037   

        root_distance_0  root_distance_1  root_distance_2  root_distance_3  
900139                1                0                0                0  
695903                1                0                0                0  


array([[0.8454637],
       [0.8454683]], dtype=float32)

In [168]:
# inspect the learned weights and bias of the model's only Dense layer

first_layer_weights, first_layer_biases = model.layers[0].get_weights()
print(first_layer_weights)
column_names = x.columns.values
# pair every feature name with its row of the weight matrix
for column_name, weight in zip(column_names, first_layer_weights):
    print("feature {} has weight {} \n".format(column_name, weight))



[[-1.1888628 ]
 [ 1.213496  ]
 [-0.6482789 ]
 [-0.21612737]
 [ 4.621445  ]
 [-3.6116598 ]
 [-1.2800721 ]
 [-0.6761108 ]]
feature reply_distance_2 has weight [-1.1888628] 

feature reply_distance_3 has weight [1.213496] 

feature reply_distance_4 has weight [-0.6482789] 

feature timedelta has weight [-0.21612737] 

feature root_distance_0 has weight [4.621445] 

feature root_distance_1 has weight [-3.6116598] 

feature root_distance_2 has weight [-1.2800721] 

feature root_distance_3 has weight [-0.6761108] 

