In [40]:
import nvidia_smi

# Report total / free / used memory (as returned by NVML) for one GPU.
# Only a single card is inspected; NVML can also enumerate all cards, so this
# could be turned into a loop over every device index if needed.
GPU_INDEX = 0

nvidia_smi.nvmlInit()
try:
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(GPU_INDEX)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)

    print("Total memory:", info.total)
    print("Free memory:", info.free)
    print("Used memory:", info.used)
finally:
    # Always release the NVML session, even if one of the queries above raises.
    nvidia_smi.nvmlShutdown()

Total memory: 4104323072
Free memory: 876019712
Used memory: 3228303360


In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from keras import backend as K
import pickle5 as pickle

# --- GPU setup ---------------------------------------------------------------
# Upper bound for TensorFlow's allocation on the first GPU. The value is passed
# as `memory_limit`, which TensorFlow interprets in megabytes (~2 GB here).
GPU_MEMORY_LIMIT_MB = 2024

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to a fixed slice of memory on the first GPU.
    try:
        tf.config.set_logical_device_configuration(
            gpus[0],
            [tf.config.LogicalDeviceConfiguration(memory_limit=GPU_MEMORY_LIMIT_MB)])
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Virtual devices must be set before GPUs have been initialized
        print(e)

# `tf.test.is_gpu_available` is deprecated, and querying it before the memory
# limit is configured may touch the GPU too early. Derive the same answer from
# the physical device list and the CUDA build flag instead.
is_cuda_gpu_available = bool(gpus) and tf.test.is_built_with_cuda()
print("cuda gpu is available: {}".format(is_cuda_gpu_available))

# --- Load data ---------------------------------------------------------------
# NOTE(security): unpickling executes arbitrary code embedded in the file; only
# load pickles that were produced by this project.
with open("data/vision_forward_graph_data.pkl", 'rb') as f:
    df = pickle.load(f)

df.describe()

1 Physical GPUs, 1 Logical GPUs
cuda gpu is available: True


2022-08-04 15:55:20.892102: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-04 15:55:20.892460: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-04 15:55:20.892694: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-04 15:55:20.892990: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-04 15:55:20.893228: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

Unnamed: 0,timedelta,root_distance_0,current,beam_node,has_followed_path,has_follow_path,conversation_id,author,reply_distance_2,reply_distance_3,...,root_distance_14,root_distance_15,root_distance_16,root_distance_17,root_distance_18,root_distance_19,root_distance_20,root_distance_21,root_distance_22,root_distance_23
count,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,...,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0
mean,53696.46,0.038011,1.48144e+18,1.481198e+18,0.0,0.0,1.480454e+18,8.815487e+17,0.005518,0.002841,...,0.000254,0.000278,0.000184,0.000111,8.1e-05,7.3e-05,6.5e-05,9.5e-05,8.1e-05,7.3e-05
std,897886.2,0.191222,2.648856e+17,2.641527e+17,0.0,0.0,2.624457e+17,6.254187e+17,0.074081,0.053229,...,0.015947,0.016666,0.013563,0.010541,0.009027,0.008535,0.00809,0.009751,0.009027,0.008535
min,1e-06,0.0,215118.0,215118.0,0.0,0.0,1.0,42.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2053.0,0.0,1.511883e+18,1.511768e+18,0.0,0.0,1.511628e+18,2515953000.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8869.0,0.0,1.524812e+18,1.524722e+18,0.0,0.0,1.524646e+18,1.18928e+18,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,33657.5,0.0,1.543267e+18,1.543217e+18,0.0,0.0,1.542981e+18,1.427657e+18,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,110303300.0,1.0,7.69458e+18,7.69458e+18,0.0,0.0,1.544234e+18,1.544089e+18,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [42]:
# Import utility functions by executing the helper notebook, which makes the
# names it defines available in this notebook.
# NOTE(review): presumably this is where `equalize_samples` and `take_features`
# (called in later cells) come from — confirm against author_vision_util.ipynb.
%run author_vision_util.ipynb

In [43]:
# Run the shared balancing helper on the data, then keep only the rows whose
# platform is reddit and preview the first two of them.
df = equalize_samples(df).loc[lambda frame: frame["platform"] == "reddit"]
df.head(2)

chosen 24 conversations and gotten 26737 from twitter compared to 24497 from reddit


Unnamed: 0,timedelta,root_distance_0,current,beam_node,has_followed_path,has_follow_path,platform,conversation_id,author,reply_distance_2,...,root_distance_14,root_distance_15,root_distance_16,root_distance_17,root_distance_18,root_distance_19,root_distance_20,root_distance_21,root_distance_22,root_distance_23
44651,7208.0,1,19827662,50260952,0,0,reddit,50260952,22862703,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44652,9433.0,1,19750932,50260952,0,0,reddit,50260952,77299590,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:


# One-hot encode the author column: one "Author_<id>" indicator column per
# distinct author, stored with pandas' sparse dtype.
author_one_hot = pd.get_dummies(
    df["author"],
    prefix="Author",
    sparse=True,
)
# author_one_hot.to_pickle("data/forward_authors_encodin.pkl")


In [45]:
# Sanity check: (number of rows, number of one-hot author columns).
author_one_hot.shape



(24497, 406)

In [46]:
# Build the model inputs with the shared helper and remove the raw author
# column, since the author is the prediction target rather than an input.
features = take_features(df).drop(columns="author")
features.shape


(24497, 47)

In [None]:
# Put inputs and one-hot targets side by side in a single frame (index-aligned
# left join) so that they can be split into train and test sets together.
combined_set = features.join(author_one_hot, how="left")
combined_set.shape


In [63]:
# Groundwork for a placeholder ("fake") author that would stand for "this user
# has not been seen yet at this stage of the conversation".
# So far the cell only tallies the number of posts per (conversation, author)
# pair; the placeholder itself is not built here.
df_conversation_authors = (
    df.loc[:, ["conversation_id", "author"]]
    .groupby(["conversation_id", "author"])
    .size()
    .to_frame("n_posts")
    .fillna(0)
)
# Unfinished idea kept from the original cell:
# min(1, df_conversation_authors.n_posts)
df_conversation_authors


Unnamed: 0_level_0,Unnamed: 1_level_0,n_posts
conversation_id,author,Unnamed: 2_level_1
661614,4527514,3
661614,32464930,2
661614,50774586,4
661614,77585579,6
661614,94186704,1
...,...,...
97897532,92577819,6
97897532,94656090,3
97897532,96471467,2
97897532,96936451,11


In [48]:
# Use the public tensorflow.keras paths, matching the Sequential/Dense imports
# used elsewhere in this notebook (keras.optimizer_v2 is a private module).
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import RMSprop

# Fixed seed so that the train/test split and the weight initialisation are
# repeatable across kernel restarts.
SEED = 42
tf.random.set_seed(SEED)

# --- Train / test split --------------------------------------------------------
train, test = train_test_split(
    combined_set, test_size=0.2, shuffle=True, random_state=SEED
)
print("split training and test set")

# Inputs are the feature columns, targets are the one-hot author columns.
y = train.drop(columns=features.columns)
x = train.drop(columns=author_one_hot.columns)
print("seperated features and y with shapes (x,y)")
print(x.shape)
print(y.shape)

# --- Model definition ----------------------------------------------------------
# Every Dense layer is as wide as the number of author classes; the softmax
# output yields one probability per author.
input_shape = (x.shape[1],)
output_shape = y.shape[1]
print("inputshape is {}".format(input_shape))
model = Sequential([
    Dense(output_shape, activation='relu', input_shape=input_shape),
    Dropout(0.2),
    Dense(output_shape, activation='relu'),
    # input_shape is only meaningful on the first layer, so it is not repeated.
    Dense(output_shape, activation='softmax'),
])
print("defined model as {}".format(model.layers))

# RMSprop optimizer with categorical cross-entropy for the multi-class target.
model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy', 'mae']
)
print("compiled model")

# --- Training ------------------------------------------------------------------
model.fit(x, y, epochs=10, shuffle=True)

# --- Evaluation on the held-out test set ---------------------------------------
test_y = test.drop(columns=features.columns)
test_x = test.drop(columns=author_one_hot.columns)

loss, accuracy, mae = model.evaluate(test_x, test_y)
# These numbers come from the test split, so the message says "test set".
print("the accuracy on the test set is {} and the mae is {}".format(accuracy, mae))



split training and test set
seperated features and y with shapes (x,y)
(19597, 47)
(19597, 406)
inputshape is (47,)
defined model as [<keras.layers.core.Dense object at 0x7f86ea0b6d90>, <keras.layers.core.Dropout object at 0x7f86ea0b6910>, <keras.layers.core.Dense object at 0x7f86ea0b64f0>, <keras.layers.core.Dense object at 0x7f86b445cdc0>]
compiled model
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
the accuracy on the training set is 0.05000000074505806 and the mae is 0.004864760208874941
