In [124]:
import nvidia_smi

nvidia_smi.nvmlInit()

handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
# card id 0 hardcoded here, there is also a call to get all available card ids, so we could iterate

info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)

print("Total memory:", info.total)
print("Free memory:", info.free)
print("Used memory:", info.used)

nvidia_smi.nvmlShutdown()

Total memory: 4104323072
Free memory: 876019712
Used memory: 3228303360


In [125]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from keras import backend as K
import pickle5 as pickle

is_cuda_gpu_available = tf.test.is_gpu_available(cuda_only=True)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to only allocate 1GB of memory on the first GPU
    try:
        tf.config.set_logical_device_configuration(
            gpus[0],
            [tf.config.LogicalDeviceConfiguration(memory_limit=2024)])
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Virtual devices must be set before GPUs have been initialized
        print(e)
print("cuda gpu is available: {}".format(is_cuda_gpu_available))
with open("data/vision_forward_graph_data_05_08_22.pkl", 'rb') as f:
    df = pickle.load(f)

df.describe()

1 Physical GPUs, 1 Logical GPUs
cuda gpu is available: True


2022-08-05 16:15:46.661012: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-05 16:15:46.661808: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-05 16:15:46.662400: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-05 16:15:46.662963: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-05 16:15:46.663427: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

Unnamed: 0,reply_distance_2,reply_distance_3,timedelta,root_distance_0,root_distance_1,root_distance_2,root_distance_3,current,beam_node,has_followed_path,...,root_distance_14,root_distance_15,root_distance_16,root_distance_17,root_distance_18,root_distance_19,root_distance_20,root_distance_21,root_distance_22,root_distance_23
count,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,...,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0
mean,0.005518,0.002841,53696.46,0.038011,0.199583,0.064552,0.028038,1.48144e+18,1.481198e+18,0.0,...,0.000254,0.000278,0.000184,0.000111,8.1e-05,7.3e-05,6.5e-05,9.5e-05,8.1e-05,7.3e-05
std,0.074081,0.053229,897886.2,0.191222,0.399687,0.245734,0.165082,2.648856e+17,2.641527e+17,0.0,...,0.015947,0.016666,0.013563,0.010541,0.009027,0.008535,0.00809,0.009751,0.009027,0.008535
min,0.0,0.0,1e-06,0.0,0.0,0.0,0.0,215118.0,215118.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,2053.0,0.0,0.0,0.0,0.0,1.511883e+18,1.511768e+18,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,8869.0,0.0,0.0,0.0,0.0,1.524812e+18,1.524722e+18,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,33657.5,0.0,0.0,0.0,0.0,1.543267e+18,1.543217e+18,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,110303300.0,1.0,1.0,1.0,1.0,7.69458e+18,7.69458e+18,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [126]:
# importing utility functions
%run author_vision_util.ipynb

In [127]:
df = equalize_samples(df)
df = df[df["platform"] == "reddit"]
df.shape

chosen 21 conversations and gotten 26524 from twitter compared to 24497 from reddit


(24497, 57)

#### Create a one hot vector representation of the possible authors
- create an artificial user that represents a new user in a conversation up to that point
- get a matrix with the authors as columns and a 1 if the author wrote the post
- join it with the feature matrix
- drop the author column


In [128]:
# compute a fake user that symbolizes that the given user has not been seen at a given stage in the conversation
df_conversation_authors = df[["conversation_id", "author", "current_time"]]
first_times = df_conversation_authors.groupby(["conversation_id", "author"]).min()

def is_new_author(row):
    earliest_author_post = first_times.loc[row["conversation_id"],row["author"]]
    current_post_time = row["current_time"]
    return  earliest_author_post >= current_post_time

new_author_column = df[["conversation_id", "author", "current_time"]].apply(is_new_author, axis=1)
new_author_column= new_author_column.rename(columns={'current_time':"is_new_author"})
#new_author_column.describe()
# current author has not been the beam_node
new_author_column.head(2)

Unnamed: 0,is_new_author
49643,True
49644,True


In [129]:

author_one_hot = pd.get_dummies(df.author, prefix="Author", sparse=True)
author_one_hot = author_one_hot.astype(bool).apply(lambda x: x & ~new_author_column.is_new_author).astype(int)
labels = author_one_hot.join(new_author_column.astype(int))

In [130]:

# author_one_hot.to_pickle("data/forward_authors_encodin.pkl")
features = take_features(df, ["author", "current_time", "beam_node_time"])
#features = features.drop("author", axis=1)
combined_set = features.join(labels)
combined_set.head()

Unnamed: 0,reply_distance_2,reply_distance_3,timedelta,root_distance_0,root_distance_1,root_distance_2,root_distance_3,reply_distance_4,root_distance_4,reply_distance_5,...,Author_96936451,Author_97185894,Author_97589063,Author_98630915,Author_98760724,Author_98781300,Author_99195573,Author_99501227,Author_99879210,is_new_author
49643,0.0,0.0,8417.0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
49644,0.0,0.0,22934.0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
49645,0.0,0.0,14517.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
49646,0.0,0.0,30452.0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
49647,0.0,0.0,22035.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [131]:
from keras.layers import Dropout
from keras.optimizer_v2.rmsprop import RMSprop  # selecting train and test datasets
train, test = train_test_split(combined_set, test_size=0.2, shuffle=True)
print("split training and test set")

# train the model
y = train.drop(features.columns, axis=1)
x = train.drop(author_one_hot.columns, axis=1)
print("seperated features and y with shapes (x,y)")
print(x.shape)
print(y.shape)

# import tensorflow and train the model
# print(tf.__version__)
input_shape = (x.shape[1],)
output_shape = y.shape[1]
print("inputshape is {}".format(input_shape))
model = Sequential([
    Dense(output_shape, activation='relu', input_shape=input_shape),
    Dropout(0.2),
    Dense(output_shape, activation='relu'),
    Dense(output_shape, activation='softmax', input_shape=input_shape)
])
print("defined model as {}".format(model.layers))
# stochastic gradient descend as a classifier seem appropriate
model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy', 'mae']
)
print("compiled model")
model.fit(x, y, epochs=3)
#model.fit(x, y, epochs=10, shuffle=True)
# evaluate the model on the test set
test_y = test.drop(features.columns, axis=1)
test_x = test.drop(author_one_hot.columns, axis=1)

loss, accuracy, mae = model.evaluate(test_x, test_y)
print("the accuracy on the training set is {} and the mae is {}".format(accuracy, mae))



split training and test set
seperated features and y with shapes (x,y)
(19597, 48)
(19597, 407)
inputshape is (48,)
defined model as [<keras.layers.core.Dense object at 0x7fe478e6bc70>, <keras.layers.core.Dropout object at 0x7fe5543a9430>, <keras.layers.core.Dense object at 0x7fe5543a9a90>, <keras.layers.core.Dense object at 0x7fe554073460>]
compiled model
Epoch 1/3
Epoch 2/3
Epoch 3/3
the accuracy on the training set is 0.38408163189888 and the mae is 0.004828783683478832
