In [63]:
import nvidia_smi

# Query the memory state of the first GPU through NVML.
nvidia_smi.nvmlInit()
try:
    # Card id 0 is hardcoded here; there is also a call to get all available
    # card ids, so we could iterate over every device instead.
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)

    # Raw counters as reported by NVML (byte counts).
    print("Total memory:", info.total)
    print("Free memory:", info.free)
    print("Used memory:", info.used)
finally:
    # Always release the NVML session, even when one of the queries above fails.
    nvidia_smi.nvmlShutdown()

Total memory: 4104323072
Free memory: 876019712
Used memory: 3228303360


In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from keras import backend as K
import pickle5 as pickle

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Cap TensorFlow's allocation on the first GPU; memory_limit is given in MB,
    # so this reserves roughly 2 GB (2024 MB).
    try:
        tf.config.set_logical_device_configuration(
            gpus[0],
            [tf.config.LogicalDeviceConfiguration(memory_limit=2024)])
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Virtual devices must be set before GPUs have been initialized
        print(e)

# tf.test.is_gpu_available is deprecated and enumerates (and thereby may
# initialise) the devices, so it must not run before the logical device
# configuration above. Derive the flag from the physical device list instead.
is_cuda_gpu_available = tf.test.is_built_with_cuda() and bool(gpus)
print("cuda gpu is available: {}".format(is_cuda_gpu_available))

# NOTE(security): pickle.load can execute arbitrary code; only load files that
# were produced by a trusted pipeline.
with open("data/vision_forward_graph_data_local_05_08_22.pkl", 'rb') as f:
    df = pickle.load(f)

df.describe()

1 Physical GPUs, 1 Logical GPUs
cuda gpu is available: True


2022-08-08 09:44:41.020125: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-08 09:44:41.020451: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-08 09:44:41.020723: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-08 09:44:41.020994: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-08 09:44:41.021218: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

Unnamed: 0,timedelta,root_distance_0,current,beam_node,has_followed_path,has_follow_path,beam_node_author,conversation_id,author,same_author_path_1,...,same_author_path_15,same_author_path_16,same_author_path_18,same_author_path_20,same_author_path_17,same_author_path_19,same_author_path_21,same_author_path_23,same_author_path_25,same_author_path_22
count,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,...,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0,809823.0
mean,53696.46,0.038011,1.48144e+18,1.481198e+18,0.0,0.0,8.72757e+17,1.480454e+18,8.815487e+17,0.001287,...,2.2e-05,9e-06,5e-06,4e-06,1.6e-05,1e-05,4e-06,4e-06,1e-06,1e-06
std,897886.2,0.191222,2.648856e+17,2.641527e+17,0.0,0.0,6.262707e+17,2.624457e+17,6.254187e+17,0.035848,...,0.004715,0.00294,0.002222,0.001925,0.004007,0.003143,0.001925,0.001925,0.001111,0.001111
min,1e-06,0.0,215118.0,215118.0,0.0,0.0,42.0,1.0,42.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2053.0,0.0,1.511883e+18,1.511768e+18,0.0,0.0,2409071000.0,1.511628e+18,2515953000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8869.0,0.0,1.524812e+18,1.524722e+18,0.0,0.0,1.181668e+18,1.524646e+18,1.18928e+18,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,33657.5,0.0,1.543267e+18,1.543217e+18,0.0,0.0,1.422915e+18,1.542981e+18,1.427657e+18,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,110303300.0,1.0,7.69458e+18,7.69458e+18,0.0,0.0,1.544089e+18,1.544234e+18,1.544089e+18,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [65]:
# Importing utility functions: execute the companion notebook so the names it
# defines are loaded into this kernel's namespace.
# NOTE(review): later cells call equalize_samples and take_features, which are
# not defined in this notebook — presumably they come from
# author_vision_util.ipynb; confirm.
%run author_vision_util.ipynb

In [66]:
# Run the shared equalisation helper first (presumably it balances the number
# of samples per platform — confirm in author_vision_util), then keep only the
# Twitter rows for the remainder of the notebook.
balanced_df = equalize_samples(df)
twitter_mask = balanced_df["platform"] == "twitter"
df = balanced_df[twitter_mask]
df.shape

chosen 20 conversations and gotten 25474 from twitter compared to 24497 from reddit


(25474, 81)

#### Create a one hot vector representation of the possible authors
- create an artificial user that represents a new user in a conversation up to that point
- get a matrix with the authors as columns and a 1 if the author wrote the post
- join it with the feature matrix
- drop the author column


In [67]:
def flag_first_posts(posts):
    """Flag rows where the author has not been seen earlier in the conversation.

    A row is flagged when its ``current_time`` is not later than the earliest
    ``current_time`` of the same author within the same conversation, i.e. the
    row belongs to that author's first appearance in the conversation.

    Parameters
    ----------
    posts : pandas.DataFrame
        Needs the columns ``conversation_id``, ``author`` and ``current_time``.

    Returns
    -------
    pandas.DataFrame
        One boolean column ``is_new_author`` sharing the index of ``posts``.
    """
    # transform("min") broadcasts each (conversation, author) minimum back onto
    # the rows of that group, replacing a per-row MultiIndex lookup.
    earliest_author_post = (
        posts.groupby(["conversation_id", "author"])["current_time"].transform("min")
    )
    is_first = earliest_author_post >= posts["current_time"]
    return is_first.to_frame("is_new_author")


# The flag acts as an artificial "new user" class: it marks posts whose author
# has not appeared at that stage of the conversation.
new_author_column = flag_first_posts(df[["conversation_id", "author", "current_time"]])
new_author_column.head(2)

Unnamed: 0,is_new_author
63334,True
63335,True


In [68]:
def compute_new_author_column(df, new_author_flags=None):
    """Build the label matrix (author one-hot + "new author" class) and join it to the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``author`` column plus whatever ``take_features`` needs.
    new_author_flags : pandas.DataFrame, optional
        Frame with a boolean ``is_new_author`` column indexed like ``df``.
        Defaults to the notebook-level ``new_author_column`` so existing calls
        keep working.

    Returns
    -------
    tuple
        ``(combined_set, features, labels)`` where ``labels`` holds one
        ``Author_<id>`` column per author followed by ``is_new_author``, and
        ``combined_set`` is ``features`` joined with ``labels``.

    Raises
    ------
    ValueError
        If ``new_author_flags`` does not cover every index label of ``df``;
        the index-aligned masking below would otherwise silently produce
        wrong or missing labels.
    """
    if new_author_flags is None:
        new_author_flags = new_author_column
    if not df.index.isin(new_author_flags.index).all():
        raise ValueError("new_author_flags does not cover every row of df")

    is_new = new_author_flags.is_new_author
    author_one_hot = pd.get_dummies(df.author, prefix="Author", sparse=True)
    # Clear the author's own column wherever the post counts as "new author",
    # so each row ends up with exactly one active class.
    author_one_hot = author_one_hot.astype(bool).apply(lambda col: col & ~is_new).astype(int)
    labels = author_one_hot.join(new_author_flags.astype(int))
    # NOTE(review): take_features is defined outside this notebook; the listed
    # columns are presumably excluded from the feature set — confirm.
    features = take_features(df, ["author", "current_time", "beam_node_time"])
    combined_set = features.join(labels)
    return combined_set, features, labels

combined_set, features, labels = compute_new_author_column(df)
combined_set.head()

Unnamed: 0,timedelta,root_distance_0,same_author_path_1,root_distance_1,same_author_path_2,reply_distance_2,reply_distance_3,reply_distance_4,reply_distance_5,reply_distance_6,...,Author_1536124171972972545,Author_1537528480451076096,Author_1538177053274644482,Author_1539516006460444672,Author_1539612047951794176,Author_1540316586115887104,Author_1542056501942124544,Author_1542705767463149568,Author_1543619376213024769,is_new_author
63334,1325.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
63335,95.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
63336,260.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
63337,129.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
63338,690.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


#### Training NN to predict the author that would write next
- included a "new author" category to capture predicting unknown authors
- using multi-class classification (instead of multi-label)
- ReLU and sigmoid activation functions had the same effect
- precision grew significantly once more than 3-5 layers were added

In [70]:
# Use the public tf.keras import paths (the private keras.optimizer_v2 module
# is not a stable location) so every layer comes from the same namespace.
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import RMSprop

# Fixed seed so the split and the weight initialisation are repeatable.
SEED = 42
tf.random.set_seed(SEED)

# selecting train and test datasets
train, test = train_test_split(combined_set, test_size=0.2, shuffle=True, random_state=SEED)
print("split training and test set")

# split the training rows into label columns (y) and feature columns (x)
y = train.drop(features.columns, axis=1)
x = train.drop(labels.columns, axis=1)
print("seperated features and y with shapes:")
print(x.shape)
print(y.shape)

input_shape = (x.shape[1],)
output_shape = y.shape[1]
print("inputshape is {}".format(input_shape))

# Architecture: 6 sigmoid layers, dropout, 6 sigmoid layers, softmax output.
# Every layer is as wide as the number of classes; only the first layer needs
# the input shape.
# NOTE(review): the features are fed in unscaled (df.describe() shows timedelta
# values up to ~1e8), which likely saturates the sigmoid units — consider
# standardising the inputs. TODO confirm.
model = Sequential(
    [Dense(output_shape, activation='sigmoid', input_shape=input_shape)]
    + [Dense(output_shape, activation='sigmoid') for _ in range(5)]
    + [Dropout(0.2)]
    + [Dense(output_shape, activation='sigmoid') for _ in range(6)]
    + [Dense(output_shape, activation='softmax')]
)
print("defined model as {}".format(model.layers))

# RMSprop with categorical cross-entropy for the one-hot multi-class target.
# With this loss and one-hot targets Keras resolves 'accuracy' to categorical
# accuracy, so the two accuracy metrics below report the same value.
model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy', 'accuracy', 'mae']
)
print("compiled model")
model.fit(x, y, epochs=3)

# evaluate the model on the held-out test set
test_y = test.drop(features.columns, axis=1)
test_x = test.drop(labels.columns, axis=1)

loss, cat_accuracy, accuracy, mae = model.evaluate(test_x, test_y)
print("the accuracy on the test set is cat acc {}, reg acc {} and the mae is {}".format(cat_accuracy, accuracy, mae))

split training and test set
seperated features and y with shapes:
(20379, 71)
(20379, 634)
inputshape is (71,)
defined model as [<keras.layers.core.Dense object at 0x7fec58245370>, <keras.layers.core.Dense object at 0x7fecf027f0a0>, <keras.layers.core.Dense object at 0x7fed3c1865b0>, <keras.layers.core.Dense object at 0x7fec580eb970>, <keras.layers.core.Dense object at 0x7fec5826a310>, <keras.layers.core.Dense object at 0x7fec5826a340>, <keras.layers.core.Dropout object at 0x7fec5826aeb0>, <keras.layers.core.Dense object at 0x7fec5826a7c0>, <keras.layers.core.Dense object at 0x7fec5826a2b0>, <keras.layers.core.Dense object at 0x7fec5283b4c0>, <keras.layers.core.Dense object at 0x7fec582668b0>, <keras.layers.core.Dense object at 0x7fec58266ac0>, <keras.layers.core.Dense object at 0x7fec58245400>, <keras.layers.core.Dense object at 0x7fec58266e80>]
compiled model
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
the accuracy on

In [74]:
# Draw one random row per conversation. Sampling inside each group is already
# uniform, so no prior shuffle of the frame is needed. The index is reset, so
# the sampled rows no longer carry their original df index labels.
sample_df = (
    df.groupby('conversation_id')
    .apply(lambda x: x.sample(n=1))
    .reset_index(drop=True)
)
sample_df.head()

Unnamed: 0,timedelta,root_distance_0,current,beam_node,has_followed_path,has_follow_path,beam_node_author,platform,conversation_id,author,...,same_author_path_15,same_author_path_16,same_author_path_18,same_author_path_20,same_author_path_17,same_author_path_19,same_author_path_21,same_author_path_23,same_author_path_25,same_author_path_22
0,1016.0,0,1504746858278756354,1504742597352476685,0,0,2295967047,twitter,1504739306635735104,1024744255124893697,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,45233.0,0,1508008254919651330,1507818534801395716,0,0,1303796126978629632,twitter,1507807320172023808,1483440154077450246,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7483.0,0,1509137119863783427,1509105733568192512,0,0,712320925455273984,twitter,1509105293396951046,1482682530453143558,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3404.0,0,1510917718358306817,1510903441958318085,0,0,84103119,twitter,1510856247893008389,1439073103439704066,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1992.0,0,1511062080551587849,1511053725523230724,0,0,1487561226230059019,twitter,1511051267694542858,1087438728702898177,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:

# Build the model input for the per-conversation sample with the same column
# list that was used for the training features.
column_spec = ["author", "current_time", "beam_node_time"]
sample_features = take_features(sample_df, column_spec)
# Observation: a casual sample only reveals that it is likely to get a new
# author close to the root.
sample_features

Unnamed: 0,timedelta,root_distance_0,same_author_path_1,root_distance_1,same_author_path_2,reply_distance_2,reply_distance_3,reply_distance_4,reply_distance_5,reply_distance_6,...,same_author_path_15,same_author_path_16,same_author_path_18,same_author_path_20,same_author_path_17,same_author_path_19,same_author_path_21,same_author_path_23,same_author_path_25,same_author_path_22
0,1016.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,45233.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7483.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3404.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1992.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5199.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6556.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,5732.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,38.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,996.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:

# Score the sampled rows with the trained network, using the feature columns only.
# (Disabled variant that also rebuilt the labels for the sample:
#  sample_combined_set, sample_features, sample_author_one_hot = compute_new_author_column(sample_df))
# Observation: the last category has a stable 75 % probability (??)
sample_predictions = model.predict(sample_features)
sample_predictions

array([[1.19090117e-07, 1.22090185e-07, 1.18401026e-07, ...,
        1.20692292e-07, 1.21316262e-07, 7.25752413e-01],
       [1.19090117e-07, 1.22090185e-07, 1.18401026e-07, ...,
        1.20692292e-07, 1.21316262e-07, 7.25752413e-01],
       [1.19090117e-07, 1.22090185e-07, 1.18401026e-07, ...,
        1.20692292e-07, 1.21316262e-07, 7.25752413e-01],
       ...,
       [1.19090117e-07, 1.22090185e-07, 1.18401026e-07, ...,
        1.20692292e-07, 1.21316262e-07, 7.25752413e-01],
       [1.19090117e-07, 1.22090185e-07, 1.18401026e-07, ...,
        1.20692292e-07, 1.21316262e-07, 7.25752413e-01],
       [1.19090117e-07, 1.22090185e-07, 1.18401026e-07, ...,
        1.20692292e-07, 1.21316262e-07, 7.25752413e-01]], dtype=float32)

#### Notes
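- adding the "new author" column increased precision roughly tenfold
- categorical accuracy and regular accuracy match; this is expected, because Keras resolves the metric string `'accuracy'` to categorical accuracy when the loss is `categorical_crossentropy` and the targets are one-hot encoded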