# Author Prediction
- Check GPU stats and limit the GPU memory TensorFlow may allocate
- Import the dataset from a pickle file

In [1]:


import nvidia_smi

# Print the memory statistics of the first GPU via NVML.
nvidia_smi.nvmlInit()
try:
    # Card id 0 is hardcoded here; NVML also offers a call to enumerate all
    # available cards, so we could iterate over them if needed.
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)

    print("Total memory:", info.total)
    print("Free memory:", info.free)
    print("Used memory:", info.used)
finally:
    # Always release NVML, even when the device query above raises.
    nvidia_smi.nvmlShutdown()

Total memory: 4104323072
Free memory: 4099145728
Used memory: 5177344


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from keras import backend as K
import pickle5 as pickle

# Memory cap for the first GPU. LogicalDeviceConfiguration takes the limit in
# megabytes, so 2024 is roughly 2 GB.
GPU_MEMORY_LIMIT_MB = 2024

gpus = tf.config.list_physical_devices('GPU')
# tf.test.is_gpu_available() is deprecated (TensorFlow prints a notice pointing to
# list_physical_devices), so derive the flag from the build info and the device list.
# NOTE(review): the deprecated call may also touch the GPU before the memory limit
# below is configured — confirm; avoiding it side-steps the question.
is_cuda_gpu_available = tf.test.is_built_with_cuda() and bool(gpus)
if gpus:
    # Restrict TensorFlow to GPU_MEMORY_LIMIT_MB of memory on the first GPU
    try:
        tf.config.set_logical_device_configuration(
            gpus[0],
            [tf.config.LogicalDeviceConfiguration(memory_limit=GPU_MEMORY_LIMIT_MB)])
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Virtual devices must be set before GPUs have been initialized
        print(e)
print("cuda gpu is available: {}".format(is_cuda_gpu_available))

# Alternative (older, local) snapshot of the dataset:
#file_name = "data/vision_forward_graph_data_local_05_08_22.pkl"
file_name = "data/vision_forward_graph_data_08_09_22.pkl"
# SECURITY: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(file_name, 'rb') as f:
    df = pickle.load(f)

df.shape

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
1 Physical GPUs, 1 Logical GPUs
cuda gpu is available: True


2022-08-08 12:07:50.155690: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-08-08 12:07:50.257627: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-08 12:07:50.290045: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-08 12:07:50.290354: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

(3419848, 127)

In [3]:
# importing utility functions
# NOTE(review): equalize_samples, take_features and take_non_features are called in
# later cells but are not defined in this notebook, so they are presumably provided
# by the utility notebook executed here — confirm.
%run author_vision_util.ipynb

In [4]:
# Run the external equalize_samples helper on the full frame, then keep only the
# rows whose platform is "twitter". The trailing expression displays the new shape.
df = equalize_samples(df)
is_twitter = df["platform"] == "twitter"
df = df.loc[is_twitter]
df.shape

chosen 531 conversations and gotten 541146 from twitter compared to 514793 from reddit


(541091, 127)

#### Create a one-hot vector representation of the possible authors
- create an artificial "new author" label that marks a user who has not yet appeared in the conversation up to that point
- build a matrix with the authors as columns and a 1 where the author wrote the post
- join it with the feature matrix
- drop the author column


In [5]:
# Compute a fake "new author" flag that symbolizes that the given user has not been
# seen before at a given stage in the conversation.
df_conversation_authors = df[["conversation_id", "author", "current_time"]]

# Earliest current_time per (conversation_id, author) pair.
first_times = df_conversation_authors.groupby(["conversation_id", "author"]).min()


def is_new_author(row):
    """Row-wise reference check: is the author's earliest post time >= the row's time?

    Kept for spot checks on single rows; the full column below is computed with a
    vectorized join instead, because calling this once per row is very slow on
    hundreds of thousands of rows.

    Args:
        row: a row providing "conversation_id", "author" and "current_time".

    Returns:
        The comparison result of the looked-up ``first_times`` entry against the
        row's ``current_time``.
    """
    earliest_author_post = first_times.loc[row["conversation_id"], row["author"]]
    current_post_time = row["current_time"]
    return earliest_author_post >= current_post_time


# Vectorized equivalent of applying is_new_author to every row: attach each row's
# earliest (conversation, author) time via a join on the two key columns.
earliest_author_post = df_conversation_authors.join(
    first_times.rename(columns={"current_time": "earliest_author_post"}),
    on=["conversation_id", "author"],
)["earliest_author_post"]

# Single-column boolean frame, indexed like df, named "Author_is_new".
new_author_column = (
    earliest_author_post >= df_conversation_authors["current_time"]
).to_frame("Author_is_new")

# Original note: current author has not been the beam_node
new_author_column.value_counts()

Author_is_new
True             444673
False             96418
dtype: int64

In [None]:
def compute_new_author_column(df):
    """Build the label matrix and join it with the feature matrix.

    The labels are the one-hot encoded authors (prefix "Author") with every entry
    cleared on rows flagged in the notebook-level ``new_author_column``, plus the
    ``Author_is_new`` flag itself as an integer column.

    Relies on the global ``new_author_column`` and on ``take_features``, which is
    not defined in this notebook.

    Args:
        df: frame with an ``author`` column and the columns ``take_features`` needs.

    Returns:
        Tuple ``(combined_set, features, labels)`` where ``combined_set`` is
        ``features`` joined with ``labels``.
    """
    seen_before = ~new_author_column.Author_is_new
    one_hot = pd.get_dummies(df.author, prefix="Author", sparse=True)
    # Keep an author's 1 only on rows where that author is not flagged as new.
    one_hot = one_hot.astype(bool).apply(lambda column: column & seen_before).astype(int)
    labels = one_hot.join(new_author_column.astype(int))
    features = take_features(df, ["author", "current_time", "beam_node_time"])
    return features.join(labels), features, labels

# Build the modelling frames from the filtered data: `features` and `labels`
# separately, and `combined_set` as the join of both.
combined_set, features, labels = compute_new_author_column(df)
# combined_set.head()

#### Training NN to predict the author that would write next
- included a "new author" category to capture predicting unknown authors
- using multi-class classification (instead of multi-label)
- relu/sigmoid activation functions have same effect
- precision grew significantly when adding more than 3-5 layers

In [None]:
# Stay in the tensorflow.keras namespace: Sequential and Dense are imported from
# tensorflow.keras at the top of the notebook, and keras.optimizer_v2 is an internal
# module path rather than public API.
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import RMSprop

# Fixed seed so the train/test split is reproducible across kernel restarts.
SPLIT_SEED = 42

# selecting train and test datasets
train, test = train_test_split(combined_set, test_size=0.2, shuffle=True, random_state=SPLIT_SEED)
print("split training and test set")

# separate the feature columns (x) from the label columns (y)
y = train.drop(features.columns, axis=1)
x = train.drop(labels.columns, axis=1)
print("seperated features and y with shapes:")
print(x.shape)
print(y.shape)

# define the model: every layer is as wide as the number of label columns
input_shape = (x.shape[1],)
output_shape = y.shape[1]
print("inputshape is {}".format(input_shape))


def sigmoid_stack(count):
    """Return `count` fresh sigmoid Dense layers of width `output_shape`."""
    return [Dense(output_shape, activation='sigmoid') for _ in range(count)]


# Same architecture as before: 6 sigmoid layers, dropout, 6 sigmoid layers, softmax.
# Only the first layer needs input_shape; Sequential infers the rest.
# NOTE(review): such a deep stack of sigmoid layers may saturate and could be related
# to the constant "Author_is_new" predictions observed further down — TODO confirm.
model = Sequential(
    [Dense(output_shape, activation='sigmoid', input_shape=input_shape)]
    + sigmoid_stack(5)
    + [Dropout(0.2)]
    + sigmoid_stack(6)
    + [Dense(output_shape, activation='softmax')]
)
print("defined model as {}".format(model.layers))

# RMSprop optimizer with categorical cross-entropy for the one-hot (multi-class) labels.
# Keras resolves the 'accuracy' string based on the loss, so with this loss it
# reports the same values as 'categorical_accuracy'.
model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy', 'accuracy', 'mae']
)
print("compiled model")
model.fit(x, y, epochs=3)

# evaluate the model on the held-out test set
test_y = test.drop(features.columns, axis=1)
test_x = test.drop(labels.columns, axis=1)

loss, cat_accuracy, accuracy, mae = model.evaluate(test_x, test_y)
# These numbers come from the test split, so the message now says so.
print("the accuracy on the test set is cat acc {}, reg acc {} and the mae is {}".format(cat_accuracy, accuracy, mae))

In [None]:
import numpy as np

# Some pandas alchemy to draw one random row (n=1) from each conversation:
# shuffle the frame, group by conversation and sample within every group.
sample_df = (
    df.sample(frac=1)
    .reset_index(drop=True)
    .groupby('conversation_id')
    .apply(lambda x: x.sample(n=1))
    .reset_index(drop=True)
)
sample_features = take_features(sample_df, ["author", "current_time", "beam_node_time"])
sample_prediction = model.predict(sample_features)
# The last column is the "new author" label and should contain a high value.
# Slicing with -1: keeps the (5, 1) shape without the discouraged np.matrix class.
sample_prediction[0:5, -1:]

#### Predicting the author presence based on prediction probabilities
- compute predictions for the whole dataframe
- drop features and non-features except conversation and platform
- reshape the author columns from wide to long so that the author becomes an index level
- group by conversation and platform

In [None]:
# Score every row of the filtered frame and wrap the raw model output in a
# DataFrame whose columns carry the label names.
all_features = take_features(df, ["author", "current_time", "beam_node_time"])
column_names = labels.columns
predictions = pd.DataFrame(model.predict(all_features), columns=column_names)
print(type(predictions))
print(predictions.shape)

In [None]:
# Attach the per-author predictions to the non-feature columns of every row.
all_non_features = take_non_features(df)
print(type(all_non_features))
print(all_non_features.shape)
# `predictions` was built from a plain array and therefore has a fresh 0..n-1 index;
# give the non-feature frame the same positional index so the join lines up row by row.
# Reassigning (instead of inplace=True) avoids mutating whatever take_non_features returned.
all_non_features = all_non_features.reset_index(drop=True)
joined_dataframe = all_non_features.join(predictions)
not_needed_list = ["beam_node", "has_followed_path", "has_follow_path", "beam_node_author", "current"]
author_predictions = joined_dataframe.drop(not_needed_list, axis=1)
# The former groupby(["platform", "conversation_id"]).mean() call was removed here:
# its result was never assigned, so it only cost time.
# Keep the row position as an explicit column; it is used as an id for the reshape below.
author_predictions["id"] = author_predictions.index

In [None]:
# Summary statistics of the predicted "new author" score over all rows.
# Open question: no idea why that is the same prediction for all the rows.
# NOTE(review): a constant output might stem from the deep stack of sigmoid layers
# in the model saturating — TODO confirm.
author_predictions.Author_is_new.describe()

In [None]:
# Leave out the "new author" column so only the per-author columns remain, then
# reshape those "Author_<suffix>" columns from wide to long with the suffix as
# the new "author_id" index level.
author_predictions_existing = author_predictions.drop(columns=["Author_is_new"])
author_predictions_existing_reshaped = pd.wide_to_long(
    author_predictions_existing,
    stubnames="Author_",
    i=["platform", "conversation_id", "id"],
    j="author_id",
)
author_predictions_existing_reshaped.head(3)


In [None]:
# Mean prediction per author within each conversation and platform.
author_level_keys = ["platform", "conversation_id", "author_id"]
avg_author_pred = author_predictions_existing_reshaped.groupby(by=author_level_keys).mean()
avg_author_pred.head(3)

In [None]:
# Mean of the per-author averages within each conversation and platform.
conversation_level_keys = ["platform", "conversation_id"]
avg_conversation_pred = avg_author_pred.groupby(by=conversation_level_keys).mean()
avg_conversation_pred.head(3)


In [None]:
# Mean of the per-conversation averages for each platform.
# Observation: picking the correct author seems to be exceedingly difficult.
avg_platform_pred = avg_conversation_pred.groupby(by=["platform"]).mean()
avg_platform_pred

#### Notes
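- inserting the new author column increased precision roughly tenfold
- categorical accuracy and regular accuracy match; this is expected, because Keras resolves the `'accuracy'` metric string to categorical accuracy when the loss is categorical cross-entropy with one-hot targets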