In [None]:
import pickle
from random import sample
# PB: prediction based algorithm
# RB: response based algorithm

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that were produced by this project.
file_name_PB = "data/vision_forward_graph_data_08_09_22.pkl"
with open(file_name_PB, 'rb') as f:
    df_PB = pickle.load(f)
    #df_PB.sort_values(by="platform", inplace=True, ignore_index=True)

file_name_RB = "data/vision_graph_data_remote_23_08_22.pkl"
with open(file_name_RB, 'rb') as f:
    df_RB = pickle.load(f)

file_name_centrality = "data/author_centrality_remote.pkl"
with open(file_name_centrality, 'rb') as f:
    df_centrality = pickle.load(f)

# conversations that occur in all three datasets vs. in at least one of them
common_conversation_ids = set(df_PB.conversation_id).intersection(df_RB.conversation_id).intersection(
    df_centrality.conversation_id)
all_conversation_ids = set(df_PB.conversation_id).union(df_RB.conversation_id).union(df_centrality.conversation_id)
all_conversation_count = len(all_conversation_ids)
common_conversation_count = len(common_conversation_ids)
# reducing the sample size for testing
# random.sample() needs a sequence (sampling from a set raises TypeError on
# Python >= 3.11) and a sample size that does not exceed the population.
common_conversation_ids = sample(list(common_conversation_ids), min(700, common_conversation_count))

print("From {} conversations {} are shared in all datasets".format(all_conversation_count, common_conversation_count))


# keep only the rows of the sampled, shared conversations in every dataset
df_PB = df_PB[df_PB.conversation_id.isin(common_conversation_ids)]
df_RB = df_RB[df_RB.conversation_id.isin(common_conversation_ids)]
df_centrality = df_centrality[df_centrality.conversation_id.isin(common_conversation_ids)]
df_PB.shape

### Utility Functions

The following cell contains utility functions that are needed for all the different algorithms

In [None]:

from random import sample


def is_not_reddit_or_twitter(text):
    """Return True unless ``text`` equals "reddit" or "twitter"."""
    return text != "reddit" and text != "twitter"


def equalize_samples(df):
    """
    Down-sample the platform with more rows so that both platforms contribute
    approximately the same number of rows.

    Whole conversations of the larger platform are drawn at random until the
    rows belonging to them reach the row count of the smaller platform. All
    other conversations of the larger platform are dropped; rows of the smaller
    platform are kept untouched.

    :param df: data frame with at least the columns "platform" and "conversation_id"
    :return: filtered data frame; if one platform has no rows at all, every
        conversation of the other platform is dropped as well
    """
    is_twitter = df["platform"] == "twitter"
    is_reddit = df["platform"] == "reddit"
    twitter_data_count = int(is_twitter.sum())
    reddit_data_count = int(is_reddit.sum())

    # the platform with more rows is the one that gets down-sampled (twitter on a tie)
    if reddit_data_count > twitter_data_count:
        larger_name, smaller_name = "reddit", "twitter"
        smaller_count = twitter_data_count
        larger_conversations = set(df.loc[is_reddit, "conversation_id"])
    else:
        larger_name, smaller_name = "twitter", "reddit"
        smaller_count = reddit_data_count
        larger_conversations = set(df.loc[is_twitter, "conversation_id"])

    # random.sample() requires a sequence; sampling from a set raises TypeError on Python >= 3.11
    candidates = list(larger_conversations)
    # initialised up front so that an empty smaller platform does not leave the name unbound
    chosen_conversation_ids = []
    current_count = 0
    n = 0
    # draw a fresh sample of growing size until enough rows are covered;
    # n is bounded by the population so that sample() can never raise ValueError
    while current_count < smaller_count and n < len(candidates):
        n += 1
        chosen_conversation_ids = sample(candidates, n)
        current_count = int(df["conversation_id"].isin(chosen_conversation_ids).sum())
    print("chosen {} conversations and gotten {} rows from {} compared to {} rows from {}".format(
        n, current_count, larger_name, smaller_count, smaller_name))
    not_chosen_conversation_ids = set(candidates) - set(chosen_conversation_ids)
    return df[~df["conversation_id"].isin(not_chosen_conversation_ids)]


### Non-Features

The following cell lists columns that carry meaning within a conversation (identifiers, authors, platform) but are not used as features for training the NN.

In [None]:

# names of the columns that describe a row but are not used as NN features
non_feature_list = [
    "current",
    "beam_node",
    "conversation_id",
    "platform",
    "has_followed_path",
    "has_follow_path",
    "beam_node_author",
    "author",
]


def take_features(df, additional_non_features=[]):
    """
    Return ``df`` without the non-feature columns.

    :param df: data frame containing every column named in ``non_feature_list``
    :param additional_non_features: further column names to drop (never mutated here)
    :return: new data frame holding the remaining columns
    """
    columns_to_drop = [*non_feature_list, *additional_non_features]
    return df.drop(columns=columns_to_drop)


def take_non_features(df, additional_non_features=[]):
    """
    Return only the non-feature columns of ``df``.

    :param df: data frame to reduce
    :param additional_non_features: further column names that count as non-features
    :return: new data frame without the feature columns
    """
    columns_to_keep = [*non_feature_list, *additional_non_features]
    # everything that is not explicitly listed as a non-feature is a feature
    feature_columns = [name for name in df.columns.values if name not in columns_to_keep]
    return df.drop(columns=feature_columns)


def normalize_timedelta(df):
    """
    Min-max scale the "timedelta" column so that its values lie between 0 and 1.

    :param df: data frame with a "timedelta" column
    :return: new data frame with the scaled "timedelta" column; ``df`` itself is not modified
    """
    dt = df.timedelta
    lowest, highest = dt.min(), dt.max()
    if lowest == highest:
        # zero value range: dividing by it would turn every entry into NaN,
        # so all (non-missing) entries are mapped to 0 instead
        timedelta_normalized = (dt - lowest) * 0.0
    else:
        timedelta_normalized = (dt - lowest) / (highest - lowest)
    return df.assign(timedelta=timedelta_normalized)



#### Data Cleaning and Data Preparation
- Delete rows that are neither Twitter nor Reddit data
- normalize time deltas


In [None]:
def delete_not_twitter_not_reddit(df):
    """
    Remove every row whose platform is neither "twitter" nor "reddit".

    :param df: data frame with a "platform" column
    :return: new data frame without the rows of other platforms
    """
    is_other_platform = df.platform.apply(is_not_reddit_or_twitter)
    # rows are removed by index label
    return df.drop(index=df[is_other_platform].index)

# drop rows of other platforms from all three datasets
df_RB, df_PB, df_centrality = (
    delete_not_twitter_not_reddit(frame) for frame in (df_RB, df_PB, df_centrality)
)

# balance the platforms on the RB data and restrict the other datasets to the
# conversations that survived the balancing
df_RB = equalize_samples(df_RB)
df_PB = df_PB.loc[df_PB.conversation_id.isin(df_RB.conversation_id)]
df_centrality = df_centrality.loc[df_centrality.conversation_id.isin(df_RB.conversation_id)]

# scale the time deltas of both feature datasets
df_RB, df_PB = normalize_timedelta(df_RB), normalize_timedelta(df_PB)


# Baseline for Author vision
- uses selected values as a distance measure
- probability of having seen a tweet is reduced by a half with each step in the reply hierarchy
- probability of having seen a tweet is reduced by a quarter for each step away from the root
- probability of having seen a tweet is increased for each path in the follower network to the tweet (forthcoming)


In [None]:
# restrict the baseline to the conversations shared by all datasets so that the
# measures stay comparable
df_baseline = df_RB.loc[df_RB["conversation_id"].isin(common_conversation_ids)]

# column groups are selected by their name prefix
reply_filter_col = [name for name in df_baseline.columns if name.startswith('reply')]
root_distance_filter_col = [name for name in df_baseline.columns if name.startswith('root')]
reply_columns = df_baseline[reply_filter_col]
root_distance_columns = df_baseline[root_distance_filter_col]

# row-wise sums of both column groups
reply_cs = reply_columns.sum(axis=1)
root_distance_cs = root_distance_columns.sum(axis=1)
rcs_not_null = reply_cs[reply_cs != 0].tolist()

# combine both sums and min-max scale the result
root_reply_combined = root_distance_cs.add(reply_cs)
root_reply_combined = root_reply_combined.sub(root_reply_combined.min()).div(
    root_reply_combined.max() - root_reply_combined.min())
combined = root_reply_combined[root_reply_combined != 0].tolist()
df_baseline = df_baseline.assign(root_reply_combined=root_reply_combined)

In [None]:
df_baseline_with_authors = df_baseline[["root_reply_combined", "conversation_id", "author", "platform"]]
# mean baseline value per author; both names refer to the same frame whose
# value column is called "baseline"
baseline_predictions = baseline_gpm = (
    df_baseline_with_authors
    .groupby(["platform", "conversation_id", "author"])
    .mean()
    .rename(columns={"root_reply_combined": "baseline"})
)
baseline_predictions

In [None]:
# Aggregate from `baseline_predictions` (the per-author frame built in the cell
# above) instead of from `baseline_gpm`: this cell re-binds `baseline_gpm` to the
# per-platform result, so reading it as input made a second run of the cell fail.
baseline_gpm_conversation = baseline_predictions.groupby(by=["platform", "conversation_id"]).mean()
baseline_gpm = baseline_gpm_conversation.groupby(by=["platform"]).mean()
baseline_gpm


# Repetition Probabilities

#### Analyzing the probability of an author writing repeatedly in the same conversation
1. sum up the amounts y == 1 (because an author has answered himself)
2. sum chances of an author seeing himself write
3. calculate a measure of how likely it is that an author sees himself repeated as a test for the nn

In [None]:
import pandas as pd

# sum of y per "current" value, averaged per conversation
author_count_columns = ["current", "conversation_id", "platform", "y"]
author_df = (
    df_RB[author_count_columns]
    .groupby(["platform", "conversation_id", "current"]).sum()
    .groupby(["platform", "conversation_id"]).mean()
)

# number of rows per "current" value, averaged per conversation
distinct_view_columns = ["current", "conversation_id", "platform"]
distinct_view_df = df_RB[distinct_view_columns]
distinct_views = (
    distinct_view_df
    .groupby(["current", "conversation_id", "platform"]).size().to_frame('size')
    .groupby(["platform", "conversation_id"]).mean()
)

# ratio of both averages per conversation
joined_author_stats = author_df.join(distinct_views)
joined_author_stats = joined_author_stats["y"] / joined_author_stats["size"]

# prepare for comparison
repetition_predictions = joined_author_stats.to_frame("repetition")

joined_author_stats

The joined author stats show the repetition probabilities for each of the platforms per conversation.

In [None]:
# mean repetition probability per platform
repetition_probability = (
    joined_author_stats
    .groupby(level="platform")
    .mean()
    .to_frame("repetition_probs")
)
repetition_probability


# The Response Based Author vision Algorithm (RB)

The features are the distance of the author to any tweet in the conversation
indicated by the following structures:
- subtree to viewed tweet from a tweet the author wrote
- root closeness of viewed tweet
- time delta to viewed tweet from tweets the author wrote

#### Loading the data from the pickled version
1. importing libraries
2. checking gpu support


In [None]:
#import modin.pandas as pd
import pandas as pd

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# tf.test.is_gpu_available() is deprecated; ask for the visible GPU devices
# instead and keep the CUDA-only meaning by also checking the build.
is_cuda_gpu_available = tf.test.is_built_with_cuda() and bool(tf.config.list_physical_devices("GPU"))
print("cuda gpu is available: {}".format(is_cuda_gpu_available))

In [None]:
# analyze the distribution of reached targets (label y) for the sample
for platform_name in ("reddit", "twitter"):
    print("{}:".format(platform_name))
    print(df_RB[df_RB["platform"] == platform_name].y.value_counts())
# this should be higher for reddit as the unique author / posting ratio is lower for reddit

### Computing a nn model
1. separate features
2. train models for reddit and twitter
3. inspect models for reddit and twitter
4. predict the likelihood based on the author has seen a posting
5. aggregate likelihoods in order to compute author vision measure


In [None]:
# training functions
def train_model(df, test_size=0.2, random_state=None, epochs=1):
    """
    Train a single-unit sigmoid classifier on the feature columns of ``df``.

    :param df: data frame holding the non-feature columns, the feature columns and the label column "y"
    :param test_size: fraction of the rows held out for the evaluation
    :param random_state: seed forwarded to ``train_test_split``; None keeps the split random
    :param epochs: number of training epochs
    :return: tuple ``(x, y, test_x, test_y, model)`` with the training features and
        labels, the held-out features and labels, and the fitted model
    """
    # keep only the feature columns and the label "y"
    df = take_features(df)

    # selecting train and test datasets
    train, test = train_test_split(df, test_size=test_size, random_state=random_state)

    y = train.y
    x = train.drop("y", axis=1)
    print(x.shape)
    print(y.shape)

    print(tf.__version__)
    input_shape = (x.shape[1],)
    model = Sequential([
        Dense(1, activation='sigmoid', input_shape=input_shape)
    ])

    # stochastic gradient descent with a binary cross-entropy loss
    model.compile(
        optimizer='sgd',
        loss='binary_crossentropy',
        metrics=['accuracy', 'mae']
    )
    model.fit(x, y, epochs=epochs)

    # evaluate the model on the held-out test set
    test_y = test.y
    test_x = test.drop("y", axis=1)

    loss, accuracy, mae = model.evaluate(test_x, test_y)
    print("the accuracy on the test set is {} and the mae is {}".format(accuracy, mae))

    return x, y, test_x, test_y, model


def inspect_model(x, y, test_x, test_y, model):
    """
    Print a sample prediction and the weights of the first features of ``model``.

    :param x: training features; only the column names are used
    :param y: training labels (unused)
    :param test_x: held-out features; must contain the column "reply_distance_2"
    :param test_y: held-out labels (unused)
    :param model: fitted model offering ``predict`` and ``layers[0].get_weights()``
    :return: None
    """
    # have a look at some predictions for rows where reply_distance_2 is set
    reply_distance_2 = test_x[test_x["reply_distance_2"] == 1]
    first_rows = reply_distance_2.head(2)
    print(first_rows)
    # the prediction has to be printed explicitly; a bare expression inside a
    # function is not displayed by the notebook
    print(model.predict(first_rows))

    # weights of the first layer, one entry per input feature
    first_layer_weights = model.layers[0].get_weights()[0]
    for column_name, weight in zip(x.columns.values[:5], first_layer_weights):
        print("feature {} has weight {} \n".format(column_name, weight))


In [None]:
# train one model per platform and predict on all rows of that platform
tw_df = df_RB[df_RB["platform"] == "twitter"]
tw_x, tw_y, tw_test_x, tw_test_y, tw_model = train_model(tw_df)

rd_df = df_RB[df_RB["platform"] == "reddit"]
rd_x, rd_y, rd_test_x, rd_test_y, rd_model = train_model(rd_df)

# inspect_model(tw_x, tw_y, tw_test_x, tw_test_y, tw_model)
# inspect_model(rd_x, rd_y, rd_test_x, rd_test_y, rd_model)

tw_non_features = take_non_features(tw_df)
rd_non_features = take_non_features(rd_df)

tw_features_y = take_features(tw_df)
tw_features = tw_features_y.drop("y", axis=1)
rd_features_y = take_features(rd_df)
rd_features = rd_features_y.drop("y", axis=1)
rd_predictions = rd_model.predict(rd_features)
tw_predictions = tw_model.predict(tw_features)

# attach the predictions to the descriptive (non-feature) columns
tw_vision = tw_non_features.assign(predictions=tw_predictions)
rd_vision = rd_non_features.assign(predictions=rd_predictions)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
combined_vision = pd.concat([tw_vision, rd_vision])
not_needed_list = ["beam_node_author", "beam_node", "has_followed_path", "has_follow_path"]
combined_vision = combined_vision.drop(not_needed_list, axis=1)
combined_vision_with_author = combined_vision
combined_vision

In [None]:
# count the rows per distinct prediction value of every author
combined_vision_with_author2 = (
    combined_vision_with_author
    .groupby(["platform", "conversation_id", "author", "predictions"])
    .count()
    .reset_index()
)
# (a groupby(...).sum() whose result was thrown away used to run here; it had no
# effect and was removed)
# NOTE(review): this divides each prediction value by the number of rows sharing
# that value - confirm that this is the intended averaging.
combined_vision_with_author2["avg_predictions"] = (
    combined_vision_with_author2.predictions / combined_vision_with_author2.current
)
combined_vision_with_author2 = combined_vision_with_author2.drop(["current", "predictions"], axis=1)
# mean per author, then per conversation
combined_vision_with_author2 = combined_vision_with_author2.groupby(["platform", "conversation_id", "author"]).mean()
rb_result = combined_vision_with_author2.groupby(["platform", "conversation_id"]).mean()

# prepare for comparison
RB_predictions = combined_vision_with_author2.rename(columns={"avg_predictions": "RB"})

rb_result = rb_result.groupby(["platform"]).mean()
rb_result

In [None]:
# errors="ignore" keeps the cell re-runnable: on a second run the "author"
# column is already gone and a plain drop would raise a KeyError
combined_vision = combined_vision.drop(columns="author", errors="ignore")
gpm = combined_vision.groupby(["platform", "conversation_id", "current"]).mean()
gpm_per_conversation = gpm.groupby(by=["platform", "conversation_id"]).mean()
# NOTE(review): the platform mean is taken over `gpm`, not over
# `gpm_per_conversation` - confirm that skipping the per-conversation average is intended.
gpm_per_platform = gpm.groupby(by=["platform"]).mean()
gpm_per_platform

In [None]:
# put the repetition probabilities next to the per-platform RB means and correlate the columns
probabilities = repetition_probability.join(gpm_per_platform, how="left")
probabilities.corr(method="pearson")

### Interpretation of the correlation between the probabilities and the RB predictions
- This means that the neural network computes a linear function of the repetition probabilities based on the computation of the y functions
- The probabilities are very low for both reddit and twitter but in a comparable area



# Author Prediction

It is possible to predict an author or a "new author" at the same time by defining categories as 1 if an author is to be predicted, but
only if it is not a new author. Because of memory constraints, only Twitter or Reddit data can be predicted in one run.
The full dataset does not fit in laptops memory and is computed on the cluster (which in turn has no gpu support)

The probability of predicting an author is calculated for each relationship (root distance to another node, reply distance to other nodes, and reply distance to nodes with the same author). In the future, the author follower network will also be included in the feature set.

The overall sum of the probability of predicting an author (on average) will be interpreted as the likelihood of any author writing at any time in the conversation (again, because it is not a new author). This will then be seen as the author being present in the conversation, because it is another measure of an author being available in all branches and positions of the conversation.

#### Create a one hot vector representation of the possible authors
- create an artificial user that represents a new user in a conversation up to that point
- get a matrix with the authors as columns and a 1 if the author wrote the post
- join it with the feature matrix
- drop the author column


#### Training NN to predict the author that would write next
- included a "new author" category to capture predicting unknown authors
- using multi-class classification (instead of multi-label)
- relu/sigmoid activation functions have same effect
- precision grew significantly when adding more than 3-5 layers

#### Predicting the author presence based on prediction probabilities
- compute predictions for the whole dataframe
- drop features and non-features except conversation and platform
- wide to long the authors to make them a index
- groupby conversation and platform

#### Notes
- inserting the new author column increased precision times 10
- categorical accuracy and regular accuracy match (which is weird)



In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def calculate_author_predictions(df):
    """
    Train a network that predicts the author of a row and return the mean
    predicted probability per conversation and author.

    :param df: data frame with the non-feature columns, the feature columns and
        the columns "current_time" and "beam_node_time"
    :return: long-format data frame produced by ``pd.wide_to_long``, indexed by
        "platform", "conversation_id" and "author_id", with the value column "Author_"
    """
    import pandas as pd

    # A row counts as written by a "new author" if its time is not later than
    # the earliest time of that author within the conversation.
    first_times = df.groupby(["conversation_id", "author"])["current_time"].transform("min")
    new_author_column = (first_times >= df["current_time"]).to_frame("Author_is_new")

    def compute_new_author_column(df):
        """Build the one-hot author labels (plus "Author_is_new") and join them to the features."""
        author_one_hot = pd.get_dummies(df.author, prefix="Author", sparse=True)
        # make author cells 0 that are now represented as "new author"
        author_one_hot = author_one_hot.astype(bool).apply(lambda x: x & ~new_author_column.Author_is_new).astype(int)
        # delete columns that are all 0
        author_one_hot = author_one_hot.loc[:, (author_one_hot != 0).any(axis=0)]
        # join the new author column to the labels
        labels = author_one_hot.join(new_author_column.astype(int))
        features = take_features(df, ["current_time", "beam_node_time"])
        combined_set = features.join(labels)
        return combined_set, features, labels

    combined_set, features, labels = compute_new_author_column(df)

    # selecting train and test datasets (unshuffled, so the last 20% of the rows are held out)
    train, test = train_test_split(combined_set, test_size=0.2, shuffle=False)
    print("split training and test set")

    # separate the label columns from the feature columns
    y = train.drop(features.columns, axis=1)
    x = train.drop(labels.columns, axis=1)
    print("seperated features and y with shapes:")
    print(x.shape)
    print(y.shape)

    input_shape = (x.shape[1],)
    output_shape = y.shape[1]
    print("inputshape is {}".format(input_shape))
    # only the first layer of a Sequential model needs the input shape
    model = Sequential([
        Dense(output_shape, activation='relu', input_shape=input_shape),
        Dense(output_shape, activation='relu'),
        Dense(output_shape, activation='relu'),
        Dense(output_shape, activation='softmax')
    ])
    print("defined model as {}".format(model.layers))
    # RMSprop is taken from the public tf.keras namespace; the private module
    # path keras.optimizer_v2 is not available in every Keras release
    model.compile(
        optimizer=tf.keras.optimizers.RMSprop(),
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy', 'accuracy', 'mae']
    )
    print("compiled model")
    model.fit(x, y)

    # evaluate the model on the held-out test set
    test_y = test.drop(features.columns, axis=1)
    test_x = test.drop(labels.columns, axis=1)

    loss, cat_accuracy, accuracy, mae = model.evaluate(test_x, test_y)
    print("the accuracy on the test set is cat acc {}, reg acc {} and the mae is {}".format(cat_accuracy, accuracy, mae))

    print("start generating author predictions for the whole data set")
    # `features` already holds the feature columns of the whole data frame
    predictions = model.predict(features)
    print("end generating author predictions for the whole data set")
    column_names = labels.columns
    predictions = pd.DataFrame(predictions, columns=column_names)
    print(type(predictions))
    print(predictions.shape)

    # a fresh positional index aligns the rows with the prediction frame; the
    # non-inplace call avoids mutating a slice of ``df``
    all_non_features = df[["conversation_id", "platform"]].reset_index(drop=True)
    print(type(all_non_features))
    print(all_non_features.shape)
    joined_dataframe = all_non_features.join(predictions)

    # mean predicted probability of every author per conversation
    joined_dataframe = joined_dataframe.groupby(["platform", "conversation_id"]).mean()
    author_predictions_existing = joined_dataframe.drop(["Author_is_new"], axis=1)
    author_predictions_existing = author_predictions_existing.reset_index(level=['platform', 'conversation_id'])
    print("start converting author hot vectors beack to one author column")
    # NOTE(review): wide_to_long only picks up columns whose suffix matches its
    # default numeric pattern - presumably author ids are numeric; verify.
    author_predictions_existing_reshaped = pd.wide_to_long(author_predictions_existing, stubnames="Author_", i=['platform', 'conversation_id'], j="author_id")
    print("end converting author hot vectors beack to one author column")
    return author_predictions_existing_reshaped

In [None]:
# the author prediction is run separately for each platform
df_PB_reddit = df_PB.loc[df_PB["platform"] == "reddit"]
prediction_result_reddit = calculate_author_predictions(df_PB_reddit)

df_PB_twitter = df_PB.loc[df_PB["platform"] == "twitter"]
prediction_result_twitter = calculate_author_predictions(df_PB_twitter)

In [None]:
# stack the results of both platforms; DataFrame.append was removed in
# pandas 2.0, pd.concat stacks the rows the same way
PB_predictions = pd.concat([prediction_result_reddit, prediction_result_twitter])
PB_predictions = PB_predictions.rename(columns={"Author_": "PB"})
PB_predictions

# Author Centrality
- The centrality was already computed when creating the dataset as it is based on graph measures primarily


In [None]:
# mean values per author, reduced to a single "centrality" column:
# the centrality score divided by the average root distance
df_centrality_avg = (
    df_centrality
    .groupby(["platform", "conversation_id", "author"])
    .mean()
    .assign(centrality=lambda frame: frame.centrality_score / frame.root_distance_avg)
    .drop(columns=["centrality_score", "root_distance_avg"])
)
df_centrality_avg

# Combined Analysis
- join three author vision measures into one dataframe
- add author centrality to the same dataframe
- correlate the measures


In [None]:
# NOTE(review): RB_predictions is indexed by "author" while PB_predictions uses
# "author_id" as its third index level - confirm that this join matches authors as intended.
prediction_comparison_table = (
    RB_predictions
    .join(PB_predictions)
    .join(df_centrality_avg)
    .join(baseline_predictions)
    .groupby(["platform", "conversation_id"])
    .mean()
    .join(repetition_predictions)
)
prediction_comparison_table

In [None]:
from matplotlib import pyplot as plt
import seaborn as sn

# Pearson correlation between all measures, drawn as an annotated heatmap
corr_matrix = prediction_comparison_table.corr(method="pearson")
fig, ax = plt.subplots()
sn.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

#### Data Cleaning and Data Preparation
- Delete rows that are neither Twitter nor Reddit data
- normalize time deltas


In [None]:
# NOTE(review): this re-defines the function of the same name from the earlier
# data cleaning cell.
def delete_not_twitter_not_reddit(df):
    """
    Remove every row whose platform is neither "twitter" nor "reddit".

    :param df: data frame with a "platform" column
    :return: new data frame without the rows of other platforms
    """
    is_other_platform = df.platform.apply(is_not_reddit_or_twitter)
    # rows are removed by index label
    return df.drop(index=df[is_other_platform].index)

# drop rows of other platforms from all three datasets
df_RB, df_PB, df_centrality = (
    delete_not_twitter_not_reddit(frame) for frame in (df_RB, df_PB, df_centrality)
)

# balance the platforms on the RB data and restrict the other datasets to the
# conversations that survived the balancing
df_RB = equalize_samples(df_RB)
df_PB = df_PB.loc[df_PB.conversation_id.isin(df_RB.conversation_id)]
df_centrality = df_centrality.loc[df_centrality.conversation_id.isin(df_RB.conversation_id)]

# scale the time deltas of both feature datasets
df_RB, df_PB = normalize_timedelta(df_RB), normalize_timedelta(df_PB)


# Baseline for Author vision
- uses selected values as a distance measure
- probability of having seen a tweet is reduced by a half with each step in the reply hierarchy
- probability of having seen a tweet is reduced by a quarter for each step away from the root
- probability of having seen a tweet is increased for each path in the follower network to the tweet (forthcoming)


In [None]:
# restrict the baseline to the conversations shared by all datasets so that the
# measures stay comparable
df_baseline = df_RB.loc[df_RB["conversation_id"].isin(common_conversation_ids)]

# column groups are selected by their name prefix
reply_filter_col = [name for name in df_baseline.columns if name.startswith('reply')]
root_distance_filter_col = [name for name in df_baseline.columns if name.startswith('root')]
reply_columns = df_baseline[reply_filter_col]
root_distance_columns = df_baseline[root_distance_filter_col]

# row-wise sums of both column groups
reply_cs = reply_columns.sum(axis=1)
root_distance_cs = root_distance_columns.sum(axis=1)
rcs_not_null = reply_cs[reply_cs != 0].tolist()

# combine both sums and min-max scale the result
root_reply_combined = root_distance_cs.add(reply_cs)
root_reply_combined = root_reply_combined.sub(root_reply_combined.min()).div(
    root_reply_combined.max() - root_reply_combined.min())
combined = root_reply_combined[root_reply_combined != 0].tolist()
df_baseline = df_baseline.assign(root_reply_combined=root_reply_combined)

In [None]:
df_baseline_with_authors = df_baseline[["root_reply_combined", "conversation_id", "author", "platform"]]
# mean per author, then per conversation
baseline_gpm_conversation = (
    df_baseline_with_authors
    .groupby(["platform", "conversation_id", "author"]).mean()
    .groupby(by=["platform", "conversation_id"]).mean()
)
# mean of the conversation means per platform
baseline_gpm = baseline_gpm_conversation.groupby(by=["platform"]).mean()
baseline_gpm


# Repetition Probabilities

#### Analyzing the probability of an author writing repeatedly in the same conversation
1. sum up the amounts y == 1 (because an author has answered himself)
2. sum chances of an author seeing himself write
3. calculate a measure of how likely it is that an author sees himself repeated as a test for the nn

In [None]:
# sum of y per "current" value, averaged per conversation
author_count_columns = ["current", "conversation_id", "platform", "y"]
author_df = (
    df_RB[author_count_columns]
    .groupby(["platform", "conversation_id", "current"]).sum()
    .groupby(["platform", "conversation_id"]).mean()
)

# number of rows per "current" value, averaged per conversation
distinct_view_columns = ["current", "conversation_id", "platform"]
distinct_view_df = df_RB[distinct_view_columns]
distinct_views = (
    distinct_view_df
    .groupby(["current", "conversation_id", "platform"]).size().to_frame('size')
    .groupby(["platform", "conversation_id"]).mean()
)

# ratio of both averages per conversation
joined_author_stats = author_df.join(distinct_views)
joined_author_stats = joined_author_stats["y"] / joined_author_stats["size"]
joined_author_stats

The joined author stats show the repetition probabilities for each of the platforms per conversation.

In [None]:
# mean repetition probability per platform
repetition_probability = (
    joined_author_stats
    .groupby(level="platform")
    .mean()
    .to_frame("repetition_probs")
)
repetition_probability


# The Response Based Author vision Algorithm (RB)

The features are the distance of the author to any tweet in the conversation
indicated by the following structures:
- subtree to viewed tweet from a tweet the author wrote
- root closeness of viewed tweet
- time delta to viewed tweet from tweets the author wrote

#### Loading the data from the pickled version
1. importing libraries
2. checking gpu support


In [None]:
#import modin.pandas as pd
import pandas as pd

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# tf.test.is_gpu_available() is deprecated; ask for the visible GPU devices
# instead and keep the CUDA-only meaning by also checking the build.
is_cuda_gpu_available = tf.test.is_built_with_cuda() and bool(tf.config.list_physical_devices("GPU"))
print("cuda gpu is available: {}".format(is_cuda_gpu_available))

In [None]:
# analyze the distribution of reached targets (label y) for the sample
for platform_name in ("reddit", "twitter"):
    print("{}:".format(platform_name))
    print(df_RB[df_RB["platform"] == platform_name].y.value_counts())
# this should be higher for reddit as the unique author / posting ratio is lower for reddit

### Computing a nn model
1. separate features
2. train models for reddit and twitter
3. inspect models for reddit and twitter
4. predict the likelihood based on the author has seen a posting
5. aggregate likelihoods in order to compute author vision measure


In [None]:
# training functions
# NOTE(review): this re-defines train_model from the earlier RB training cell.
def train_model(df, test_size=0.2, random_state=None, epochs=1):
    """
    Train a single-unit sigmoid classifier on the feature columns of ``df``.

    :param df: data frame holding the non-feature columns, the feature columns and the label column "y"
    :param test_size: fraction of the rows held out for the evaluation
    :param random_state: seed forwarded to ``train_test_split``; None keeps the split random
    :param epochs: number of training epochs
    :return: tuple ``(x, y, test_x, test_y, model)`` with the training features and
        labels, the held-out features and labels, and the fitted model
    """
    # keep only the feature columns and the label "y"
    df = take_features(df)

    # selecting train and test datasets
    train, test = train_test_split(df, test_size=test_size, random_state=random_state)

    y = train.y
    x = train.drop("y", axis=1)
    print(x.shape)
    print(y.shape)

    print(tf.__version__)
    input_shape = (x.shape[1],)
    model = Sequential([
        Dense(1, activation='sigmoid', input_shape=input_shape)
    ])

    # stochastic gradient descent with a binary cross-entropy loss
    model.compile(
        optimizer='sgd',
        loss='binary_crossentropy',
        metrics=['accuracy', 'mae']
    )
    model.fit(x, y, epochs=epochs)

    # evaluate the model on the held-out test set
    test_y = test.y
    test_x = test.drop("y", axis=1)

    loss, accuracy, mae = model.evaluate(test_x, test_y)
    print("the accuracy on the test set is {} and the mae is {}".format(accuracy, mae))

    return x, y, test_x, test_y, model


def inspect_model(x, y, test_x, test_y, model):
    """
    Print a sample prediction and the weights of the first features of ``model``.

    :param x: training features; only the column names are used
    :param y: training labels (unused)
    :param test_x: held-out features; must contain the column "reply_distance_2"
    :param test_y: held-out labels (unused)
    :param model: fitted model offering ``predict`` and ``layers[0].get_weights()``
    :return: None
    """
    # have a look at some predictions for rows where reply_distance_2 is set
    reply_distance_2 = test_x[test_x["reply_distance_2"] == 1]
    first_rows = reply_distance_2.head(2)
    print(first_rows)
    # the prediction has to be printed explicitly; a bare expression inside a
    # function is not displayed by the notebook
    print(model.predict(first_rows))

    # weights of the first layer, one entry per input feature
    first_layer_weights = model.layers[0].get_weights()[0]
    for column_name, weight in zip(x.columns.values[:5], first_layer_weights):
        print("feature {} has weight {} \n".format(column_name, weight))


In [None]:
# train one model per platform and predict on all rows of that platform
tw_df = df_RB[df_RB["platform"] == "twitter"]
tw_x, tw_y, tw_test_x, tw_test_y, tw_model = train_model(tw_df)

rd_df = df_RB[df_RB["platform"] == "reddit"]
rd_x, rd_y, rd_test_x, rd_test_y, rd_model = train_model(rd_df)

# inspect_model(tw_x, tw_y, tw_test_x, tw_test_y, tw_model)
# inspect_model(rd_x, rd_y, rd_test_x, rd_test_y, rd_model)

tw_non_features = take_non_features(tw_df)
rd_non_features = take_non_features(rd_df)

tw_features_y = take_features(tw_df)
tw_features = tw_features_y.drop("y", axis=1)
rd_features_y = take_features(rd_df)
rd_features = rd_features_y.drop("y", axis=1)
rd_predictions = rd_model.predict(rd_features)
tw_predictions = tw_model.predict(tw_features)

# attach the predictions to the descriptive (non-feature) columns
tw_vision = tw_non_features.assign(predictions=tw_predictions)
rd_vision = rd_non_features.assign(predictions=rd_predictions)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
combined_vision = pd.concat([tw_vision, rd_vision])
not_needed_list = ["beam_node_author", "beam_node", "has_followed_path", "has_follow_path"]
combined_vision = combined_vision.drop(not_needed_list, axis=1)
combined_vision_with_author = combined_vision
combined_vision

In [None]:
# count the rows per distinct prediction value of every author
combined_vision_with_author2 = (
    combined_vision_with_author
    .groupby(["platform", "conversation_id", "author", "predictions"])
    .count()
    .reset_index()
)
# (a groupby(...).sum() whose result was thrown away used to run here; it had no
# effect and was removed)
# NOTE(review): this divides each prediction value by the number of rows sharing
# that value - confirm that this is the intended averaging.
combined_vision_with_author2["avg_predictions"] = (
    combined_vision_with_author2.predictions / combined_vision_with_author2.current
)
combined_vision_with_author2 = combined_vision_with_author2.drop(["current", "predictions"], axis=1)
# mean per author, then per conversation, then per platform
combined_vision_with_author2 = combined_vision_with_author2.groupby(["platform", "conversation_id", "author"]).mean()
combined_vision_with_author2 = combined_vision_with_author2.groupby(["platform", "conversation_id"]).mean()
combined_vision_with_author2 = combined_vision_with_author2.groupby(["platform"]).mean()
rb_result = combined_vision_with_author2
rb_result

In [None]:
# errors="ignore" keeps the cell re-runnable: on a second run the "author"
# column is already gone and a plain drop would raise a KeyError
combined_vision = combined_vision.drop(columns="author", errors="ignore")
gpm = combined_vision.groupby(["platform", "conversation_id", "current"]).mean()
gpm_per_conversation = gpm.groupby(by=["platform", "conversation_id"]).mean()
# NOTE(review): the platform mean is taken over `gpm`, not over
# `gpm_per_conversation` - confirm that skipping the per-conversation average is intended.
gpm_per_platform = gpm.groupby(by=["platform"]).mean()
gpm_per_platform

In [None]:
# put the repetition probabilities next to the per-platform RB means and correlate the columns
probabilities = repetition_probability.join(gpm_per_platform, how="left")
probabilities.corr(method="pearson")

### Interpretation of the correlation between the probabilities and the RB predictions
- This means that the neural network computes a linear function of the repetition probabilities based on the computation of the y functions
- The probabilities are very low for both reddit and twitter but in a comparable area



# Author Prediction

It is possible to predict an author or a "new author" at the same time by defining categories as 1 if an author is to be predicted, but
only if it is not a new author. Because of memory constraints, only Twitter or Reddit data can be predicted in one run.
The full dataset does not fit in laptops memory and is computed on the cluster (which in turn has no gpu support)

The probability of predicting an author is calculated for each relationship (root distance to another node, reply distance to other nodes, and reply distance to nodes with the same author). In the future, the author follower network will also be included in the feature set.

The overall sum of the probability of predicting an author (on average) will be interpreted as the likelihood of any author writing at any time in the conversation (again, because it is not a new author). This will then be seen as the author being present in the conversation, because it is another measure of an author being available in all branches and positions of the conversation.

#### Create a one hot vector representation of the possible authors
- create an artificial user that represents a new user in a conversation up to that point
- get a matrix with the authors as columns and a 1 if the author wrote the post
- join it with the feature matrix
- drop the author column


#### Training NN to predict the author that would write next
- included a "new author" category to capture predicting unknown authors
- using multi-class classification (instead of multi-label)
- relu/sigmoid activation functions have same effect
- precision grew significantly when adding more than 3-5 layers

#### Predicting the author presence based on prediction probabilities
- compute predictions for the whole dataframe
- drop features and non-features except conversation and platform
- wide to long the authors to make them a index
- groupby conversation and platform

#### Notes
- inserting the new author column increased precision times 10
- categorical accuracy and regular accuracy match (which is weird)



In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def calculate_author_predictions(df, epochs=1, test_size=0.2):
    """
    Train a dense network that predicts the author of a post and average the predicted
    probabilities per conversation.

    Every row is labelled with a one hot vector of its author. Rows that are the earliest
    post of their author within a conversation are labelled with the artificial category
    "Author_is_new" instead, so unknown authors can be predicted as well.

    :param df: data frame with the columns "conversation_id", "author", "current_time" and
        "platform" plus the columns that `take_features` selects as features
    :param epochs: number of training epochs (the default of 1 matches a plain `model.fit`)
    :param test_size: fraction of the rows held out for evaluation; the split is not shuffled
    :return: tuple of two data frames
        - the mean predicted probability of every label column (including "Author_is_new")
          per (platform, conversation_id)
        - the same means without "Author_is_new", reshaped with `pd.wide_to_long` so that
          the author becomes the index level "author_id"
    """
    # local import so that `pd` is available in the whole function and not only while
    # building the one hot vectors
    import pandas as pd

    # --- labels ---------------------------------------------------------------------------
    # a fake user symbolizes that the given author has not been seen at that stage in the
    # conversation: the flag is set for the earliest post of an author per conversation
    first_post_time = df.groupby(["conversation_id", "author"])["current_time"].transform("min")
    is_new_author = (first_post_time >= df["current_time"]).rename("Author_is_new")

    author_one_hot = pd.get_dummies(df.author, prefix="Author", sparse=True)
    # make author cells 0 that are now represented as "new author"
    author_one_hot = author_one_hot.astype(bool).apply(lambda column: column & ~is_new_author).astype(int)
    # delete columns that are all 0
    author_one_hot = author_one_hot.loc[:, (author_one_hot != 0).any(axis=0)]
    # join the new author column to the labels
    labels = author_one_hot.join(is_new_author.astype(int))

    features = take_features(df, ["current_time", "beam_node_time"])
    combined_set = features.join(labels)

    # --- train / test split ---------------------------------------------------------------
    # shuffle=False keeps the row order, i.e. the last rows of the frame form the test set
    train, test = train_test_split(combined_set, test_size=test_size, shuffle=False)
    print("split training and test set")

    y = train.drop(features.columns, axis=1)
    x = train.drop(labels.columns, axis=1)
    print("seperated features and y with shapes:")
    print(x.shape)
    print(y.shape)

    # --- model ----------------------------------------------------------------------------
    input_shape = (x.shape[1],)
    output_shape = y.shape[1]
    print("inputshape is {}".format(input_shape))
    # six relu layers followed by a softmax layer, all as wide as the number of labels;
    # only the first layer needs to be told the input shape
    model = Sequential(
        [Dense(output_shape, activation='relu', input_shape=input_shape)]
        + [Dense(output_shape, activation='relu') for _ in range(5)]
        + [Dense(output_shape, activation='softmax')]
    )
    print("defined model as {}".format(model.layers))
    # multi-class classification: softmax output with categorical cross-entropy, optimized
    # with RMSprop (taken from the public tf.keras namespace instead of a private module path)
    model.compile(
        optimizer=tf.keras.optimizers.RMSprop(),
        loss='categorical_crossentropy',
        metrics=['categorical_accuracy', 'accuracy', 'mae']
    )
    print("compiled model")
    model.fit(x, y, epochs=epochs)

    # --- evaluation on the held out rows --------------------------------------------------
    test_y = test.drop(features.columns, axis=1)
    test_x = test.drop(labels.columns, axis=1)
    loss, cat_accuracy, accuracy, mae = model.evaluate(test_x, test_y)
    print("the accuracy on the test set is cat acc {}, reg acc {} and the mae is {}".format(cat_accuracy, accuracy, mae))

    # --- predictions for the whole data set -----------------------------------------------
    all_features = take_features(df, ["current_time", "beam_node_time"])
    print("start generating author predictions for the whole data set")
    # `use_multiprocessing` only applies to generator input and is not accepted by newer
    # Keras versions, so it is not passed for the in-memory frame
    predictions = model.predict(all_features)
    print("end generating author predictions for the whole data set")
    predictions = pd.DataFrame(predictions, columns=labels.columns)
    print(type(predictions))
    print(predictions.shape)

    # the predictions carry a fresh 0..n-1 index, so the id columns are re-indexed the same
    # way before both are joined row by row
    all_non_features = df[["conversation_id", "platform"]].reset_index(drop=True)
    print(type(all_non_features))
    print(all_non_features.shape)
    joined_dataframe = all_non_features.join(predictions)

    # --- aggregation ----------------------------------------------------------------------
    joined_dataframe = joined_dataframe.groupby(["platform", "conversation_id"]).mean()
    author_predictions_existing = joined_dataframe.drop(["Author_is_new"], axis=1).reset_index()
    print("start converting author hot vectors beack to one author column")
    # NOTE(review): the default suffix pattern of wide_to_long matches digits only, so this
    # presumably relies on numeric author ids - TODO confirm for both platforms
    author_predictions_existing_reshaped = pd.wide_to_long(
        author_predictions_existing,
        stubnames="Author_",
        i=['platform', 'conversation_id'],
        j="author_id"
    )
    print("end converting author hot vectors beack to one author column")
    return joined_dataframe, author_predictions_existing_reshaped

In [None]:
# The author prediction is run once per platform on the matching slice of df_PB.
df_PB_reddit = df_PB.loc[df_PB["platform"].eq("reddit")]
(prediction_result_reddit,
 prediction_result_reddit_existing) = calculate_author_predictions(df_PB_reddit)

df_PB_twitter = df_PB.loc[df_PB["platform"].eq("twitter")]
(prediction_result_twitter,
 prediction_result_twitter_existing) = calculate_author_predictions(df_PB_twitter)