In [None]:
import functools
import numpy as np
import pandas as pd
from scipy.sparse import csr_array, triu, save_npz, load_npz

In [None]:
# Load the tab-separated MIND behaviors log.
# The file has no header row, so header=None stops pandas from consuming the
# first impression as column names (which would silently drop that row).
tsv = pd.read_csv("data/MINDsmall_train/behaviors.tsv", sep='\t', header=None)

# Empty cells are read as NaN; replace them with a single-space string so every
# text cell can be split later.
tsv = tsv.replace(np.nan, " ")

# Drop the columns at positions 0 and 2, which are not used below.
# NOTE(review): in the published MIND layout these are the impression id and
# the timestamp — confirm against the dataset README.
behaviors = tsv.drop(tsv.columns[[0, 2]], axis=1)


# Name the three remaining columns
behaviors = behaviors.set_axis(["ID", "Prior", "History"], axis=1)

# Preview DF
behaviors.head()

In [None]:
# Drop the first character of every user ID and keep the rest as an int
# (assumes each ID is a string whose first character is the "U" prefix)
behaviors["ID"] = behaviors["ID"].map(lambda user_id: int(user_id[1:]))

In [None]:
# Get all unique user IDs (in order of first appearance)
users = behaviors["ID"].unique()

# Reindex the IDs to 0..len(users)-1.
# pd.factorize numbers values in order of first appearance, i.e. position k in
# `users` becomes code k, and it does so in a single pass instead of one full
# replace pass per unique user.
behaviors["ID"] = pd.factorize(behaviors["ID"])[0]

# Split the space-separated strings into lists of tokens
behaviors["Prior"] = behaviors["Prior"].apply(lambda text: text.split(" "))
behaviors["History"] = behaviors["History"].apply(lambda text: text.split(" "))

In [None]:
# Group rows by ID and combine the values in the rows with the same ID.
# Summing the list-valued columns concatenates each user's lists. The string
# alias "sum" is used because passing the builtin `sum` is deprecated in
# pandas and will stop mapping to the groupby sum in a future version.
df = behaviors.groupby("ID").agg("sum")

In [None]:
# Preview the grouped frame (one row per user ID)
df.head()

In [None]:
import re


def parse_history(history, mode=None):
    """Extract numeric article IDs from a list of history tokens.

    Tokens are expected to look like ``"N12345"`` or, when they carry a label,
    ``"N12345-1"`` (assumption based on how the tokens are parsed here).

    Args:
        history: Iterable of token strings.
        mode: If given, keep only tokens whose ``-<label>`` suffix equals
            ``mode``; tokens without a label are dropped. If ``None``, every
            non-empty token is kept.

    Returns:
        List of ints, one per kept token, built from the digits of the article
        part of the token. The label suffix is not part of the ID, so the same
        article yields the same ID whatever its label.
    """
    article_ids = []
    for token in history:
        # Split off the "-<label>" suffix; a token without "-" has no label.
        article, dash, label = token.rpartition("-")
        if not dash:
            article, label = token, ""

        if mode is not None and not (label.isdigit() and int(label) == mode):
            continue

        # use regex to get numbers only; blank tokens produce no digits and
        # are skipped instead of raising
        digits = re.sub("[^0-9]", "", article)
        if digits:
            article_ids.append(int(digits))
    return article_ids


# Split the labelled tokens into two columns by their trailing label.
# NOTE(review): in the published MIND impression format a "-1" suffix marks a
# clicked article and "-0" a shown-but-not-clicked one, so these two modes look
# swapped relative to the column names — confirm before relying on "clicked".
df["clicked"] = df["History"].apply(parse_history, mode=0)
df["viewed"] = df["History"].apply(parse_history, mode=1)

In [None]:
# Preview DF, now including the "clicked" and "viewed" columns
df.head()

In [None]:

def reindex_column(header, items=None, return_items=False):
    """Map every value in a column of lists to a dense integer index.

    Args:
        header: Iterable of lists (e.g. a pandas Series whose cells are lists).
        items: Optional dict mapping value -> index. It is extended in place
            with any unseen values, so one mapping can be shared across
            several columns. A new dict is created when omitted.
        return_items: When True, also return the mapping.

    Returns:
        A list of lists with each value replaced by its index, or the tuple
        ``(reindexed, items)`` when ``return_items`` is True.
    """
    # Create the dict per call: a `{}` default would be one object shared by
    # every call that omits `items`.
    if items is None:
        items = {}

    # Unseen values get the next free index (len(items) is evaluated before
    # the insertion); known values keep the index they already have.
    reindex = [
        [items.setdefault(value, len(items)) for value in row]
        for row in header
    ]

    if return_items:
        reindex = (reindex, items)
    return reindex


# Index the "clicked" values first, then pass the same mapping on for the
# "viewed" values so both columns are numbered from one shared dictionary
clicked_lists, items = reindex_column(df["clicked"], return_items=True)
df["clicked_re"] = clicked_lists
df["viewed_re"] = reindex_column(df["viewed"], items=items)

In [None]:
# Preview DF, now including the "clicked_re" and "viewed_re" columns
df.head()

In [None]:
n, m = len(users), len(items)

# Build the user x item incidence matrix directly in sparse form. A dense
# (n, m) array would allocate n*m cells just to hold a few ones per row, so
# the dense intermediate `adj` is no longer created.
row_idx, col_idx = [], []
for i, clicked in df["clicked_re"].items():
    # set() drops repeats so each (user, item) pair is stored once, value 1
    unique_items = set(clicked)
    row_idx.extend([i] * len(unique_items))
    col_idx.extend(unique_items)

# Make Sparse Matrix (every stored entry is 1, same int16 dtype as before)
sa = csr_array(
    (
        np.ones(len(row_idx), dtype=np.int16),
        (np.asarray(row_idx, dtype=np.int64), np.asarray(col_idx, dtype=np.int64)),
    ),
    shape=(n, m),
)

# Matrix Multiplication to get Adjacency Graph (this takes ~50sec on my computer)
# graph[a, b] is the number of items that rows a and b of `sa` have in common
graph = sa @ sa.T

In [None]:
# Keep only the entries strictly above the main diagonal (k=1), so that each
# edge is not included twice and the diagonal itself is dropped [took me ~2mins]
upper = csr_array(triu(graph, k=1))

In [None]:
# Persist the matrix to disk; save_npz appends ".npz" to a file name that
# lacks it, so this is the "user_user_matrix.npz" file loaded further down
save_npz("user_user_matrix", upper)

In [None]:
# Number of stored entries in `upper`
upper.size

In [None]:
# Ratio of stored entries to half the number of matrix cells
count_one = upper.size
half_cells = (upper.shape[0] * upper.shape[1]) / 2
count_one / half_cells

In [None]:
# Ratio of stored values that are at least 10 to half the number of matrix cells
count_two = np.count_nonzero(upper.data >= 10)
half_cells = (upper.shape[0] * upper.shape[1]) / 2
count_two / half_cells

In [4]:
import scipy.sparse as sp
import matplotlib.pyplot as plt

In [None]:
flat_array = upper.data

# ndarray.min() is vectorised; the builtin min() would iterate the array one
# element at a time in Python. Stored values above 99 lie beyond the last bin
# edge and are left out of the histogram.
plt.hist(flat_array, bins=range(flat_array.min(), 100), edgecolor='black')
plt.xlabel('Shared Articles (Count)')
plt.ylabel('Frequency')
plt.title('Distribution of Shared Articles')
plt.show()

In [None]:
def graph_coverage():
    """Print and plot, for each threshold 1..99, the share of entries kept.

    Reads the module-level sparse matrix ``upper``. For every threshold the
    number of stored values >= threshold is divided by half the number of
    matrix cells; each result is printed and the ratios are shown as a bar
    chart.
    """
    thresholds = range(1, 100)
    stored = upper.data
    # Same denominator for every threshold, so compute it once
    half_cells = (upper.shape[0] * upper.shape[1]) / 2

    ratios = []
    for value in thresholds:
        count = np.count_nonzero(stored >= value)
        freq = count / half_cells
        ratios.append(freq)
        print(
            f"Value: {value}, Normalized Frequency: {freq:.4f}, Actual Frequency: {count}")

    # Bar chart of ratio per threshold
    plt.bar(thresholds, ratios, edgecolor='black')
    plt.title('Normalized Frequencies of Unique Values')
    plt.xlabel('Value')
    plt.ylabel('Normalized Frequency')
    plt.show()


# Print and plot the per-threshold ratios for the current `upper` matrix
graph_coverage()

In [7]:
import networkx as nx

In [8]:
# Load the sparse matrix back from disk (the file written by save_npz above),
# which rebinds `upper` without repeating the matrix product
upper = load_npz('user_user_matrix.npz')

In [21]:
# Row and column indices of the non-zero entries of `upper`
filtered_rows, filtered_cols = upper.nonzero()

In [23]:
def calc_modularity(threshold, user_user_matrix, seed=None):
    """Threshold the user-user matrix, detect communities and score them.

    Args:
        threshold: Minimum stored value an entry needs to become an edge.
        user_user_matrix: SciPy sparse matrix/array of pairwise weights.
        seed: Optional seed forwarded to the Louvain algorithm so a run can be
            reproduced.

    Returns:
        Tuple ``(modularity_score, coverage, performance, percentage)``.
        ``percentage`` is the number of kept entries divided by half the
        number of cells of ``user_user_matrix``. The three quality metrics are
        NaN when no edge survives the threshold.
    """
    # Take coordinates and values from the matrix that was passed in, so rows,
    # columns and data always line up (also for slices such as upper[0:10]).
    coo = user_user_matrix.tocoo()
    mask = coo.data >= threshold
    filtered_rows_m = coo.row[mask]
    filtered_cols_m = coo.col[mask]
    filtered_data = coo.data[mask]

    n_rows, n_cols = user_user_matrix.shape
    percentage = filtered_data.size / ((n_rows * n_cols) / 2)
    print(percentage)

    print(f'making graph with threshold {threshold}')
    # Create a graph from the filtered data; .tolist() yields plain Python
    # ints so node ids and weights are not NumPy scalars.
    G = nx.Graph()
    G.add_weighted_edges_from(
        (i, j, w)
        for i, j, w in zip(filtered_rows_m.tolist(),
                           filtered_cols_m.tolist(),
                           filtered_data.tolist())
        if i != j  # Exclude self-loops
    )

    print('now community detection')

    # With no edges the quality metrics are undefined (they divide by the
    # number of edges), so report NaN instead of raising.
    if G.number_of_edges() == 0:
        nan = float("nan")
        return nan, nan, nan, percentage

    # compute the best partition
    communities_list = list(nx.community.louvain_communities(G, seed=seed))

    modularity_score = nx.community.modularity(G, communities_list)
    coverage, performance = nx.community.partition_quality(G, communities_list)

    return modularity_score, coverage, performance, percentage

In [None]:
# Run calc_modularity on the full matrix at threshold 5 and unpack the
# 4-tuple (modularity, coverage, performance, percentage) it is expected to return
mod_score, coverage, performance, percentage = calc_modularity(
    threshold=5, user_user_matrix=upper)

In [None]:
# One list per metric, filled with one value per threshold
mod, cov, perf, perc = [], [], [], []

thrs = range(1, 2)
for i in thrs:
    # NOTE(review): only the first 10 rows of `upper` are passed and the range
    # holds a single threshold — presumably a quick smoke test; confirm
    # before reading anything into the plot.
    mod_score, coverage, performance, percentage = calc_modularity(
        threshold=i, user_user_matrix=upper[0:10])
    mod += [mod_score]
    cov += [coverage]
    perf += [performance]
    perc += [percentage]


# Draw the four metric curves against the threshold, in the same order
for values, label, color in (
    (mod, 'y = modularity', 'blue'),
    (cov, 'y = coverage', 'red'),
    (perf, 'y = performance', 'green'),
    (perc, 'y = percentage of users', 'orange'),
):
    plt.plot(thrs, values, label=label, color=color)

plt.title("Threshold vs Metrics")
plt.xlabel("threshold")
plt.legend()

plt.show()