In [55]:
import pickle
import networkx as nx
import pandas as pd
import numpy as np
import powerlaw
import matplotlib.pyplot as plt
from stats import graph_stats
import backboning as bb

In [14]:
def backbone_pipepline(filepath, threshold, is_directed=False):
    """Load a pickled weighted graph and return its disparity-filter backbone.

    The graph's edges are flattened into a ``src`` / ``trg`` / ``nij`` table,
    scored with ``bb.disparity_filter``, cut with ``bb.thresholding`` and
    rebuilt as a NetworkX graph.

    Parameters
    ----------
    filepath : str or path-like
        Pickle file holding a graph object exposing ``edges(data=True)``.
        Every edge must carry a ``'weight'`` attribute.
    threshold : float
        Forwarded unchanged as ``threshold`` to ``bb.thresholding``.
        (Presumably edges whose score exceeds it are kept — confirm against
        the ``backboning`` module.)
    is_directed : bool, default False
        When true the filter is run with ``undirected=False`` and an
        ``nx.DiGraph`` is returned; otherwise ``undirected=True`` and an
        ``nx.Graph``.

    Returns
    -------
    nx.Graph or nx.DiGraph
        Graph built only from the surviving edges, with the original weight
        stored under the ``'nij'`` edge attribute. Nodes without a surviving
        edge are therefore absent. An input graph without edges yields an
        empty graph.

    Raises
    ------
    KeyError
        If an edge has no ``'weight'`` attribute.
    """
    with open(filepath, "rb") as f:
        # NOTE: pickle.load can execute arbitrary code embedded in the file;
        # only use this on pickles from a trusted source.
        G = pickle.load(f)

    edge_table = pd.DataFrame(
        [
            {"src": u, "trg": v, "nij": d["weight"]}
            for u, v, d in G.edges(data=True)
        ]
    )

    # Single switch for graph flavour instead of two duplicated branches.
    graph_type = nx.DiGraph if is_directed else nx.Graph

    # An edgeless graph gives a DataFrame without the src/trg/nij columns,
    # which the disparity filter cannot process; nothing to filter anyway.
    if edge_table.empty:
        return graph_type()

    scored = bb.disparity_filter(edge_table, undirected=not is_directed)

    # The score column is only needed for the cut, not in the final graph.
    kept = bb.thresholding(scored, threshold=threshold).drop(columns=["score"])

    return nx.from_pandas_edgelist(
        kept,
        source="src",
        target="trg",
        edge_attr="nij",
        create_using=graph_type(),
    )

In [56]:
# Hashtags stripped from the networks further down (passed to
# remove_nodes_from). Kept as a deduplicated, alphabetised set literal so a
# repeated entry is easy to spot.
hashtags_to_remove = {
    '2023', '4u', '4upage', '4you', '4youpage',
    'asmr',
    'bhfyp', 'breakingnews',
    'capcut',
    'dancetrend', 'dancetrends', 'duet',
    'explorepage',
    'f', 'fdseite', 'fds', 'follow', 'followforfollowback', 'followme',
    'for', 'foru', 'forupage', 'foruyou', 'foryou', 'foryoupage',
    'foryourpage', 'funny', 'funnyvideos', 'fy', 'fyb', 'fybpage', 'fybシ',
    'fyou', 'fyoupage', 'fyoupagetiktok', 'fyp', 'fypage', 'fypp',
    'fyppppppppppppppppppppppp', 'fypシ', 'fypシ゚viral', 'fürdich',
    'goviral', 'greenscreenvideo',
    'hashtag',
    'instagood', 'instagram',
    'like', 'lol', 'love',
    'new', 'news',
    'oh', 'ohno',
    'pageforyou',
    'relatable', 'reels', 'repost',
    'satisfying', 'series', 'share', 'stitch', 'storytime',
    'tiktok', 'tiktoker', 'tiktokfamous', 'tiktokfanpage', 'tiktoktrend',
    'tiktokviral', 'trend', 'trendiing', 'trending', 'trends', 'trendy',
    'usa_tiktok',
    'video', 'viral', 'viralllllll', 'viralpost', 'viraltiktok',
    'viralvideo', 'viralvideos', 'vlog',
    'wow',
    'xyzabc', 'xyzbca', 'xyzcba',
    'youtube',
}

# Hashtag co-occurrence

In [50]:
# Pickle file passed to backbone_pipepline for the hashtag co-occurrence network.
hashtag_path = '../../shared-folder-gald/data/unipartite_og.pkl'

In [51]:
# Extract the backbone of the co-occurrence graph, treated as undirected,
# with a threshold of 0.999.
hashtag_backbone = backbone_pipepline(hashtag_path, threshold=0.999, is_directed=False)

Calculating DF score...
  table_sum = table.groupby(table["src"]).sum().reset_index()
  table = table.drop("edge", 1)
  table = table.drop("score_min", 1)
  table = table.drop("variance_max", 1)


In [52]:
# Summary statistics of the co-occurrence backbone; graph_stats comes from the
# local `stats` module and its result is iterated with .items() below.
HC_stats = graph_stats(hashtag_backbone)

Calculating best minimal value for power law fit
xmin progress: 99%

Assuming nested distributions


In [53]:
# One "name - value" line per statistic of the co-occurrence backbone.
for stat_name, stat_value in HC_stats.items():
    print(stat_name, stat_value, sep=' - ')

num_nodes - 39665
num_edges - 478445
density - 0.0006082162902529744
mean_degree - 24.124290936593976
std_degree - 96.52941039059701
global_clustering_coeff - 0.13371313064145485
degree_assortativity - -0.07075999525796825
GCC_size - 38505
power_law_alpha - 2.918318315988534
power_law_xmin - 387.0
ks_test_statistic - 0.025464531027676096
best_powerlaw_comparison - power law vs truncated_power_law: R = -0.128, p = 0.612


In [54]:
# Pickle the filtered co-occurrence backbone into the ready_networks folder.
with open('../ready_networks/cooc_filtered.pkl', 'wb') as out_file:
    pickle.dump(hashtag_backbone, out_file)

# Follow

In [22]:
# Pickle file passed to backbone_pipepline for the follow network.
follow_path = '../../shared-folder-gald/data/follow_graph.pkl'

In [35]:
# Extract the backbone of the follow graph, treated as directed, with a
# threshold of 0.90 (lower than the 0.999 used for the co-occurrence graph).
follow_backbone = backbone_pipepline(follow_path, threshold=0.90, is_directed=True)

Calculating DF score...
  table_sum = table.groupby(table["src"]).sum().reset_index()


In [36]:
# Summary statistics of the follow backbone (same graph_stats helper as for
# the co-occurrence backbone).
# NOTE(review): the saved output of this cell reports a 'nan' warning from the
# power-law fit, and the printed stats below have no GCC_size entry although
# the co-occurrence stats do — confirm whether graph_stats skips it for
# directed graphs or whether the stored output is stale.
follow_stats = graph_stats(follow_backbone)

Calculating best minimal value for power law fit
xmin progress: 99%

  CDF = CDF/norm
'nan' in fit cumulative distribution values.
Likely underflow or overflow error: the optimal fit for this distribution gives values that are so extreme that we lack the numerical precision to calculate them.
Assuming nested distributions


In [37]:
# One "name - value" line per statistic of the follow backbone.
for stat_name, stat_value in follow_stats.items():
    print(stat_name, stat_value, sep=' - ')

num_nodes - 26627
num_edges - 402510
density - 0.0005677387124915953
mean_degree - 30.233221917602435
std_degree - 179.07905239759276
global_clustering_coeff - 0.28544694735273846
degree_assortativity - -0.27833885023649185
power_law_alpha - 2.090055669932052
power_law_xmin - 20.0
ks_test_statistic - 0.01337570266014021
best_powerlaw_comparison - power law vs truncated_power_law: R = -1.382, p = 0.096


In [48]:
# Pickle the filtered follow backbone.
# NOTE(review): this writes to the current working directory, whereas the
# co-occurrence backbone is written to '../ready_networks/' — confirm the
# intended location before relying on this file.
with open('follow_filtered.pkl', 'wb') as out_file:
    pickle.dump(follow_backbone, out_file)

# Duet/Stitch filtered hashtags

In [61]:
# Drop every node listed in hashtags_to_remove from the duet/stitch network.
# NOTE(review): the result is written back to the same path it was read from,
# so the unfiltered network is no longer available at this path afterwards.
with open('../ready_networks/ds_network.pkl', 'rb') as in_file:
    ds = pickle.load(in_file)

ds.remove_nodes_from(hashtags_to_remove)

with open('../ready_networks/ds_network.pkl', 'wb') as out_file:
    pickle.dump(ds, out_file)

# Likes filtered hashtags

In [62]:
# Drop every node listed in hashtags_to_remove from the likes network.
# NOTE(review): the result is written back to the same path it was read from,
# so the unfiltered network is no longer available at this path afterwards.
with open('../ready_networks/likes_network.pkl', 'rb') as in_file:
    likes = pickle.load(in_file)

likes.remove_nodes_from(hashtags_to_remove)

with open('../ready_networks/likes_network.pkl', 'wb') as out_file:
    pickle.dump(likes, out_file)