In [1]:
import ast

import pandas as pd
from tqdm.notebook import tqdm

from system_fns import ArticleEntityAnalysis
from system_fns import average_weighted_clustering_coefficient

In [2]:
# Load the business articles and keep only the columns used downstream.
# `.copy()` makes the selection an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
df = pd.read_csv('business.csv')[['date', 'id', 'unique_entities']].copy()

# Reduce the timestamps to plain `datetime.date` values (day resolution).
df['date'] = pd.to_datetime(df['date']).dt.date

# `unique_entities` is stored in the CSV as the text of a Python list literal.
# Parse it with `ast.literal_eval`, which accepts literals only, instead of
# `eval`, which would execute any expression embedded in the data file.
df['unique_entities'] = df['unique_entities'].map(ast.literal_eval)

df = df.sort_values('date')
df.head()

Unnamed: 0,date,id,unique_entities
816,2023-01-01,2191765,"[Jaime Bautista, Bongbong Marcos, Manuel Tamay..."
815,2023-01-02,2191867,"[Manny Pangilinan, Jaime Bautista]"
814,2023-01-02,2192002,"[Jaime Bautista, Rodrigo Duterte, Manuel Tamay..."
811,2023-01-03,2192961,"[Bongbong Marcos, Rodrigo Duterte]"
813,2023-01-03,2192645,"[Ben Lee, Alidad Tash]"


In [3]:
# Quick sanity check of the loaded frame: article count, covered date range,
# and (as the cell's displayed value) the column dtypes.
summary_lines = (
    ("Number of unique articles:", df['id'].nunique()),
    ("Earliest datetime record:", df['date'].min()),
    ("Latest datetime record:", df['date'].max()),
)
for label, value in summary_lines:
    print(label, value)
df.dtypes

Number of unique articles: 817
Earliest datetime record: 2023-01-01
Latest datetime record: 2023-07-24


date               object
id                  int64
unique_entities    object
dtype: object

In [4]:
# Number of days between the earliest and the latest article date.
first_day, last_day = df['date'].min(), df['date'].max()
date_scope = (last_day - first_day).days
date_scope

204

In [10]:
# Temporal granularities (in days) to evaluate.
conf = [2, 7, 14, 30, 60, 90]

# Maps each granularity to the result of the rolling-window analysis run
# with that granularity. Granularities are processed from largest to smallest.
clus_coefs_groups = {}
for window_days in tqdm(conf[::-1]):
    # Ceiling of date_scope / window_days: one extra panel when the date
    # range is not an exact multiple of the granularity.
    full_panels, leftover_days = divmod(date_scope, window_days)
    n_panels = full_panels + (1 if leftover_days else 0)

    # A fresh analysis object is built for every granularity; the last one
    # stays bound to `whole` after the loop finishes.
    whole = ArticleEntityAnalysis(df)
    clus_coefs_groups[window_days] = whole.aggregate_rolling_window_analysis(
        function=average_weighted_clustering_coefficient,
        window_panel=n_panels,
        intersect=window_days,
        mean=False,
        kind='entity',
    )

  0%|          | 0/6 [00:00<?, ?it/s]

In [11]:
# Requires scikit-posthocs; if it is missing, run once: %pip install scikit-posthocs

In [12]:
from scipy import stats
import scikit_posthocs as sp

# Significance level for the Kruskal-Wallis omnibus test.
ALPHA = 0.05

# Work on a copy so the groups computed earlier stay intact while granularities
# are removed one at a time below.
post_hoc_groups = clus_coefs_groups.copy()
significant_granularity = None

# Walk the granularities in the order they were inserted into
# `clus_coefs_groups`. While the remaining groups differ significantly, record
# the current granularity, show Dunn's pairwise comparison, drop that
# granularity and test again; stop at the first non-significant result.
#
# The loop variable is intentionally NOT named `conf`: that name holds the list
# of granularities used to build `clus_coefs_groups`, and rebinding it to a
# single key would break a re-run of that cell.
for granularity in clus_coefs_groups.keys():
    if len(post_hoc_groups) >= 2:
        result = stats.kruskal(*post_hoc_groups.values())
        print(result)
        if result.pvalue < ALPHA:
            print(f'{granularity} days temporal granularity.')
            results = sp.posthoc_dunn([*post_hoc_groups.values()], p_adjust='bonferroni')
            significant_granularity = granularity
            display(results)
        else:
            break

    post_hoc_groups.pop(granularity)

KruskalResult(statistic=9.024856683460541, pvalue=0.10807680638521595)


In [13]:
# Rolling-window degree analysis with plotting enabled.
# NOTE(review): `whole` is the analysis object left over from the last
# iteration of the granularity loop, so that cell has to run before this one.
whole.element_rolling_window_degree_analysis(p=5, window_size=7, kind='entity', plot=True)