### Connected Components

In [28]:
import os
from itertools import product

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import cufflinks as cf
import plotly.offline
from tqdm import tqdm

from IPython.display import display, HTML
from ipywidgets import interactive_output, HBox, VBox, Layout 
from ipywidgets import Dropdown, Checkbox, ToggleButtons, SelectionRangeSlider, SelectionSlider
from ipywidgets import SelectMultiple
import ipywidgets as widgets

# Notebook-wide setup: switch cufflinks to offline mode and raise the
# number of columns pandas will display.
cf.go_offline()
pd.set_option('display.max_columns', 100)

# Experiment selection; these values determine which results file is loaded.
in_dir, dataset = 'temp_analysis', 'twitter'
test_type, eval_set = 'full', 'test'

# Results live under ../../<in_dir>/<dataset>/eval_<eval_set>/test_<test_type>.
result_dir = os.path.join(
    '..',
    '..',
    in_dir,
    dataset,
    'eval_{}'.format(eval_set),
    'test_{}'.format(test_type),
)
df = pd.read_csv(os.path.join(result_dir, 'results.csv'))
df.head()

Unnamed: 0,num_nodes,num_msg_nodes,num_hub_nodes,num_spam_msg_nodes,num_relations,num_edges
0,396352,258434,137918,146935,3,671415
1,3,2,1,0,1,2
2,3,2,1,0,1,2
3,3,2,1,0,1,2
4,10,8,2,0,2,9


In [54]:
def process_results(df, spam_count):
    """
    Summarize the rows of `df` grouped by their spam-message count.

    Rows are grouped on the 'num_spam_msg_nodes' column and only groups whose
    key lies in the inclusive range [spam_count[0], spam_count[1]] are kept.
    Each row presumably describes one connected component (confirm against
    the code that writes results.csv).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'num_spam_msg_nodes' and 'num_nodes'.
        No other column is read.
    spam_count : sequence
        Indexable pair (lowest, highest) of spam counts to keep, inclusive.

    Returns
    -------
    dict of lists, all of equal length, with one entry per retained group
    (pandas yields groups in ascending key order):
        'n_spam'         - the group's spam-message count.
        'component_size' - mean of 'num_nodes' over the group.
        'spam_fraction'  - mean over the group of
                           num_spam_msg_nodes / num_nodes per row
                           (0 for the zero-spam group).
        'n_components'   - number of rows in the group.
        'components_sum' - sum of 'num_nodes' over the group.
    """
    lowest, highest = spam_count[0], spam_count[1]

    results = {
        'n_spam': [],
        'component_size': [],
        'spam_fraction': [],
        'n_components': [],
        'components_sum': [],
    }

    for n_spam, gf in df.groupby('num_spam_msg_nodes'):

        if n_spam < lowest or n_spam > highest:
            continue

        sizes = gf['num_nodes']

        # Every row in the zero-spam group has fraction 0, so skip the division.
        if n_spam == 0:
            spam_fraction_avg = 0
        else:
            spam_fraction_avg = (gf['num_spam_msg_nodes'] / sizes).mean()

        results['n_spam'].append(n_spam)
        results['component_size'].append(sizes.mean())
        results['spam_fraction'].append(spam_fraction_avg)
        results['n_components'].append(len(gf))
        results['components_sum'].append(sizes.sum())

    return results

# experiment options: one widget per argument of the plotting callback
dataset = Dropdown(
    description='Dataset',
    options=['youtube', 'twitter', 'soundcloud'],
    value='youtube',
)
test_type = ToggleButtons(
    description='Test',
    options=['full', 'inductive'],
    value='full',
)
spam_count = SelectionRangeSlider(
    description='# Spam',
    options=[0, 1, 1000, 10000000],
    index=(1, 2),
)

# create ui: each control sits in its own column, columns laid out side by side
box_1, box_2, box_3 = (VBox([control]) for control in (dataset, test_type, spam_count))
ui = HBox([box_1, box_2, box_3])

# plot graphs
def f(dataset, test_type, spam_count):
    """
    Load the results for one experiment and draw its two summary panels.

    The file read is
    ../../<in_dir>/<dataset>/eval_<eval_set>/test_<test_type>/results.csv,
    where `in_dir` and `eval_set` are module-level globals.

    Parameters
    ----------
    dataset : str
        Dataset directory name.
    test_type : str
        Suffix of the 'test_<...>' directory.
    spam_count : sequence
        (lowest, highest) spam-count range, forwarded to `process_results`.

    Behavior
    --------
    If the file does not exist, or it contains no rows, the current
    matplotlib figure is cleared and nothing is plotted. Otherwise:
        left panel  - spam fraction (left axis) and component size
                      (right axis) against the spam count;
        right panel - number of components as bars (left axis) and the
                      summed component sizes as a line (right axis).
    Returns None.
    """
    result_dir = os.path.join('..', '..', in_dir, dataset, 'eval_{}'.format(eval_set), 'test_{}'.format(test_type))
    fp = os.path.join(result_dir, 'results.csv')

    if not os.path.exists(fp):
        plt.clf()
        return

    df = pd.read_csv(fp)

    if df.empty:
        plt.clf()
        return

    results = process_results(df, spam_count)
    n_spam = results['n_spam']

    def merged_legend(primary, twin):
        # Legends on twinned axes are placed independently and can end up
        # on top of each other, so collect both axes' entries into a single
        # legend. It is attached to the twin, which is drawn last.
        handles, labels = primary.get_legend_handles_labels()
        twin_handles, twin_labels = twin.get_legend_handles_labels()
        twin.legend(handles + twin_handles, labels + twin_labels)

    _, (left, right) = plt.subplots(1, 2, figsize=(15, 5))

    # left panel: spam fraction vs. component size
    left.plot(n_spam, results['spam_fraction'], label='spam fraction', color='red')
    left.set_xlabel('# spam')
    left.set_ylabel('spam fraction')

    left_twin = left.twinx()
    left_twin.plot(n_spam, results['component_size'], label='component size', color='green')
    left_twin.set_ylabel('component size')
    merged_legend(left, left_twin)

    # right panel: component counts vs. summed component sizes
    right.bar(n_spam, results['n_components'], label='# components', color='magenta')
    right.set_xlabel('# spam')
    right.set_ylabel('# components')

    right_twin = right.twinx()
    right_twin.plot(n_spam, results['components_sum'], label='# msgs in all components', color='orange')
    right_twin.set_ylabel('# msgs in all components')
    merged_legend(right, right_twin)

    plt.tight_layout()
    plt.show()

# Re-run the plotting callback whenever any control changes, then show the
# controls followed by the plot output area.
out = interactive_output(
    f,
    dict(dataset=dataset, test_type=test_type, spam_count=spam_count),
)
display(ui, out)

HBox(children=(VBox(children=(Dropdown(description='Dataset', options=('youtube', 'twitter', 'soundcloud'), va…

Output()