# FEL Graph Creation

This notebook creates the graph-tool graph and data files from processed FEL annotations. The annotations are stored in a CSV file in which each row represents a different tweet. For each tweet, a first list holds the linked entities, while a second list holds the corresponding confidence in each entity.

In [None]:
from graph_tool import Graph
import graph_tool.all as gt
import pandas as pd
from itertools import combinations
from matplotlib import pyplot as plt
import numpy as np
from collections import Counter
from itertools import zip_longest
import ast
import pickle

In [None]:
# Load the processed FEL annotations.
# NOTE(review): the file name ends in '.zip' but it is read with bz2
# compression -- confirm the archive really is bz2-compressed.
fel_df = pd.read_csv('fel_entities_all_processed_plus_conf.zip', compression = 'bz2')

# Both list columns are stored as their string representation, so parse them
# back into Python lists. Applying on the column itself avoids building a
# Series for every row, which DataFrame.apply(axis=1) would do.
fel_df['entity_names'] = fel_df['entity_names'].apply(ast.literal_eval)

# Confidences are converted to numeric values per tweet after parsing.
fel_df['ed_conf'] = fel_df['ed_conf'].apply(ast.literal_eval).apply(pd.to_numeric)

In [None]:
def collect_entities(entity_array):
    """Count in how many entity lists each entity appears.

    An entity that occurs several times within the same list is counted
    only once for that list.

    Args:
        entity_array: iterable of iterables of hashable entity names.

    Returns:
        dict mapping each entity name to the number of lists containing it.
        Keys are in order of first occurrence across ``entity_array``.
    """
    entity_count = {}

    for entity_list in entity_array:
        # dict.fromkeys removes duplicates in O(n) while keeping the
        # first-occurrence order, so the key order of the result is stable.
        for entity_name in dict.fromkeys(entity_list):
            entity_count[entity_name] = entity_count.get(entity_name, 0) + 1

    return entity_count

def count_entity_pairs(entity_array, entity_int_mapping):
    """Count how often each unordered pair of entities occurs together.

    Within one entity list every distinct entity is considered once, so a
    pair contributes at most one count per list.

    Args:
        entity_array: iterable of iterables of hashable entity names.
        entity_int_mapping: dict mapping every entity name to its integer id.

    Returns:
        dict mapping ``frozenset((id_a, id_b))`` to the number of lists in
        which both entities appear.

    Raises:
        KeyError: if an entity that forms part of a pair is missing from
            ``entity_int_mapping``.
    """
    entity_pairs_counts = {}

    for entity_list in entity_array:
        # Deduplicate in input order (rather than via set()) so that the
        # insertion order of the result does not depend on string hashing.
        unique_entities = dict.fromkeys(entity_list)

        # counts are tracked for how often entities occur together
        for first_name, second_name in combinations(unique_entities, 2):
            pair_key = frozenset(
                (entity_int_mapping[first_name], entity_int_mapping[second_name])
            )
            entity_pairs_counts[pair_key] = entity_pairs_counts.get(pair_key, 0) + 1

    return entity_pairs_counts

def create_graph(entity_array):
    """Build the undirected entity co-occurrence graph.

    Every distinct entity becomes one vertex and every pair of entities that
    share at least one entity list becomes one edge. The co-occurrence count
    of the pair is passed as an edge property declared as ('weight', 'int').

    Args:
        entity_array: iterable of iterables of entity names.

    Returns:
        Tuple ``(g, entity_count, entity_int_mapping)``: the graph, the
        per-entity counts from ``collect_entities`` and the dict mapping
        each entity name to its vertex index.
    """
    entity_count = collect_entities(entity_array)

    # The vertex index of an entity is its position in entity_count.
    entity_int_mapping = {
        name: index for index, name in enumerate(entity_count)
    }
    entity_pairs = count_entity_pairs(entity_array, entity_int_mapping)

    g = Graph(directed=False)
    g.add_vertex(n=len(entity_count))

    # One (source, target, weight) row per co-occurring pair.
    weighted_edges = [(*pair, weight) for pair, weight in entity_pairs.items()]
    g.add_edge_list(weighted_edges, eprops=[('weight','int')])

    return g, entity_count, entity_int_mapping

In [None]:
def get_conn_comp(g, entity_int_mapping):
    """Extract the largest component of ``g`` and its index-to-entity mapping.

    Args:
        g: graph whose vertices are indexed according to
            ``entity_int_mapping``.
        entity_int_mapping: dict mapping each entity name to its vertex
            index in ``g``.

    Returns:
        Tuple ``(con_g, reverse_entity_mapping)``: the pruned largest
        component and a dict mapping ``0..n-1`` to entity names, where
        position ``i`` corresponds to the i-th vertex reported by the
        unpruned component view.
    """
    # The unpruned result is queried for the vertices that belong to the
    # component; these are looked up in the original index mapping below.
    component_view = gt.extract_largest_component(g, prune = False)
    vertex_arr = component_view.get_vertices()

    org_reverse_entity_mapping = {
        index: name for name, index in entity_int_mapping.items()
    }
    # NOTE(review): assumes the pruned graph numbers its vertices 0..n-1 in
    # the same relative order as vertex_arr -- confirm against graph-tool.
    reverse_entity_mapping = {
        new_index: org_reverse_entity_mapping[old_index]
        for new_index, old_index in enumerate(vertex_arr)
    }

    con_g = gt.extract_largest_component(g, prune = True)

    return con_g, reverse_entity_mapping

In [None]:
# Maps str(confidence threshold) -> (graph, entity_count, entity_int_mapping).
full_graphs_dict = dict()

# Convert every tweet's entities and confidences to arrays once, instead of
# once per threshold. zip pairs the two columns row by row.
tweet_entities = [
    (np.asarray(names), np.asarray(confidences))
    for names, confidences in zip(fel_df.entity_names, fel_df.ed_conf)
]

for conf in [-3,-2.75,-2.5,-2.25,-2,-1.75,-1.5,-1.25,-1]:
    # Keep only the entities whose confidence is strictly above the threshold.
    entity_array = [
        names[confidences > conf] for names, confidences in tweet_entities
    ]
    full_graphs_dict[str(conf)] = create_graph(entity_array)
    # Progress output: one line per finished threshold.
    print(str(conf))

In [None]:
# Confidence thresholds as strings, built with str() in the same way as the
# keys of full_graphs_dict.
conf_list = [
    str(threshold)
    for threshold in (-3, -2.75, -2.5, -2.25, -2, -1.75, -1.5, -1.25, -1)
]

In [None]:
# Write each full graph and its accompanying data to disk.
for conf in conf_list:
    # conf[1:] drops the first character of the key (the minus sign of the
    # thresholds in conf_list) before it is used in the file names.
    full_file_stem = "fel_all_full_conf_" + conf[1:]
    full_graph_entry = full_graphs_dict[conf]

    full_graph_entry[0].save(full_file_stem + ".gt.gz")

    # Everything after the graph itself is pickled as one tuple.
    with open(full_file_stem + '_data.pickle', 'wb') as handle:
        pickle.dump(full_graph_entry[1:], handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Maps each threshold key to (largest component, reverse entity mapping).
# Element 0 of a full_graphs_dict entry is the graph and element 2 is the
# entity -> vertex index mapping.
conn_graphs_dict = {
    conf: get_conn_comp(full_graphs_dict[conf][0], full_graphs_dict[conf][2])
    for conf in conf_list
}

In [None]:
# Write each largest-component graph and its accompanying data to disk.
for conf in conf_list:
    # conf[1:] drops the first character of the key (the minus sign of the
    # thresholds in conf_list) before it is used in the file names.
    conn_file_stem = "fel_all_conn_conf_" + conf[1:]
    conn_graph_entry = conn_graphs_dict[conf]

    conn_graph_entry[0].save(conn_file_stem + ".gt.gz")

    # Everything after the graph itself is pickled as one tuple.
    with open(conn_file_stem + '_data.pickle', 'wb') as handle:
        pickle.dump(conn_graph_entry[1:], handle, protocol=pickle.HIGHEST_PROTOCOL)