Loading everything

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import os
import json
import scipy.sparse as sp
import numpy as np
from codecs import open
import networkx as nx
import community
import random
from time import time

In [2]:
DATA_FOLDER = '/home/teven/fake_news/Wikidumps/'
RAW_FOLDER = os.path.join(DATA_FOLDER, 'raw/')
LINKS_FILE = os.path.join(DATA_FOLDER, '2110_symmetrized_links.json')
FINAL_INDEX_FILE = os.path.join(DATA_FOLDER, '2110_final_index.json')
ADJACENCY_MATRIX = os.path.join(DATA_FOLDER, '2110_conflict_adjacency.txt.npz')
MATRIX_INDEX = os.path.join(DATA_FOLDER, '2110_conflict_matrix_index.txt')
TALK_DATA_FILE = os.path.join(DATA_FOLDER, '2110_talk_data.json')
CONTROVERSY_SCORES = os.path.join(DATA_FOLDER, '2110_controversy_scores.txt')
REVERSE_INDEX_FILE = os.path.join(DATA_FOLDER, '2110_reverse_index.json')
EDIT_WARS = os.path.join(DATA_FOLDER, 'edit_wars.json')

Controversy metric clustering

In [3]:
def load_sparse_csr(filename):
    loader = np.load(filename)
    return sp.csr_matrix(([max(0, value - 0.1) for value in loader['data']], loader['indices'], loader['indptr']), shape=loader['shape'])

In [4]:
adjacency_matrix = load_sparse_csr(ADJACENCY_MATRIX)
adjacency_matrix.eliminate_zeros()
print(adjacency_matrix.shape)
print(len(adjacency_matrix.data))
g = nx.from_scipy_sparse_matrix(adjacency_matrix)
index = json.load(open(MATRIX_INDEX, 'r', encoding='utf-8'), encoding='utf8')
clustering = community.best_partition(g)

(33287, 33287)
2630905


In [5]:
reverse_index = json.load(open(REVERSE_INDEX_FILE, 'r', encoding='utf8'), encoding='utf8')
matrix_index = json.load(open(MATRIX_INDEX, 'r', encoding='utf8'), encoding='utf8')

In [7]:
cluster_range = set(clustering.values())
clusters = [[] for _ in cluster_range]
for article, cluster_number in clustering.items():
    clusters[cluster_number].append(reverse_index[str(matrix_index[article])])
clusters.sort(key=len, reverse=True)

In [8]:
i = 205
print(len(clusters[i]))
print(clusters[i])

25
['Deforestation', 'Lolita', 'Japanese sword', 'Tang dynasty', "Rosie O'Donnell", 'Baa, Baa, Black Sheep', 'Women and children first', 'Nicky Morgan', 'Chloroform', 'Ninjatō', 'Superstition', 'Upstate New York', 'Peter Scott (educationalist)', 'ChucK', 'Primeval', 'Marxism', 'Teddington', 'Kathie Lee Gifford', 'WALL-E', 'Zac Goldsmith', 'Differential heat treatment', 'Kingston University', 'A-League transfers for 2015–16 season', 'Tom Holland (actor)', 'Masonic lodge officers']


In [9]:
nx.set_node_attributes(g, {node: random.random() for node in g.nodes()}, "controversy")
nx.set_node_attributes(g, {node: random.randint(0, 9) for node in g.nodes()}, "cluster")
nx.set_node_attributes(g, {node: reverse_index[str(matrix_index[node])] for node in g.nodes()}, "title")

nx.write_gpickle(g, "sample_graph.pickle")

Edit wars clustering

In [11]:
edit_wars = json.load(open(EDIT_WARS))
print ('done')
wars_index = json.load(open(FINAL_INDEX_FILE))
print ('done')
wars_index = {war: wars_index.get(war, None) for war in edit_wars}
print ('done')
wars_index = {k: v for k, v in wars_index.items() if v is not None}
print ('done')
wars_links = json.load(open(LINKS_FILE))
print ('done')
wars_links = {war: wars_links.get(str(index), None) for war, index in wars_index.items()}
print ('done')
wars_links = {k: v for k, v in wars_links.items() if v is not None}
print ('done')
edit_wars = {k: v for k, v in edit_wars.items() if k in wars_index}

done
done
done
done
done
done
done


In [12]:
g = nx.Graph()
g.add_nodes_from(sorted(wars_index.values()))
print(len(g.nodes()))
print(len(wars_index))
nx.set_node_attributes(g, name="title", values={v: k for k, v in wars_index.items()})
nx.set_node_attributes(g, name="controversy_value", values={wars_index[k]: v for k, v in edit_wars.items()})

12539
13043


In [13]:
all_edges = []
start_time = time()
nodes_set = set(wars_index.values())
for i, node in enumerate(g.nodes(data=True)):
    try:
        title = node[1]['title']
    except:
        print(node)
    edges = [(node[0], child) for child in wars_links[title] if child in nodes_set]
    all_edges.extend(edges)
    if i % 10 == 0:
        print(i)
        print(time() - start_time)

g.add_edges_from(all_edges)

0
0.004122734069824219
10
0.00970458984375
20
0.014175653457641602
30
0.016061067581176758
40
0.018225908279418945
50
0.01915884017944336
60
0.020593643188476562
70
0.021491527557373047
80
0.022924423217773438
90
0.02330493927001953
100
0.03144097328186035
110
0.03607058525085449
120
0.03684186935424805
130
0.0386502742767334
140
0.03929710388183594
150
0.04078054428100586
160
0.04241442680358887
170
0.04761242866516113
180
0.05049395561218262
190
0.05139732360839844
200
0.05281639099121094
210
0.05357074737548828
220
0.056647539138793945
230
0.05729484558105469
240
0.05858325958251953
250
0.05949234962463379
260
0.05989265441894531
270
0.06124114990234375
280
0.06477236747741699
290
0.06653237342834473
300
0.0669100284576416
310
0.06754589080810547
320
0.06834077835083008
330
0.06951236724853516
340
0.07019829750061035
350
0.07591056823730469
360
0.07848501205444336
370
0.07898736000061035
380
0.07997560501098633
390
0.08121061325073242
400
0.08390474319458008
410
0.08610391616821289


9570
0.7593388557434082
9580
0.759570837020874
9590
0.7603390216827393
9600
0.7606840133666992
9610
0.7609555721282959
9620
0.761253833770752
9630
0.7613699436187744
9640
0.7621660232543945
9650
0.7623648643493652
9660
0.7628061771392822
9670
0.7633264064788818
9680
0.7638731002807617
9690
0.7640378475189209
9700
0.7646195888519287
9710
0.7654116153717041
9720
0.7660198211669922
9730
0.766512393951416
9740
0.7666492462158203
9750
0.7675166130065918
9760
0.76767897605896
9770
0.7679052352905273
9780
0.7681393623352051
9790
0.7684111595153809
9800
0.7685606479644775
9810
0.7689173221588135
9820
0.7691366672515869
9830
0.7692615985870361
9840
0.7695248126983643
9850
0.7700107097625732
9860
0.7702956199645996
9870
0.7704999446868896
9880
0.7707040309906006
9890
0.7708842754364014
9900
0.7712454795837402
9910
0.7716653347015381
9920
0.7718656063079834
9930
0.7720842361450195
9940
0.7725040912628174
9950
0.7728815078735352
9960
0.7733838558197021
9970
0.7738902568817139
9980
0.77426791191101

In [14]:
g.size()

133192

In [15]:
clustering = community.best_partition(g)
print(len(set(clustering.values())))
nx.set_node_attributes(g, name='cluster', values=clustering)

612


In [16]:
title_clusters = [set() for _ in range(len(set(clustering.values())))]
node_clusters = [set() for _ in range(len(set(clustering.values())))]
for i, node in enumerate(g.nodes(data=True)):
    title = node[1]['title']
    cluster = node[1]['cluster']
    title_clusters[cluster].add(title)
    node_clusters[cluster].add(node[0])
title_clusters.sort(key=len)
node_clusters.sort(key=len)

In [18]:
h = g.subgraph([node for node in node_clusters[-1]])

h.size()

smaller_clustering = community.best_partition(h)
print(len(set(smaller_clustering.values())))

smaller_title_clusters = [set() for _ in range(len(set(smaller_clustering.values())))]
smaller_node_clusters = [set() for _ in range(len(set(smaller_clustering.values())))]
for i, node in enumerate(h.nodes(data=True)):
    title = node[1]['title']
    cluster = smaller_clustering[node[0]]
    smaller_title_clusters[cluster].add(title)
    smaller_node_clusters[cluster].add(node[0])
smaller_title_clusters.sort(key=len)
smaller_node_clusters.sort(key=len)

9


In [19]:
smaller_title_clusters[-1]
len(smaller_title_clusters[-1])

515

In [20]:
i = g.subgraph([node for node in smaller_node_clusters[-1]])

i.size()

even_smaller_clustering = community.best_partition(i)
print(len(set(even_smaller_clustering.values())))

even_smaller_title_clusters = [set() for _ in range(len(set(even_smaller_clustering.values())))]
even_smaller_node_clusters = [set() for _ in range(len(set(even_smaller_clustering.values())))]
for i, node in enumerate(i.nodes(data=True)):
    title = node[1]['title']
    cluster = even_smaller_clustering[node[0]]
    even_smaller_title_clusters[cluster].add(title)
    even_smaller_node_clusters[cluster].add(node[0])
even_smaller_title_clusters.sort(key=len)
even_smaller_node_clusters.sort(key=len)

8


In [26]:
even_smaller_title_clusters[-3]

{'100 metres',
 '14 On Fire',
 '2005 civil unrest in France',
 '2009–10 Iranian election protests',
 '2014–15 RB Leipzig season',
 '2015 Thalys attack',
 '2016–17 S.L. Benfica season',
 'Aeroflot destinations',
 'Alexander Archipenko',
 'Alsace',
 'Ansaldo STS',
 'Battle of Ligny',
 'Belcourt Castle',
 'Berlin',
 'Bethmanns and Rothschilds',
 'Bob Shaheen',
 'Bogdan Bogdanović (basketball)',
 'Bordeaux',
 'Bosnian pyramids',
 'Brussels',
 'Coverage of Google Street View',
 'Defence of the Reich',
 'Deniz Aytekin',
 'Domain of Soissons',
 'Dominique Strauss-Kahn',
 'Economy of Paris',
 'Eleanor Elkins Widener',
 'Esin Afşar',
 'EuroBasket',
 'European route E80',
 'Foie gras',
 'Fortune Global 500',
 'France',
 'Frédéric Chopin',
 'Gallaudet University',
 'Gare de Lyon-Part-Dieu',
 'George Remus',
 'Global city',
 'Haut-Rhin',
 'History of French',
 'History of Paris',
 'History of Trams',
 'Immanuel Velikovsky',
 'Islamic terrorism in Europe (2014–present)',
 'Jonas Valančiūnas',
 'Lab

In [22]:
final_node_clusters = node_clusters[:-1] + smaller_node_clusters[:-1] + even_smaller_node_clusters
final_title_clusters = title_clusters[:-1] + smaller_title_clusters[:-1] + even_smaller_title_clusters
final_node_clusters.sort(key=len, reverse=True)
final_title_clusters.sort(key=len, reverse=True)

In [23]:
final_clustering = {}
for i, node_cluster in enumerate(final_node_clusters):
    for node in node_cluster:
        final_clustering[node] = i
nx.set_node_attributes(g, name='cluster', values=final_clustering)

In [24]:
nx.write_gpickle(g, "final_graph.pickle")

Dump of the graph visualization

In [25]:
"""graph_dict = {"graph": [],
             "links": [],
             "nodes": [],
             "directed": False,
             "multigraph": False}

def cluster_score(cluster_number):
    total_number = len(final_node_clusters)
    return ((cluster_number * 107) % total_number) /float(total_number)

node_index = {node: i for i, node in enumerate(g.nodes())}
indexed_edges = [{"source": node_index[edge[0]], "target": node_index[edge[1]]} for edge in g.edges]
indexed_nodes = [{"size": node[1]["controversy_value"], "score": cluster_score(node[1]["cluster"]), "type": "circle", "id": node[1]["title"]} for node in g.nodes(data=True)]

graph_dict["links"] = indexed_edges
graph_dict["nodes"] = indexed_nodes

json.dump(graph_dict, open("d3graph.json", "w"), indent=2)
print(len(final_node_clusters))

final_title_clusters[450]"""