In [1]:
import requests, json
import os
import sqlite3
from multiprocessing import Pool
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

from settings import DATA_FOLDER

# Path of the SQLite database file queried by the cells below.
DATABASE_PATH = os.path.join(DATA_FOLDER, 'WikiDB.db')
# Toxicity cut-off; not referenced in the cells visible here -- presumably
# compared against the `toxicity` column later on (TODO confirm).
TOXICITY_THRESHOLD = .25

In [2]:
# Open the SQLite database; `conn` and `cur` are reused by the query cells.
conn = sqlite3.connect(DATABASE_PATH)
cur = conn.cursor()

# Read the cluster file inside a context manager so the file handle is closed
# deterministically instead of being left for the garbage collector.
with open(os.path.join(DATA_FOLDER, 'final_clusters_0218.json')) as clusters_file:
    clusters = json.load(clusters_file)

# The cluster at index 3 is the one analysed in this notebook; it is used as
# the list of article titles bound into the SQL query.
balkan_cluster = clusters[3]

In [3]:
# One "?" bind marker per article title, so the IN (...) list is passed to
# SQLite as bound parameters rather than being formatted into the SQL text.
placeholder = '?'
placeholders = ', '.join([placeholder] * len(balkan_cluster))
query = (
    'SELECT name, title, date, text, toxicity '
    'FROM comment '
    'INNER JOIN user ON comment.user_id = user.id '
    'INNER JOIN article ON comment.article_id = article.id '
    'WHERE title IN (%s) '
    'ORDER BY name'
) % placeholders
comment_iterator = cur.execute(query, balkan_cluster)
# Materialise every result row; each row is a
# (name, title, date, text, toxicity) tuple.
comments = list(comment_iterator)

In [4]:
# Number of comment rows fetched for the cluster's articles.
len(comments)

56553

In [5]:
# Show the article titles that make up the selected cluster.
print(balkan_cluster)

['Greater Albania', 'Kingdom of Serbia', 'Dayton, Ohio', 'Cabinet of Zoran Milanović', 'Franjo Tuđman', 'Illyria', 'Kosovo: Can You Imagine?', 'Croats', 'Slobodan Praljak', 'Seka Aleksić', 'Bosnian Genocide', 'Čelebići prison camp', 'Jovano Jovanke', 'Teki Dervishi', 'Kruševo', 'Radoviš', 'Bileća', 'NEWBORN', 'Račak incident', 'Poglavnik', 'Montenegro', 'Tourism in Croatia', 'Ante Starcevic', 'Borovo Selo raid', 'Albanians', 'Serbs of Kosovo', 'Giacomo Micaglia', 'War crimes in the Kosovo War', 'Fan S. Noli', 'Gostivar', 'On Genocide and Crimes Against Humanity Committed in Albania during the Communist Regime for Political, Ideological and Religious Motives', 'Battle of Košare', 'University of Zagreb', 'Srebrenica massacre', 'Gračanica, Kosovo', 'Greater Croatia', 'Albanian language', 'Yugoslavs', 'Independent Macedonia (1944)', 'Adem Ljajić', 'Epoka University', '2011 Kosovo–Serbia border clashes', 'Origin of the Albanians', 'Boris Malagurski', 'Lepa Brena', 'Kingdom of Syrmia', 'Jadw

In [6]:
# Tally comments per user; the user name is the first field of every row.
user_count = Counter(row[0] for row in comments)

In [7]:
# Names of the users who wrote more than 500 comments in this cluster.
heavy_users = [name for name, total in user_count.items() if total > 500]
print(heavy_users)

['Dbachmann', 'Nikola Smolenski', 'FkpCascais', 'Antidiskriminator', 'DIREKTOR', 'Opbeith', 'PaxEquilibrium', 'Fairview360', 'Osli73']


In [39]:
# Sort the per-user comment counts once (largest first) and reuse the result
# for every statistic below instead of re-sorting for each print.
counts_desc = sorted(user_count.values(), reverse=True)

# Count at index 100, i.e. the 101st most active user -- the first one that
# falls outside the top 100 summed on the next line.
print(counts_desc[100])
# Total number of comments written by the 100 most active users.
print(sum(counts_desc[:100]))
# Number of distinct users.
print(len(user_count))
# Full distribution of comments per user, in descending order.
print(counts_desc)

108
28005
6244
[2175, 856, 841, 722, 664, 639, 636, 535, 529, 486, 477, 453, 443, 439, 439, 413, 404, 403, 398, 397, 389, 381, 373, 372, 370, 349, 345, 342, 321, 320, 316, 309, 276, 276, 254, 246, 246, 245, 243, 231, 227, 225, 225, 216, 215, 209, 207, 205, 201, 196, 183, 183, 183, 182, 180, 178, 176, 176, 175, 175, 175, 170, 169, 162, 161, 160, 157, 153, 152, 150, 149, 149, 148, 147, 147, 145, 145, 143, 142, 140, 134, 133, 128, 128, 127, 127, 127, 126, 125, 125, 125, 124, 122, 121, 121, 120, 118, 115, 115, 110, 108, 108, 107, 106, 105, 104, 102, 102, 101, 101, 99, 99, 99, 95, 94, 93, 92, 90, 89, 88, 88, 88, 87, 87, 86, 86, 85, 83, 83, 82, 82, 81, 80, 78, 76, 75, 75, 74, 74, 74, 73, 73, 73, 71, 71, 71, 70, 69, 69, 69, 69, 69, 69, 68, 68, 67, 67, 66, 65, 65, 64, 63, 63, 63, 62, 62, 62, 62, 61, 61, 60, 60, 59, 59, 58, 58, 57, 57, 57, 56, 56, 56, 55, 55, 55, 55, 54, 54, 54, 53, 52, 52, 51, 51, 51, 51, 51, 50, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47, 46, 46, 46, 4

In [41]:
# Distinct user names (iterating a Counter yields its keys).
users = list(user_count)

In [46]:
# Group the comment rows by author. Every name in `users` starts with an empty
# list, so each user is present as a key.
comment_dict = {user: [] for user in users}
for row in comments:
    # row[0] is the user name and identifies the target bucket directly, so no
    # "current user" tracking is needed and the grouping does not depend on
    # row ordering. It also works when `comments` is empty.
    comment_dict[row[0]].append(row)

In [47]:
# For each user, count the distinct article titles (second field of a row)
# they commented on, then order those counts from largest to smallest.
nums_articles = sorted(
    (len({row[1] for row in user_comments}) for user_comments in comment_dict.values()),
    reverse=True,
)

In [50]:
# Distinct-article count at index 100 of the descending list, followed by the
# complete list.
print(nums_articles[100])
print(nums_articles)

8
[54, 43, 43, 41, 41, 40, 37, 35, 32, 30, 30, 27, 26, 26, 25, 25, 24, 24, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4