In [1]:
import ujson 
import operator
import os
from collections import *
import glob
import pandas as pd
import re
import networkx as nx
from tqdm import tqdm

In [None]:
os.chdir("/shared/1/projects/cross-lingual-exchange/data/named_entities/UK_DE")

# Cleaned entity string -> number of raw JSON keys (summed over every file of
# that language) that cleaned to it. The per-file count stored in the JSON is
# deliberately not read here, matching the original tallying.
en_entities = {}
de_entities = {}

# (file name, repr(error)) for every file whose contents could not be processed,
# so skipped inputs are visible instead of silently ignored.
skipped_files = []


def _keep_entity(entity):
    """Return True if a cleaned entity passes the shared filter.

    An entity is kept when it has at least 5 characters, does not start with
    'RT', and does not start with '@'.
    """
    return len(entity) >= 5 and entity[0:2] != 'RT' and entity[0] != '@'


for file in tqdm(glob.glob('*.json')):
    # Language tag = last two characters of the file name without extension.
    lang = os.path.splitext(file)[0][-2:]
    # Read as UTF-8 explicitly instead of relying on the locale default.
    with open(file, 'r', encoding='utf-8') as f:
        try:
            # Each file is iterated with .items(), i.e. it is expected to hold
            # a JSON object keyed by entity string.
            data = ujson.load(f)

            if lang == 'en':
                for entity, _count in data.items():
                    # NOTE(review): this pattern only matches tokens starting
                    # with the literal "@#", unlike the German branch's
                    # r'@[^ ]+' -- confirm whether r'[@#][^ ]+' was intended.
                    entity = re.sub(r'@#[^ ]+', ' ', entity).strip()
                    # Drop every non-ASCII character (emoji, accents, ...).
                    # The result is pure ASCII, so the former comparisons
                    # against emoji-only strings could never match and were
                    # removed.
                    entity = entity.encode('ascii', 'ignore').decode('ascii')
                    if _keep_entity(entity):
                        en_entities[entity] = en_entities.get(entity, 0) + 1

            if lang == 'de':
                for entity, _count in data.items():
                    # Remove @-mentions embedded in the entity text.
                    entity = re.sub(r'@[^ ]+', ' ', entity).strip()
                    if _keep_entity(entity):
                        de_entities[entity] = de_entities.get(entity, 0) + 1
        except Exception as err:
            # Best effort: keep going on malformed files, but record them.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # stop this long-running loop.
            skipped_files.append((file, repr(err)))

if skipped_files:
    print('Skipped %d file(s) that could not be processed' % len(skipped_files))

# Normalisation factors; 0.0 when a language has no entities so that an empty
# input directory does not raise ZeroDivisionError.
total_en = sum(en_entities.values())
total_de = sum(de_entities.values())
factor_en = 1.0 / total_en if total_en else 0.0
factor_de = 1.0 / total_de if total_de else 0.0

# Relative frequency of each entity within its own language.
en_entities_freq = {k: v * factor_en for k, v in en_entities.items()}
de_entities_freq = {k: v * factor_de for k, v in de_entities.items()}


 45%|████▍     | 810/1804 [16:05<23:03,  1.39s/it]

In [None]:
# Unique entity strings seen for each language (the keys of the count dicts).
en_set, de_set = set(en_entities.keys()), set(de_entities.keys())

In [None]:
# English (entity, count) pairs ordered from highest to lowest count;
# show the 100 highest.
sorted_en = sorted(
    en_entities.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
print(sorted_en[:100])

In [None]:
# German (entity, count) pairs ordered from highest to lowest count;
# show the 100 highest.
sorted_de = sorted(
    de_entities.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
print(sorted_de[:100])

In [None]:
# Entities present in both the German and the English key sets.
intersection = de_set & en_set

# Combined count (English + German) for every shared entity.
intersection_dict = {
    shared: en_entities[shared] + de_entities[shared]
    for shared in intersection
}

# Shared entities ranked by combined count, largest first; show the top 200.
sorted_intersection = sorted(
    intersection_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
print(sorted_intersection[:200])

In [9]:
# Keep shared entities whose count is at least 200 in BOTH languages,
# preserving the combined-count ordering of sorted_intersection.
entity_list = [
    name
    for name, _combined in sorted_intersection
    if min(en_entities[name], de_entities[name]) >= 200
]
print(entity_list)

['Danke', '&lt;3', 'Hahaha', 'https://t', 'Brexit', 'Follow', 'Hahahahaha', 'Trump', 'Hahahaha', 'Check', 'Liverpool', 'England', 'Great', 'Cheers', 'Wales', 'Retweet', 'Labour', 'Chelsea', 'Twitter', 'London', 'Jesus', 'Scotland', 'https://t.c', 'Hahahahahaha', 'Leeds', 'Facebook', 'Hahahah', 'Messi', 'YouTube', 'Sorry', 'Merci', 'https:/', 'Tickets', 'Arsenal', 'NowPlaying', 'Netflix', 'Watch', 'James', 'Bravo', 'Hallo', 'Manchester', 'Berlin', 'Amazon', 'Barcelona', 'Video', 'Photo', 'Europe', 'Trumps', 'Salah', 'Spurs', 'Spotify', 'Germany', 'David', 'France', 'JIMIN', 'Glasgow', 'Google', 'Instagram', 'Paris', 'Bonjour', 'Loooool', 'Hahahahahahaha', 'Ronaldo', 'Looool', 'Super', 'ICYMI', 'China', 'Gracias', 'Birmingham', 'McDonalds', 'United', 'Japan', 'SoundCloud', 'Cardiff', 'Today', 'Chris', 'Frankfurt', 'Hamburg', 'Manchester United', 'Yessss', 'Christmas', 'Apple', 'Italy', 'Genius', 'StopBrexit', 'Diolch', 'America', 'Rangers', 'Man City', 'Nazis', 'Amsterdam', 'Michael', 'V

In [12]:
# Persist the selected entities, one per line, in the current working directory.
with open('top_entities.txt', 'w+') as out_file:
    out_file.writelines(name + '\n' for name in entity_list)