In [3]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
# Read the semicolon-delimited message export and discard incomplete rows.
# NOTE(review): dropna() without arguments removes a row when *any* column is
# missing, not only the message text - confirm that this is intended.
hacktivist_messages = pd.read_csv("data/hacktivist_messages.csv", sep=";").dropna()

In [4]:
from flair.data import Sentence
from flair.models import SequenceTagger
import math

# Load the pretrained tagger used for the first NER pass.
tagger = SequenceTagger.load("flair/ner-english-fast")

# Tag the messages one by one, in frame order. The return value of predict()
# is not used: the Sentence object itself is stored and later queried for its
# labels via get_labels().
tagged_texts = []
for message_text in tqdm(hacktivist_messages["Text"], total=len(hacktivist_messages)):
    sentence = Sentence(message_text)
    tagger.predict(sentence)
    tagged_texts.append(sentence)

# Assigning a plain list is positional: one tagged Sentence per remaining row.
hacktivist_messages["NER"] = tagged_texts

2024-09-04 14:46:14,081 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


100%|██████████| 2995/2995 [18:06<00:00,  2.76it/s]


In [5]:
# Second NER pass with a different pretrained model; the results are kept in
# their own column so both taggers' annotations stay available side by side.
tagger_ontonotes = SequenceTagger.load("ner-ontonotes-fast")

# A fresh Sentence is built per message so this pass does not reuse the
# Sentence objects stored by any other tagging pass.
tagged_texts_ontonotes = []
for message_text in tqdm(hacktivist_messages["Text"], total=len(hacktivist_messages)):
    sentence = Sentence(message_text)
    tagger_ontonotes.predict(sentence)
    tagged_texts_ontonotes.append(sentence)

# Assigning a plain list is positional: one tagged Sentence per row.
hacktivist_messages["NER_onto"] = tagged_texts_ontonotes

2024-09-04 15:04:25,420 SequenceTagger predicts: Dictionary with 75 tags: O, S-PERSON, B-PERSON, E-PERSON, I-PERSON, S-GPE, B-GPE, E-GPE, I-GPE, S-ORG, B-ORG, E-ORG, I-ORG, S-DATE, B-DATE, E-DATE, I-DATE, S-CARDINAL, B-CARDINAL, E-CARDINAL, I-CARDINAL, S-NORP, B-NORP, E-NORP, I-NORP, S-MONEY, B-MONEY, E-MONEY, I-MONEY, S-PERCENT, B-PERCENT, E-PERCENT, I-PERCENT, S-ORDINAL, B-ORDINAL, E-ORDINAL, I-ORDINAL, S-LOC, B-LOC, E-LOC, I-LOC, S-TIME, B-TIME, E-TIME, I-TIME, S-WORK_OF_ART, B-WORK_OF_ART, E-WORK_OF_ART, I-WORK_OF_ART, S-FAC


100%|██████████| 2995/2995 [17:44<00:00,  2.81it/s]


In [6]:
# Flatten the "NER" annotations into a long table: one row per predicted
# label, carrying the id of the message the label was found in.
message_ids = []
labels = []
label_texts = []

annotated_messages = zip(hacktivist_messages["Message Id"], hacktivist_messages["NER"])
for message_id, sentence in annotated_messages:
    found_labels = list(sentence.get_labels())
    # Repeat the message id once per label so the three lists stay aligned.
    message_ids.extend([message_id] * len(found_labels))
    labels.extend(label.value for label in found_labels)
    label_texts.extend(label.data_point.text for label in found_labels)

ner_tagged_data = pd.DataFrame(
    {
        "message_ids": message_ids,
        "text": label_texts,
        "label": labels,
    }
)

In [7]:
# Flatten the "NER_onto" annotations into a long table: one row per predicted
# label, carrying the id of the message the label was found in.
message_ids = []
labels = []
label_texts = []

annotated_messages = zip(hacktivist_messages["Message Id"], hacktivist_messages["NER_onto"])
for message_id, sentence in annotated_messages:
    found_labels = list(sentence.get_labels())
    # Repeat the message id once per label so the three lists stay aligned.
    message_ids.extend([message_id] * len(found_labels))
    labels.extend(label.value for label in found_labels)
    label_texts.extend(label.data_point.text for label in found_labels)

ner_tagged_data_onto = pd.DataFrame(
    {
        "message_ids": message_ids,
        "text": label_texts,
        "label": labels,
    }
)

In [8]:
# Show the ner_tagged_data_onto frame as this cell's output.
ner_tagged_data_onto

Unnamed: 0,message_ids,text,label
0,1,OSINT,ORG
1,1,Cyberknow20,PERSON
2,1,pro-Russian,NORP
3,2,Today,DATE
4,2,Poland,GPE
...,...,...,...
14052,3003,Russia,GPE
14053,3003,Russian,NORP
14054,3003,DDoSia Project|Reserve,ORG
14055,3004,Russian,NORP


In [14]:
# List the distinct values found in the "label" column.
ner_tagged_data_onto["label"].unique()

array(['ORG', 'PERSON', 'NORP', 'DATE', 'GPE', 'LOC', 'TIME', 'CARDINAL',
       'EVENT', 'LANGUAGE', 'ORDINAL', 'FAC', 'WORK_OF_ART', 'PRODUCT',
       'QUANTITY', 'MONEY', 'PERCENT', 'LAW'], dtype=object)

In [15]:
# Distinct entity texts whose label is NORP, GPE or LOC, in sorted order.
is_place_like = ner_tagged_data_onto["label"].isin(['NORP', 'GPE', 'LOC'])
sorted(ner_tagged_data_onto.loc[is_place_like, "text"].unique())


['Aarhus',
 'Africa',
 'Agder',
 'Aichi Prefecture',
 'Albania',
 'Albanian',
 'Albion',
 'Aleksandrov Kuyavsky',
 'Algeciras',
 'Alicante',
 'America',
 'American',
 'Americans',
 'Amis',
 'Amsterdam',
 'Anglo-Saxon',
 'Anglo-Saxons',
 'Argentina',
 'Armenia',
 'Asia',
 'Asturian',
 'Asturias',
 'Athens',
 'Atlantic',
 'Australia',
 'Australian',
 'Austria',
 'Austrian',
 'AzzaSec',
 'Balearic',
 'Baltic',
 'Baltic Anti-Fascists',
 'Baltic Sea',
 'Baltics',
 'Baltics.',
 'Balts',
 'Bandera',
 'Bandera.',
 'Bandera.We',
 'Banderaites',
 'Banderite',
 'Banderites',
 'Banskobystrický kraj',
 'Banská Bystrica',
 'Barcelona',
 'Basel',
 'Basovka',
 'Basque',
 'Bayreuth',
 'Belarus',
 'Belarusian',
 'Belarusians',
 'Belgian',
 'Belgians',
 'Belgium',
 'Belgorod',
 'Bellinzona',
 'Belovody',
 'Bergamo',
 'Berlin',
 'Bielefeld',
 'Bilbao',
 'Birmingham',
 'Black Sea',
 'Bogota',
 'Bologna',
 'Bolzano',
 'Bordeaux',
 'Borel',
 'Brazil',
 'Bremen',
 'Brighton',
 'Britain',
 'British',
 'British

In [19]:
# Distinct entity texts whose label is ORG, in sorted order.
is_org = ner_tagged_data_onto["label"] == 'ORG'
sorted(ner_tagged_data_onto.loc[is_org, "text"].unique())


['"A1 Telekom Austria Group"',
 '"Banca Monte dei Paschi di Siena"',
 '"Banca Popolare di Bari"',
 '"Banca Popolare di Sondrio"',
 '"CARS OLIVIER"',
 '"Catalonia Hotels & Resorts"',
 '"Clearing House"',
 '"Creand Wealth Management"',
 '"Credit Agricole Bank"',
 '"Delfi Estonia"',
 '"East Japan Railway Co"',
 '"Expansión"🔥Resource',
 '"Grifs" LLC',
 '"Ignitis gamyba"',
 '"Industrial Bank"',
 '"Lithuanian Airports"',
 '"Lvovoblenergo"',
 '"Majestic Hotel Group"',
 '"PCC-Cert"',
 '"PKP Polskie Linie Kolejowe S.A."',
 '"Special Innovative Technologies"',
 '"System Capital Management',
 '"Ukrhydroenergo"',
 '"Velta"',
 '"Zhytomyr Regional Energy Supply Company"',
 '11fd9d4dk4ac❎Italian National Association of Financial Planners',
 '143972a8ka5a❌Financial Group',
 '14f8071ck51a❌ Bank of Finland',
 '1522c082k56Helsinki Region Transport',
 '176ab60cke6f  ❌Committee on Energy and Housing and Communal Services',
 '199b916dkac❌Committee on Energy and Housing and Communal Services',
 '19cc90b6k4bf

In [9]:
# Write the ner_tagged_data_onto frame to a parquet file under data/.
onto_parquet_path = "data/ner_tagged_data_onto.parquet"
ner_tagged_data_onto.to_parquet(onto_parquet_path)