In [2]:
# imports
import pandas as pd
import re
import seeds.country_list as eu_nordic_countries

### A. Demographic areas

We are using the onto-notes tag since the `MISC` tag of the regular tags is too general. We are interested in the `GPE` and `NORP` tags of the onto-notes tags.

In [9]:
# loading the onto-notes tags
ner_onto_df = pd.read_parquet("data/ner_tagged_data_onto.parquet")
ner_onto_df.head()

Unnamed: 0,message_ids,text,label
0,1,OSINT,ORG
1,1,Cyberknow20,PERSON
2,1,pro-Russian,NORP
3,2,Today,DATE
4,2,Poland,GPE


In [20]:
# filtering out all the GPE and NORP tags 
tags_of_interest = ["GPE", "NORP"]
tag_mask = ner_onto_df['label'].isin(tags_of_interest)
filtered_df = ner_onto_df[tag_mask]

# selecting the text column of the filtered df
text_set = set(filtered_df["text"])

# lower-casing all the elements of the set
lowercase_set = {word.lower() for word in text_set}

# retrieving the countries that are contained within the defined set of countries 
attacked_countries = [word for word in lowercase_set if word in eu_nordic_countries]

# turning the countries into capital case 
capital_case_countries = [country.capitalize() for country in attacked_countries]

# printing the resulting list of countries in alphabetical order
print(sorted(capital_case_countries))

# some quick summary statistics
total_countries = len(eu_nordic_countries)
number_attacked = len(attacked_countries)
perc_attacked = round(number_attacked / total_countries * 100, 2)
print(f"{number_attacked} out of {total_countries} EU and Nordic countries have been targeted")
print(f"This corresponds to {perc_attacked}% of all the EU and Nordic countries")

['Austria', 'Belgium', 'Croatia', 'Cyprus', 'Czech republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 'Norway', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden']
28 out of 28 EU and Nordic countries have been targeted
This corresponds to 100.0% of all the EU and Nordic countries


### B. Infrastructure sectors

In [21]:
# filtering out all the GPE and NORP tags 
tags_of_interest = ["ORG"]
tag_mask = ner_onto_df['label'].isin(tags_of_interest)
filtered_df = ner_onto_df[tag_mask]

# selecting the text column of the filtered df
text_set = set(filtered_df["text"])
text_set

{'Polski Koncern Naftowy ORLEN SA',
 'Credito Emiliano',
 'Canadian Bankers Association',
 "Army.Let's",
 'the Technical Research Institute of Sweden',
 'Keymous',
 'the Central Bank of Italy',
 'SJ AB',
 'the Supreme Court of the Czech Republic',
 'the Ministry of Foreign Affairs of Italy',
 'Mint',
 'the Canadian Parliament',
 'Moldovan Ministry',
 'the Czech Republic',
 'Tele2 - Mobile',
 'Enercoop',
 'the Italian Transport Regulation Authority',
 'Information Technologies',
 'Hackathon',
 'Socibus',
 'Ministry of National Defense',
 'Danish Ministry of Taxes',
 'the General Directorate of Public Finance',
 'ZBlackHat',
 'the Liechtenstein Tax Administration',
 'the Latvian Navy Kite',
 'Savings Bank',
 'Zssk',
 'Gestapo',
 "the President's Office",
 'Cargobooking',
 'Malmö City Hall',
 'Law and Justice',
 'Guardian.For',
 "PostNord Denmark's",
 'the Barcelona Arbitration Tribunal',
 'DataSafe',
 'the Prague Integrated Transport',
 'TIM RAIL CARGO SRL',
 "Defense Forces'",
 'Grenche

### C. Security properties (CIA)

In [3]:
df = pd.read_csv("data/hacktivist_messages.csv", sep=";")
# pd.set_option('display.max_colwidth', None)
# df[["Text"]].sample(50)

In [17]:
# Regular expression pattern to match
pattern = re.compile(r'❌https:\/\/check-host', re.UNICODE)

# Filter rows where 'text' column contains the pattern
filtered_df = df[~df['Text'].str.contains(pattern, regex=True)]

print(filtered_df)

TypeError: bad operand type for unary ~: 'float'