In [30]:
# imports
import pandas as pd
import re
import data_utils as du

### A. Demographic areas

We are using the onto-notes tag since the `MISC` tag of the regular tags is too general. We are interested in the `GPE` and `NORP` tags of the onto-notes tags.

In [9]:
# loading the onto-notes tags
ner_onto_df = pd.read_parquet("data/ner_tagged_data_onto.parquet")
ner_onto_df.head()

Unnamed: 0,message_ids,text,label
0,1,OSINT,ORG
1,1,Cyberknow20,PERSON
2,1,pro-Russian,NORP
3,2,Today,DATE
4,2,Poland,GPE


In [20]:
# filtering out all the GPE and NORP tags 
tags_of_interest = ["GPE", "NORP"]
tag_mask = ner_onto_df['label'].isin(tags_of_interest)
filtered_df = ner_onto_df[tag_mask]

# selecting the text column of the filtered df
text_set = set(filtered_df["text"])

# lower-casing all the elements of the set
lowercase_set = {word.lower() for word in text_set}

# retrieving the countries that are contained within the defined set of countries 
attacked_countries = [word for word in lowercase_set if word in eu_nordic_countries]

# turning the countries into capital case 
capital_case_countries = [country.capitalize() for country in attacked_countries]

# printing the resulting list of countries in alphabetical order
print(sorted(capital_case_countries))

# some quick summary statistics
total_countries = len(eu_nordic_countries)
number_attacked = len(attacked_countries)
perc_attacked = round(number_attacked / total_countries * 100, 2)
print(f"{number_attacked} out of {total_countries} EU and Nordic countries have been targeted")
print(f"This corresponds to {perc_attacked}% of all the EU and Nordic countries")

['Austria', 'Belgium', 'Croatia', 'Cyprus', 'Czech republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 'Norway', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden']
28 out of 28 EU and Nordic countries have been targeted
This corresponds to 100.0% of all the EU and Nordic countries


### B. Infrastructure sectors

In [32]:
# filtering out all the GPE and NORP tags 
tags_of_interest = ["ORG"]
tag_mask = ner_onto_df['label'].isin(tags_of_interest)
filtered_df = ner_onto_df[tag_mask]

# selecting the text column of the filtered df
text_set = set(filtered_df["text"])

# function to categorize organization
def categorize_organization(name):
    for sector, pattern in du.sectors_patterns.items():
        if re.search(pattern, name, re.IGNORECASE):
            return sector
    return 'Unknown'

# dictionary to keep track of counts
sector_counts = {sector: 0 for sector in du.sectors_patterns.keys()}
sector_counts["Unknown"] = 0

# assigning organizations to a sector
for org in text_set:
    sector = categorize_organization(org)
    sector_counts[sector] += 1

sector_counts

{'energy': 28,
 'transport': 69,
 'banking': 106,
 'financial market infrastructure': 7,
 'health': 0,
 'drinking water': 0,
 'waste water': 0,
 'digital infrastructure': 9,
 'public administration': 15,
 'space': 4,
 'food': 4,
 'Unknown': 1826}

### C. Security properties (CIA)

In [3]:
df = pd.read_csv("data/hacktivist_messages.csv", sep=";")
# pd.set_option('display.max_colwidth', None)
# df[["Text"]].sample(50)

In [17]:
# Regular expression pattern to match
pattern = re.compile(r'❌https:\/\/check-host', re.UNICODE)

# Filter rows where 'text' column contains the pattern
filtered_df = df[~df['Text'].str.contains(pattern, regex=True)]

print(filtered_df)

TypeError: bad operand type for unary ~: 'float'