In [1]:
# imports
import pandas as pd
import re
import data_utils as du

### A. Demographic areas

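We use the OntoNotes NER tags because the `MISC` tag of the regular tag set is too general. Of the OntoNotes tags, we are interested in `GPE` (geopolitical entities such as countries, cities and states) and `NORP` (nationalities and religious or political groups).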

In [9]:
# read the OntoNotes-tagged entities from disk and preview the first rows
ner_onto_df = pd.read_parquet(path="data/ner_tagged_data_onto.parquet")
ner_onto_df.head(n=5)

Unnamed: 0,message_ids,text,label
0,1,OSINT,ORG
1,1,Cyberknow20,PERSON
2,1,pro-Russian,NORP
3,2,Today,DATE
4,2,Poland,GPE


In [20]:
# keeping only the rows tagged as GPE or NORP
tags_of_interest = ["GPE", "NORP"]
tag_mask = ner_onto_df['label'].isin(tags_of_interest)
filtered_df = ner_onto_df[tag_mask]

# selecting the text column of the filtered df
text_set = set(filtered_df["text"])

# lower-casing all the elements of the set; non-string entries (e.g. missing
# values) cannot be lower-cased and can never name a country, so they are skipped
lowercase_set = {word.lower() for word in text_set if isinstance(word, str)}

# retrieving the countries that are contained within the defined set of countries
# NOTE(review): `eu_nordic_countries` is not defined in any cell visible here --
# confirm it is in scope after Restart & Run All (presumably it lives in
# `data_utils`). The membership test assumes its entries are lower-case; verify.
attacked_countries = [word for word in lowercase_set if word in eu_nordic_countries]

# title-casing every word of the name: `str.capitalize()` only upper-cases the
# first character and lower-cases the rest, which printed 'Czech republic'
capital_case_countries = [country.title() for country in attacked_countries]

# printing the resulting list of countries in alphabetical order
print(sorted(capital_case_countries))

# some quick summary statistics
total_countries = len(eu_nordic_countries)
number_attacked = len(attacked_countries)
# guard against an empty reference collection instead of raising ZeroDivisionError
perc_attacked = round(number_attacked / total_countries * 100, 2) if total_countries else 0.0
print(f"{number_attacked} out of {total_countries} EU and Nordic countries have been targeted")
print(f"This corresponds to {perc_attacked}% of all the EU and Nordic countries")

['Austria', 'Belgium', 'Croatia', 'Cyprus', 'Czech republic', 'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 'Norway', 'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden']
28 out of 28 EU and Nordic countries have been targeted
This corresponds to 100.0% of all the EU and Nordic countries


### B. Infrastructure sectors

In [32]:
# keep only the entities whose label is ORG
tags_of_interest = ["ORG"]
tag_mask = ner_onto_df["label"].isin(tags_of_interest)
filtered_df = ner_onto_df.loc[tag_mask]

# de-duplicate the entity texts so every distinct mention is counted once
text_set = {text for text in filtered_df["text"]}

# function to categorize organization
def categorize_organization(name):
    """Return the first sector in ``du.sectors_patterns`` whose regex matches ``name``.

    Parameters
    ----------
    name : str
        Organization text to classify. The search is case-insensitive.
        Non-string values (e.g. ``None`` or ``NaN`` from missing entity text)
        cannot be searched and are reported as ``'Unknown'`` instead of
        raising ``TypeError``.

    Returns
    -------
    str
        The key of the first matching pattern, in the dictionary's iteration
        order, or ``'Unknown'`` when nothing matches.
    """
    # re.search() only accepts str/bytes; missing values fall into the fallback bucket
    if not isinstance(name, str):
        return 'Unknown'
    for sector, pattern in du.sectors_patterns.items():
        if re.search(pattern, name, re.IGNORECASE):
            # first match wins, so the order of the patterns decides ties
            return sector
    return 'Unknown'

# start every known sector at zero so sectors without any hit still appear,
# and add the fallback bucket used when no pattern matches
sector_counts = dict.fromkeys(du.sectors_patterns, 0)
sector_counts["Unknown"] = 0

# tally each distinct organization under the sector it is categorized into
for org_name in text_set:
    sector_counts[categorize_organization(org_name)] += 1

sector_counts

{'energy': 28,
 'transport': 69,
 'banking': 106,
 'financial market infrastructure': 7,
 'health': 0,
 'drinking water': 0,
 'waste water': 0,
 'digital infrastructure': 9,
 'public administration': 15,
 'space': 4,
 'food': 4,
 'Unknown': 1826}

### C. Security properties (CIA)

In [4]:
# show complete cell contents so the full message text is visible
pd.options.display.max_colwidth = None

# load the messages and look at a few rows whose text ends with redundant information
df = pd.read_csv(filepath_or_buffer="data/hacktivist_messages.csv", sep=";")
df.iloc[130:140]

Unnamed: 0,Message Id,Datetime,Text
130,131,2022-12-21 19:12:25,The Latvian portal of the financial intelligence service is not working still🔥❌https://check-host.net/check-report/df61e8dk343🐻Subscribe to NoName057(16)🐻Join our DDoS-project🇷🇺Victory will be ours!
131,132,2022-12-22 11:02:56,🔥 Since yesterday the authorization service of the portal of grant projects of the State Agency for the Development of Education of Latvia haven't rehabilitated 🇱🇻 :❌ https://check-host.net/check-report/df78a8fk3ba🐻Subscribe to NoName057(16)🐻Join our DDoS-project🇷🇺Victory will be ours!
132,133,2022-12-23 11:07:07,"🔥Ziedot, a Latvian Russophobic charitable organization, started collecting donations to the Armed Forces of Ukraine, but we quickly reacted and the portal stopped working due to our DDoS attacks:❌https://check-host.net/check-report/df9cc89k288🐻Subscribe to NoName057(16)🐻Join our DDoS-project🇷🇺Victory will be ours!"
133,134,2022-12-23 11:28:13,"🔥As advised by subscribers, we are now conducting ""stress tests"" of sites😁The portal of the Court of Appeal in Rzeszow collapsed from stress:❌https://check-host.net/check-report/df9ce27k3a5🐻Subscribe to NoName057(16)🐻Join our DDoS-project🇷🇺Victory will be ours!"
134,135,2022-12-23 11:46:58,🔥The subdomain (job portal) of British munitions company Bae Systems did not pass our stress test:❌https://check-host.net/check-report/df9ce27k3a5🐻Subscribe to NoName057(16)🐻Join our DDoS-project🇷🇺Victory will be ours!
135,136,2022-12-24 11:23:08,📦Our DDoS-surprise was first accepted by the Polish portal of the Public Procurement Administration:❌https://check-host.net/check-report/dfc0281ka8e🐻Subscribe to NoName057(16)🐻Join our DDoS-project🇷🇺Victory will be ours!
136,137,2022-12-25 11:50:48,🔥There's again non-flying weather today in Poland due to ddos-hail:❌Civil Aviation Administration:https://check-host.net/check-report/dfe19c5k176❌Central database of reports of the Civil Aviation Authority:https://check-host.net/check-report/dfe1926k36c🐻Subscribe to NoName057(16)🐻Join our DDoS-project🇷🇺Victory will be ours!
137,138,2022-12-26 09:38:33,🔥The Latvian website of the Public Services Commission is not working today: ❌https://check-host.net/check-report/e00ae52kea4🐻Subscribe to NoName057(16)🐻Join our DDoS-project🇷🇺Victory will be ours!
138,139,2022-12-26 10:48:49,🚂The portal of the management company of Latvian Railways is also feeling bad today:❌https://check-host.net/check-report/e00e301k300🐻Subscribe to NoName057(16)🐻Join our DDoS-project🇷🇺Victory will be ours!
139,140,2022-12-26 13:40:04,"🚂The portal of the Latvian railway, as well as its subdomains, are feeling bad today:❌Latvian Railway:https://check-host.net/check-report/e012debk430❌Latvian Railway infrastructure:https://check-host.net/check-report/e012e6dk76❌Logistics Service:https://check-host.net/check-report/e012ec9kef8❌ Freight service:https://check-host.net/check-report/e012f3bka3f❌Rolling stock service:https://check-host.net/check-report/e012fe6kb5❌Security service:https://check-host.net/check-report/e01305eka35❌Electronic maintenance service of the railway system:https://check-host.net/check-report/e0130b5kca6❌Training Center:https://check-host.net/check-report/e0130f4kd05🐻Subscribe to NoName057(16)🐻Join our DDoS-project🇷🇺Victory will be ours!"


In [5]:
# function to cut off the redundant part of each message
def shorten_string(input_string):
    """Drop everything from the first '❌ ... https://check-host' occurrence onwards.

    The text is cut at the first '❌' that is followed (lazily, without
    crossing a newline) by 'https://check-host'. Strings without such an
    occurrence, and non-string values such as NaN, are returned unchanged.
    """
    # anything that is not a string is passed through untouched
    if not isinstance(input_string, str):
        return input_string

    hit = re.search(r'❌.*?https:\/\/check-host', input_string)

    # nothing to trim
    if hit is None:
        return input_string

    # keep only the part before the marker
    return input_string[:hit.start()]

# cut the redundant tail off every message, then look at the same rows again
df["Text"] = df["Text"].map(shorten_string)
df.iloc[130:140]

Unnamed: 0,Message Id,Datetime,Text
130,131,2022-12-21 19:12:25,The Latvian portal of the financial intelligence service is not working still🔥
131,132,2022-12-22 11:02:56,🔥 Since yesterday the authorization service of the portal of grant projects of the State Agency for the Development of Education of Latvia haven't rehabilitated 🇱🇻 :
132,133,2022-12-23 11:07:07,"🔥Ziedot, a Latvian Russophobic charitable organization, started collecting donations to the Armed Forces of Ukraine, but we quickly reacted and the portal stopped working due to our DDoS attacks:"
133,134,2022-12-23 11:28:13,"🔥As advised by subscribers, we are now conducting ""stress tests"" of sites😁The portal of the Court of Appeal in Rzeszow collapsed from stress:"
134,135,2022-12-23 11:46:58,🔥The subdomain (job portal) of British munitions company Bae Systems did not pass our stress test:
135,136,2022-12-24 11:23:08,📦Our DDoS-surprise was first accepted by the Polish portal of the Public Procurement Administration:
136,137,2022-12-25 11:50:48,🔥There's again non-flying weather today in Poland due to ddos-hail:
137,138,2022-12-26 09:38:33,🔥The Latvian website of the Public Services Commission is not working today:
138,139,2022-12-26 10:48:49,🚂The portal of the management company of Latvian Railways is also feeling bad today:
139,140,2022-12-26 13:40:04,"🚂The portal of the Latvian railway, as well as its subdomains, are feeling bad today:"


In [7]:
# distinct trimmed messages (duplicates collapse to a single entry)
message_set = {text for text in df["Text"]}

# function to categorize messages
def categorize_message(message):
    """Return the first principle in ``du.cia_principles_patterns`` whose regex matches.

    ``message`` is converted with ``str()`` before searching, so non-string
    values are matched on their string form. The search is case-insensitive;
    ``'Unknown'`` is returned when no pattern matches.
    """
    text = str(message)
    # lazy generator: patterns are tried in dictionary order and the search
    # stops at the first one that matches
    hits = (
        principle
        for principle, pattern in du.cia_principles_patterns.items()
        if re.search(pattern, text, re.IGNORECASE)
    )
    return next(hits, 'Unknown')

# start every known principle at zero so principles without any hit still appear,
# and add the fallback bucket used when no pattern matches
principle_counts = dict.fromkeys(du.cia_principles_patterns, 0)
principle_counts["Unknown"] = 0

# tally each distinct message under the principle it is categorized into
for msg in message_set:
    principle_counts[categorize_message(msg)] += 1

principle_counts

{'confidentiality': 5, 'integrity': 3, 'availability': 765, 'Unknown': 2046}