In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import logging
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

import common
import config

logging.basicConfig(level=logging.DEBUG)
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)

matplotlib_logger = logging.getLogger("matplotlib")
matplotlib_logger.setLevel(logging.WARNING)

In [3]:
datestr = "2024-06-26"

manager = common.DatasetManager(config.TLHOP_EPSS_REPORT_PATH)
manager.load_datasets([datestr])

votes_df = manager.votes_df.copy()
manager.join_votes_shodan_df(votes_df, datestr)

features_df = manager.build_features_df(votes_df)
shodan_full_df = manager.datestr2df[datestr]

INFO:root:Selected 2 of 2 commits, start=2024-05-02 end=2024-06-26


INFO:root:Loaded classifications for 43754 orgs
INFO:root:Loaded classifications for 247311 CVEs
INFO:root:KEV database has 1222 CVEs
  df = pandas.read_sql(query, connection)
INFO:root:Read table users from PostgreSQL, 8 rows
INFO:root:Read table votes from PostgreSQL, 1685 rows
INFO:root:Loaded 1685 votes from 8 users
INFO:root:Merged Shodan columns
INFO:root:Joined CVE columns
INFO:root:Merged organization features


In [6]:
pd.set_option("display.max_columns", None)

features_df.head()
# features_df[features_df["username"] == "cunha"]

Unnamed: 0,username,vote,port,device,devicetype,in_kev,cve_c_denial_of_service,cve_c_remote_code_execution,cve_c_information_disclosure,cve_c_buffer_overflow,cve_c_privilege_escalation,cve_c_cross_site_request_forgery,cve_c_cross_site_scripting,cve_c_sql_injection,num_vulns,num_crit_sev,num_high_sev,max_epss,max_cvss,num_hostnames,num_domains,num_cpes,org_c_cloud_computing,org_c_internet_service_provider,org_c_store,org_c_security,org_c_research,org_c_healthcare,org_c_bank,org_c_education,org_c_military,org_c_government
0,thelegendofgab,6,70,,,False,0.724041,0.074447,0.013659,0.095591,0.01762,0.060627,0.008745,0.00527,22.0,0.0,5.0,0.80243,7.5,1.0,1.0,3.0,0.788773,0.031545,0.0167,0.116006,0.016923,0.005996,0.005969,0.006002,0.00513,0.006955
1,thelegendofgab,5,80,,,True,0.9859,0.005328,0.002411,0.00135,0.001395,0.001554,0.001182,0.000881,11.0,1.0,7.0,0.73185,9.8,1.0,1.0,1.0,0.126765,0.435716,0.074767,0.03666,0.040236,0.059886,0.071733,0.04859,0.032534,0.073114
2,thelegendofgab,7,8443,,,True,0.013328,0.530928,0.005,0.239043,0.180293,0.016467,0.013258,0.001682,53.0,17.0,11.0,0.97472,9.8,3.0,1.0,2.0,0.015326,0.920596,0.025628,0.009945,0.010485,0.003004,0.003419,0.004674,0.003514,0.003409
3,thelegendofgab,8,9443,,,True,0.027171,0.408511,0.208021,0.023378,0.046492,0.049184,0.211155,0.026088,134.0,58.0,11.0,0.96683,9.8,1.0,1.0,1.0,0.788773,0.031545,0.0167,0.116006,0.016923,0.005996,0.005969,0.006002,0.00513,0.006955
4,thelegendofgab,9,80,,,False,0.004925,0.98106,0.004975,0.001065,0.004634,0.001124,0.001167,0.001049,51.0,15.0,6.0,0.9687,9.8,1.0,1.0,6.0,0.460607,0.108388,0.058657,0.067615,0.142052,0.035229,0.027341,0.042349,0.02872,0.029042


In [7]:
def train_model(features_df: pd.DataFrame) -> tuple[XGBRegressor, list[str]]:
    x_train = features_df.drop(columns=["username", "vote"])
    feature_names = x_train.columns

    y_train = features_df["vote"]

    model = XGBRegressor(
        n_estimators=10000,
        learning_rate=0.1,
        max_depth=30,
        verbosity=0,
        random_state=config.RANDOM_STATE,
        enable_categorical=True,
    )
    model.fit(x_train, y_train)
    return model, list(feature_names)


user2model = {}
for user in votes_df["username"].unique():
# for user in ["cunha"]:
    logging.info("Training model for user %s", user)
    filtered_features_df = features_df[features_df["username"] == user].copy()
    user2model[user], feature_names = train_model(filtered_features_df)


logging.info("Training model for all users")
all_features_df = features_df.copy()
all_features_df["username"] = "all"
user2model["all"], feature_names = train_model(all_features_df)

INFO:root:Training model for user thelegendofgab


INFO:root:Training model for user chicoin
INFO:root:Training model for user leoomaia
INFO:root:Training model for user thiagohbs
INFO:root:Training model for user cunha
INFO:root:Training model for user pep
INFO:root:Training model for all users


In [8]:
user2predictions = {}
shodan_df = manager.datestr2df[datestr].copy()
for user, model in user2model.items():
    logging.info("Generating predictions for user %s", user)
    full_features_df = manager.build_features_df(shodan_df, votes=False)
    predictions = model.predict(full_features_df)
    shodan_df[f'predicted_vote_{user}'] = predictions
    user2predictions[user] = predictions

INFO:root:Generating predictions for user thelegendofgab


INFO:root:Joined CVE columns
INFO:root:Merged organization features
INFO:root:Generating predictions for user chicoin
INFO:root:Joined CVE columns
INFO:root:Merged organization features
INFO:root:Generating predictions for user leoomaia
INFO:root:Joined CVE columns
INFO:root:Merged organization features
INFO:root:Generating predictions for user thiagohbs
INFO:root:Joined CVE columns
INFO:root:Merged organization features
INFO:root:Generating predictions for user cunha
INFO:root:Joined CVE columns
INFO:root:Merged organization features
INFO:root:Generating predictions for user pep
INFO:root:Joined CVE columns
INFO:root:Merged organization features
INFO:root:Generating predictions for user all
INFO:root:Joined CVE columns
INFO:root:Merged organization features


In [None]:
pd.set_option("display.max_columns", None)
shodan_df.head()

In [None]:
os.makedirs(config.OUTPUT_PATH / "batch6", exist_ok=True)
shodan_df.to_csv(config.OUTPUT_PATH / "batch6/full.csv")
print(shodan_df.dtypes)

meta_id                                       object
timestamp                        datetime64[us, UTC]
ip_str                                        object
org                                           object
org_clean                                     object
isp                                           object
data                                          object
port                                        category
hostnames                                     object
domains                                       object
city                                          object
region_code                                   object
latitude                                     float64
longitude                                    float64
os                                            object
device                                      category
devicetype                                  category
cpe23                                         object
http                                          

In [None]:
for user in user2model:
    top100_user_df = shodan_df.sort_values(by=f'predicted_vote_{user}', ascending=False).head(100)
    top100_all_df = shodan_df.sort_values(by='predicted_vote_all', ascending=False).head(100)
    sampled10_user_df = top100_user_df.sample(n=10)
    sampled10_all_df = top100_all_df.sample(n=10)
    sampled20_df = pd.concat([sampled10_user_df, sampled10_all_df])
    os.makedirs(config.OUTPUT_PATH / f"batch4/{user}/", exist_ok=True)
    sampled20_df.to_parquet(config.OUTPUT_PATH / f"batch4/{user}/records.parquet")

meta_id                                       object
timestamp                        datetime64[us, UTC]
ip_str                                        object
org                                           object
org_clean                                     object
isp                                           object
data                                          object
port                                        category
hostnames                                     object
domains                                       object
city                                          object
region_code                                   object
latitude                                     float64
longitude                                    float64
os                                            object
device                                      category
devicetype                                  category
cpe23                                         object
http                                          

In [None]:
for user in votes_df["username"].unique():
    model = user2model[user]
    plt.figure(figsize=(10, 5))
    plt.barh(feature_names, model.feature_importances_)
    plt.xlabel("Importance")
    plt.title(f"Feature Importance for User {user} from XGBRegressor")
    plt.show()

In [None]:
for user in votes_df["username"].unique():
    predictions = user2predictions[user]
    plt.hist(predictions, bins=20)
    plt.xlabel("Predicted Vote")
    plt.ylabel("Frequency")
    plt.title(f"Distribution of Predicted Votes for User {user}")
    plt.show()