In [8]:
import os
import thrember
import numpy as np
import pandas as pd
import polars as pl
import altair as alt
import lightgbm as lgb
import matplotlib.pylab as plt
from sklearn.metrics import roc_auc_score, roc_curve
# Select Altair's 'default' renderer for chart display. As the last expression
# of the cell, the value returned by this call is what the notebook echoes.
alt.renderers.enable('default')

RendererRegistry.enable('default')

In [9]:
# Root directory of the unzipped EMBER2024 download.
# Set the EMBER2024_DATA_DIR environment variable to point somewhere else
# without editing the notebook; when it is unset, the original location is used.
data_dir = os.environ.get("EMBER2024_DATA_DIR", "/data/EMBER2024/")  # or change this to where you unzipped the download

In [3]:
# Read the dataset metadata found under data_dir. The call returns three
# objects, unpacked here as the train, test and challenge splits.
train_df, test_df, challenge_df = thrember.read_metadata(data_dir)

In [10]:
# Add a 'week' column to the dataframe
# "week" is the number of whole 7-day periods elapsed since start_date.
start_date = pd.Timestamp("2023-09-24")
plotdf = (
    pl.concat([train_df, test_df])
    # Interpret first_submission_date as epoch seconds and expose it as a datetime.
    .with_columns(
        pl.from_epoch("first_submission_date", time_unit="s").alias("first_submission_dt")
    )
    # Kept as a second with_columns call because "week" reads the column
    # created by the call just above.
    .with_columns(
        ((pl.col("first_submission_dt") - pl.lit(start_date)).dt.total_days() // 7)
        .cast(pl.Int64)
        .alias("week")
    )
)

print(plotdf.shape)

# Plot file types across weeks
# One row per (file_type, week) pair, with the number of samples in that pair.
gbdf = plotdf.group_by(["file_type", "week"]).agg(pl.len().alias("count"))
alt.Chart(gbdf).mark_bar().encode(
    alt.X('week:O', axis=alt.Axis(title='Week First Seen')),
    # The y channel encodes the per-week sample count, so it is titled as a
    # count (it was previously titled 'File Type', which is the color channel).
    alt.Y('count:Q', axis=alt.Axis(title='Number of Files')),
    # NOTE(review): the scale sets only `range`, not `domain`, so the palette is
    # presumably paired with file types in the scale's default domain order
    # rather than the order of the legend `values` list -- TODO confirm the
    # intended color-to-type mapping.
    alt.Color(
        'file_type:N',
        scale=alt.Scale(range=["#4c78a8", "#54a24b", "#f58518",  "#88d27a",  "#9ecae9", "#ffbf79"]),
        legend=alt.Legend(values=["Win32", "Win64", "Dot_Net", "APK", "ELF", "PDF"]),
    ),
)

(3232000, 16)


In [5]:
# Get number of occurrences of each family
# Rows without a family label are dropped first. value_counts() yields a struct
# column named "family", which unnest() expands into "family" and "count".
# (An earlier, unfiltered value_counts pass was removed: its result was
# overwritten by this statement before ever being used.)
family_counts = (
    plotdf.filter(pl.col("family").is_not_null())
          .select(pl.col("family").value_counts())
          .unnest("family")
          .sort("count", descending=True)
)
family_counts.head(10)

family,count
str,u32
"""berbew""",174481
"""wacatac""",81478
"""expiro""",74339
"""cosmu""",53965
"""xmrig""",28903
"""upatre""",25296
"""sfone""",22177
"""glupteba""",21670
"""grandoreiro""",20551
"""flystudio""",18141


In [7]:
# Get number of occurrences of each behavior tag
# Keep only rows that carry at least one behavior tag, then emit one row per tag.
plotdf_explode = (
    plotdf
    .filter(pl.col("behavior").list.len() > 0)
    .explode("behavior")
)
# Tally rows per tag, most frequent first.
behavior_counts = (
    plotdf_explode
    .group_by("behavior")
    .agg(count=pl.len())
    .sort("count", descending=True)
)
behavior_counts.head(10)

behavior,count
str,u32
"""backdoor""",228349
"""virus""",121971
"""worm""",76115
"""downloader""",61523
"""spyware""",55780
"""coinminer""",37680
"""dropper""",33291
"""adware""",24867
"""phishing""",21782
"""ransom""",16279
