##### 1. Import Dependencies

In [2]:
import pandas as pd
import os, yaml, re

##### 2. Verify Source File

In [4]:
def checkSourceFiles():
    """Check that the timeline export is available at 'source/timeline.csv'.

    The 'source/' directory is created (and a notice printed) when it does
    not exist yet. The CSV itself is never read here; only its presence is
    tested.

    Returns:
        bool: True when 'source/timeline.csv' is an existing file, otherwise
        False (after printing a hint on where to place the file).
    """
    source_dir = 'source'

    # Create the input directory up front so there is a place to drop the CSV
    if not os.path.exists(source_dir):
        os.makedirs(source_dir)
        print("Created 'source/' directory.")

    # Presence check only; loading happens in the caller
    timeline_found = os.path.isfile(os.path.join(source_dir, 'timeline.csv'))
    if not timeline_found:
        print("'timeline.csv' not found inside 'source/'. Please place it there before running this script.")

    return timeline_found

if checkSourceFiles():
    # Only read the CSV once its presence has been confirmed
    df = pd.read_csv('source/timeline.csv', low_memory=False)

    # Schema first, then a plain-text preview of the leading rows
    print(df.columns)
    print(df.head())

    # Number of null cells per column, laid out as a two-column table
    missing = df.isna().sum()
    missing_df = pd.DataFrame({"Column": missing.index, "Missing Values": missing.values})

    print(missing_df)


Index(['date', 'time', 'timezone', 'MACB', 'source', 'sourcetype', 'type',
       'user', 'host', 'short', 'desc', 'version', 'filename', 'inode',
       'notes', 'format', 'extra'],
      dtype='object')
         date      time timezone  MACB   source      sourcetype  \
0  01/01/1601  00:00:00      UTC  .A..  WEBHIST  Chrome Cookies   
1  01/01/1601  00:00:00      UTC  .A..  WEBHIST  Chrome Cookies   
2  01/01/1601  00:00:00      UTC  .A..  WEBHIST  Chrome Cookies   
3  00/00/0000  --:--:--        -  M...     FILE       File stat   
4  00/00/0000  --:--:--        -  M...     FILE       File stat   

                        type user      host  \
0           Last Access Time    -  TIMELORD   
1           Last Access Time    -  TIMELORD   
2           Last Access Time    -  TIMELORD   
3  Content Modification Time    -  TIMELORD   
4  Content Modification Time    -  TIMELORD   

                                               short  \
0                                     bing.cn (MUID) 

In [5]:
# Rich (table) display of the leading rows of the loaded timeline
df.head()

Unnamed: 0,date,time,timezone,MACB,source,sourcetype,type,user,host,short,desc,version,filename,inode,notes,format,extra
0,01/01/1601,00:00:00,UTC,.A..,WEBHIST,Chrome Cookies,Last Access Time,-,TIMELORD,bing.cn (MUID),http://bing.cn/ (MUID) Flags: [HTTP only] = Fa...,2,NTFS:\Users\timel\AppData\Local\Packages\Micro...,191216,-,sqlite/chrome_66_cookies,data: ; host: bing.cn; path: /; secure: False;...
1,01/01/1601,00:00:00,UTC,.A..,WEBHIST,Chrome Cookies,Last Access Time,-,TIMELORD,msn.cn (MUID),http://msn.cn/ (MUID) Flags: [HTTP only] = Fal...,2,NTFS:\Users\timel\AppData\Local\Packages\Micro...,191216,-,sqlite/chrome_66_cookies,data: ; host: msn.cn; path: /; secure: False; ...
2,01/01/1601,00:00:00,UTC,.A..,WEBHIST,Chrome Cookies,Last Access Time,-,TIMELORD,bing.com (MUID),http://bing.com/ (MUID) Flags: [HTTP only] = F...,2,NTFS:\Users\timel\AppData\Local\Packages\Micro...,191216,-,sqlite/chrome_66_cookies,data: ; host: bing.com; path: /; secure: False...
3,00/00/0000,--:--:--,-,M...,FILE,File stat,Content Modification Time,-,TIMELORD,\Program Files\WindowsApps\MSTeams_25212.2204....,GZIP:\Program Files\WindowsApps\MSTeams_25212....,2,GZIP:\Program Files\WindowsApps\MSTeams_25212....,-,-,filestat,file_size: 52650; file_system_type: GZIP; is_a...
4,00/00/0000,--:--:--,-,M...,FILE,File stat,Content Modification Time,-,TIMELORD,\Program Files\WindowsApps\MSTeams_25212.2204....,GZIP:\Program Files\WindowsApps\MSTeams_25212....,2,GZIP:\Program Files\WindowsApps\MSTeams_25212....,-,-,filestat,file_size: 101278; file_system_type: GZIP; is_...


In [6]:
# Show the distinct values of each categorical column, one labelled line per column
for label, column in (
    ("Timezone", "timezone"),
    ("Source", "source"),
    ("Source Types", "sourcetype"),
    ("Type", "type"),
):
    print(f"{label}: {df[column].unique()}")

Timezone: ['UTC' '-']
Source: ['WEBHIST' 'FILE' 'PE' 'LNK' 'REG' 'AMCACHE' 'LOG' 'OLECF' 'EVT']
Source Types: ['Chrome Cookies' 'File stat' 'PE/COFF file' 'Windows Shortcut'
 'UserAssist Registry Key' 'AppCompatCache Registry Key'
 'Amcache Registry Entry' 'Registry Key' 'File entry shell item' 'System'
 'Service/Driver Configuration Registry Key' 'OLECF Item'
 'OLECF Summary Info' 'Run/Run Once Registry Key'
 'Task Cache Registry Key' 'Winlogon Registry Key'
 'Typed URLs Registry Key' 'Setup API Log' 'WinEVTX'
 'Background Activity Moderator Registry Key'
 'User Account Information Registry Key' 'BagMRU Registry Key'
 'Chrome History' 'Program Compatibility Assistant (PCA) Log'
 'Network Connection Registry Key' 'USB Registry Key' 'Chrome Cache'
 'NTFS USN change' 'MRUListEx Registry Key' 'Shutdown Registry Key']
Type: ['Last Access Time' 'Content Modification Time' 'Expiration Time'
 'Creation Time' 'Content Modification Time; Creation Time' 'Not a time'
 'Link Time' 'Creation Time; 

##### 3. Data Cleaning & Processing

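- Drop records that are out of scope (rows whose `source` is `WEBHIST`)
- Parse `date` and `time` into a single `datetime` column and flag invalid or placeholder (year 1601) timestamps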

In [7]:
# Drop out-of-scope WEBHIST rows. The explicit .copy() detaches the filtered
# frame from the one it was sliced from, so the column assignments performed
# later by processTimestamps write to an independent frame instead of a slice
# (which would raise SettingWithCopyWarning).
df = df[~df["source"].isin(["WEBHIST"])].copy()

def processTimestamps(df):
    """Add parsed-timestamp columns to ``df`` and return the same frame.

    The ``date`` and ``time`` columns are stringified, stripped and joined
    with a space, then parsed with the fixed format "%m/%d/%Y %H:%M:%S".
    Values that do not parse become NaT.

    Columns written (in place):
        datetime:      the parsed timestamp (NaT when parsing failed)
        is_valid_time: False when ``datetime`` is NaT or falls in the year
                       1601 (placeholder value), True otherwise

    Args:
        df: DataFrame with ``date`` and ``time`` columns.

    Returns:
        The input DataFrame, with the two columns added.
    """
    date_part = df["date"].astype(str).str.strip()
    time_part = df["time"].astype(str).str.strip()

    # errors="coerce" turns unparseable combinations into NaT instead of raising
    parsed = pd.to_datetime(
        date_part + " " + time_part,
        format="%m/%d/%Y %H:%M:%S",
        errors="coerce",
    )
    df["datetime"] = parsed

    # A timestamp is usable only if it parsed and is not the 1601 placeholder
    is_placeholder = parsed.dt.year == 1601
    df["is_valid_time"] = ~(parsed.isna() | is_placeholder)

    return df

# Rebind df to the frame returned by processTimestamps (now carrying the
# 'datetime' and 'is_valid_time' columns), then preview the leading rows
df = processTimestamps(df)
df.head()

Unnamed: 0,date,time,timezone,MACB,source,sourcetype,type,user,host,short,desc,version,filename,inode,notes,format,extra,datetime,is_valid_time
3,00/00/0000,--:--:--,-,M...,FILE,File stat,Content Modification Time,-,TIMELORD,\Program Files\WindowsApps\MSTeams_25212.2204....,GZIP:\Program Files\WindowsApps\MSTeams_25212....,2,GZIP:\Program Files\WindowsApps\MSTeams_25212....,-,-,filestat,file_size: 52650; file_system_type: GZIP; is_a...,NaT,False
4,00/00/0000,--:--:--,-,M...,FILE,File stat,Content Modification Time,-,TIMELORD,\Program Files\WindowsApps\MSTeams_25212.2204....,GZIP:\Program Files\WindowsApps\MSTeams_25212....,2,GZIP:\Program Files\WindowsApps\MSTeams_25212....,-,-,filestat,file_size: 101278; file_system_type: GZIP; is_...,NaT,False
24,00/00/0000,--:--:--,-,M...,FILE,File stat,Content Modification Time,-,TIMELORD,\Program Files\WindowsApps\MSTeams_25212.2204....,GZIP:\Program Files\WindowsApps\MSTeams_25212....,2,GZIP:\Program Files\WindowsApps\MSTeams_25212....,-,-,filestat,file_size: 86012; file_system_type: GZIP; is_a...,NaT,False
25,00/00/0000,--:--:--,-,M...,FILE,File stat,Content Modification Time,-,TIMELORD,\Program Files\WindowsApps\MSTeams_25212.2204....,GZIP:\Program Files\WindowsApps\MSTeams_25212....,2,GZIP:\Program Files\WindowsApps\MSTeams_25212....,-,-,filestat,file_size: 17567; file_system_type: GZIP; is_a...,NaT,False
26,00/00/0000,--:--:--,-,M...,FILE,File stat,Content Modification Time,-,TIMELORD,\Program Files\WindowsApps\MSTeams_25212.2204....,GZIP:\Program Files\WindowsApps\MSTeams_25212....,2,GZIP:\Program Files\WindowsApps\MSTeams_25212....,-,-,filestat,file_size: 144861; file_system_type: GZIP; is_...,NaT,False


##### 4. Form Linked Entities

In [None]:
# Module-level store filled by deriveLinkedEntities:
#   filename -> log type (e.g. "$MFT") -> list of event dicts
linkedEntities = {}

def _normalize_text(value):
    """Return ``value`` lower-cased and stripped; missing values become ''."""
    if isinstance(value, str):
        return value.lower().strip()
    # None / NaN / NaT / pd.NA stand for an empty cell
    if value is None or pd.isna(value):
        return ""
    return str(value).lower().strip()


def deriveLinkedEntities(row, entities=None):
    """Derivation of linked entities ID based on analysis of sources.

    A row is treated as a $MFT entry when its ``source`` is "file", its
    ``sourcetype`` is "file stat" (both compared case-insensitively) and its
    ``short`` text does not contain "prefetch". Such a row is appended to
    ``entities[filename]["$MFT"]`` as a dict with the keys ``datetime``,
    ``isValidTime``, ``shorts`` and ``macb``. Every other row is ignored.

    Text fields are normalized with ``_normalize_text``, so an empty cell
    (None/NaN) is handled as '' instead of raising on ``.lower()``.

    Args:
        row: Record exposing ``.get(key)`` (a DataFrame row Series or a dict)
            with the keys ``source``, ``sourcetype``, ``short``, ``filename``,
            ``MACB``, ``datetime`` and ``is_valid_time``.
        entities: Optional dict to record into. When omitted, the module-level
            ``linkedEntities`` dict is used.

    Returns:
        None. The result is the mutation of ``entities``.
    """
    if entities is None:
        entities = linkedEntities

    src = _normalize_text(row.get("source"))
    srctype = _normalize_text(row.get("sourcetype"))
    short = _normalize_text(row.get("short"))
    filename = _normalize_text(row.get("filename"))
    macb = _normalize_text(row.get("MACB"))
    timestamp = row.get("datetime")
    isValidTime = row.get("is_valid_time")

    # Convert timestamp for readability; a missing/NaT timestamp is stored as None
    datetime_str = timestamp.strftime("%Y-%m-%d %H:%M:%S") if pd.notna(timestamp) else None

    # Anchor using $MFT entries: prefetch & MFT share the same src & srctype; can be differentiated using 'short'
    if src == "file" and srctype == "file stat" and "prefetch" not in short:
        logType = "$MFT"

        # Create the per-file and per-log-type containers on first use
        events = entities.setdefault(filename, {}).setdefault(logType, [])
        events.append({
            "datetime": datetime_str,
            "isValidTime": isValidTime,
            "shorts": short,
            "macb": macb
        })

# Feed every timeline row to deriveLinkedEntities for its side effect on
# linkedEntities. A plain loop is used rather than df.apply(..., axis=1) so
# that no throwaway Series of None values is built and echoed as cell output.
for _row_index, row in df.iterrows():
    deriveLinkedEntities(row)

# Note: Avoid printing in Jupyter to prevent truncation
# print(linkedEntities)


##### x. Parse YAML Rules File (To Be Completed After Linking Entities & Logic For Detection)

In [None]:
# Load the detection rules from the "rules" key of rules.yaml. The encoding is
# pinned to UTF-8 so that reading the file does not depend on the platform's
# default locale encoding.
with open("rules.yaml", "r", encoding="utf-8") as f:
    rule_config = yaml.safe_load(f)["rules"]

def evaluate_rules(rule, events):
    condition = rule["condition"]

    # if "earlier_event" in condition and "later_event" in condition:
    #     # Identify all events matching the earlier_event criteria
    #     earlier_matches = events[events["desc"].str.contains("|".join(condition["earlier_event"]), case=False, na=False)]