In [1]:
import pandas as pd
from tqdm import tqdm

from src.parsing.regex_classifier import attempt_to_classify, construct_according_to_classification

# Register tqdm's pandas integration so that the `progress_apply` calls used
# further down in this notebook are available on Series/DataFrame objects.
tqdm.pandas()

# Load the ticket dataset; the path is relative to the notebook's working directory.
db = pd.read_csv('../resources/acme_security_tickets.csv')

# regex performance on dataset
Starting simply: how well does a plain regex recognize and classify the requests that we are getting?

In [2]:
# Classify each ticket's free-text `details` with the regex classifier, then
# replace underscores with spaces so the label can be compared with the
# `request_type` column.
db['regex_classification'] = (
    db['details']
    .progress_apply(attempt_to_classify)
    .apply(lambda txt: txt.replace('_', ' '))
)

requests_by_types = db.groupby('request_type')

# Per-type hit rate: the fraction of tickets whose regex label equals the
# recorded request type, ignoring case.
for request_type, requests in requests_by_types:
    # Vectorized column comparison instead of a row-wise apply(axis=1) lambda.
    is_hit = requests['request_type'].str.upper() == requests['regex_classification'].str.upper()
    hit_rate = is_hit.sum() / requests.shape[0]
    print(f"hit rate for type {request_type}: {hit_rate * 100:.2f}%")

100%|██████████| 1000/1000 [00:00<00:00, 98052.74it/s]

hit rate for type Cloud Resource Access: 100.00%
hit rate for type Data Export: 100.00%
hit rate for type DevTool Install: 100.00%
hit rate for type Firewall Change: 100.00%
hit rate for type Network Access: 100.00%
hit rate for type Permission Change: 100.00%
hit rate for type Vendor Approval: 100.00%





It seems that using regex just to classify the requests is **highly accurate**, but I would guess that this indicates a data issue rather than real-world performance.
Even so, I will try to move on and see how far I can go with just this simple approach.

Let's start by examining every mandatory field and seeing how we extract it from the data.

In [3]:
def _build_request(txt):
    """Classify a ticket's text and construct the request object for that classification."""
    return construct_according_to_classification(attempt_to_classify(txt), txt)


# First pass builds one request object per ticket; second pass asks each
# object whether it is valid. Both passes report progress.
constructed_requests = db['details'].progress_apply(_build_request)
db['regex_construction_validity'] = constructed_requests.progress_apply(lambda req: req.is_valid())

# Share of validly constructed requests within each request type.
for request_type, requests in db.groupby('request_type'):
    valid_count = requests['regex_construction_validity'].sum()
    validity_rate = valid_count / requests.shape[0]
    print(f"successful construction rate for type {request_type}: {validity_rate * 100:.2f}%")

# Reference table: the distinct mandatory-field lists per request type.
(
    db[['request_type', 'mandatory_fields']]
    .drop_duplicates()
    .sort_values(by=['request_type'])
)

100%|██████████| 1000/1000 [06:58<00:00,  2.39it/s]
100%|██████████| 1000/1000 [00:00<00:00, 657723.69it/s]

hit rate for type Cloud Resource Access: 100.00%
hit rate for type Data Export: 0.00%
hit rate for type DevTool Install: 0.00%
hit rate for type Firewall Change: 100.00%
hit rate for type Network Access: 0.00%
hit rate for type Permission Change: 100.00%
hit rate for type Vendor Approval: 100.00%





Unnamed: 0,request_type,mandatory_fields
8,Cloud Resource Access,Business Justification; Data Sensitivity Level
7,Data Export,Business Justification; PII Involved; Data Des...
14,DevTool Install,Business Justification; Team Lead Approval
4,Firewall Change,Business Justification; Source System; Destina...
3,Network Access,Business Justification; Source CIDR; Approval ...
0,Permission Change,Business Justification; Duration; Manager Appr...
1,Vendor Approval,Vendor Security Questionnaire; Data Classifica...


In [None]:
# Inspect the Data Export tickets: select rows and columns in one .loc call.
db.loc[
    db['request_type'] == 'Data Export',
    ['details', 'mandatory_fields', 'security_risk_score'],
]