In [2]:
import pandas as pd
import numpy as np

import re
import json
import sqlite3
import labels
import extract_msg

from model import SensitiveDataDetector
from const import *

In [3]:
labels_df = labels.load_labels()

In [4]:
FORMAT = "xlsx"

df_sens = labels_df[(labels_df["extension"] == FORMAT) & (labels_df["sensitive"] == True)]
df_non_sens = labels_df[(labels_df["extension"] == FORMAT) & (labels_df["sensitive"] == False)]

In [5]:
def regex_filter(string): 
    return {
        "email": re.search(PATTERN_EMAIL, string) is not None, 
        "iban": re.search(PATTERN_IBAN, string) is not None, 
        "rsa": re.search(PATTERN_RSA, string) is not None
    }

In [6]:
def is_sensitive(filename): 

    loader = pd.read_csv if ".csv" in filename else pd.read_excel
    df = loader(filename, nrows = 20)

    # Check 1: Has header
    has_header = not any([not isinstance(col, str) for col in df.columns])
    if has_header: # If there is a header

        headers = df.columns
        headers = [col.lower().strip() for col in df.columns]

        if sum(sens in headers for sens in SENSITIVE_HEADERS) > 1: 
            # print("Check 1", filename)
            return True

    # Check 2: Has email or IBAN

    df_string = df.__str__()
    regex = regex_filter(df_string)
    has_email, has_iban = regex["email"], regex["iban"]

    if has_email and not "git@" in df_string: # Only true if it isn't github email...
        # print("Check 2 (MAIL)", filename)
        return  True 
    
    if has_iban: 
        # print("Check 2 (IBAN)", filename)
        return True
    
    # Check 3: Has a sensitive token in it
    if sum(sens in df_string for sens in SENSITIVE_TOKENS) > 1: 
        # print("Check 3", filename)
        return True

    # print(filename)
    return False

In [7]:
np.mean([is_sensitive("../files/sorted/" + FORMAT + "/" + filename) for filename in df_sens["filename"]])
np.mean([is_sensitive("../files/sorted/" + FORMAT + "/" + filename) for filename in df_non_sens["filename"]])

0.0

In [8]:
FORMAT = "db"

df_sens = labels_df[(labels_df["extension"] == FORMAT) & (labels_df["sensitive"] == True)]
df_non_sens = labels_df[(labels_df["extension"] == FORMAT) & (labels_df["sensitive"] == False)]

In [9]:
def is_sensitive(filename, detector = None):

    con = sqlite3.connect(filename)

    table_names = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", con)
    df = pd.read_sql('SELECT * FROM ' + table_names.loc[0][0], con)

    header_sensitive = [col for col in df.columns if col in SENSITIVE_HEADERS]
    df_sensitive = df[header_sensitive]

    entries = ~(df_sensitive.apply(lambda x: isinstance(x, str) and x.strip() == '')) | (df_sensitive.isnull())
    has_multiple_sensitive = any(entries.sum(axis=1) > 1)

    return has_multiple_sensitive

In [10]:
np.mean([is_sensitive("../files/" + filename) for filename in df_sens["filename"]])
np.mean([is_sensitive("../files/" + filename) for filename in df_non_sens["filename"]])

1.0

In [11]:
FORMAT = "weird"

df_sens = labels_df[(labels_df["extension"] == FORMAT) & (labels_df["sensitive"] == True)]
df_non_sens = labels_df[(labels_df["extension"] == FORMAT) & (labels_df["sensitive"] == False)]

In [14]:
df_sens

Unnamed: 0,filename,sensitive,name,extension
16,time-rich-city-pattern-land-staff,True,time-rich-city-pattern-land-staff,weird
23,south-million-lead-billion-air,True,south-million-lead-billion-air,weird
27,summer-someone-cell-happy-around-red,True,summer-someone-cell-happy-around-red,weird
32,role-six-whole-single-enough-think,True,role-six-whole-single-enough-think,weird
34,necessary-receive-event,True,necessary-receive-event,weird
38,wait-business-must-we-just-yet,True,wait-business-must-we-just-yet,weird
414,such-prepare-song,True,such-prepare-song,weird
415,take-a-professor,True,take-a-professor,weird
420,themselves-agent,True,themselves-agent,weird


In [15]:
regex_filter(text)

NameError: name 'text' is not defined

In [16]:
FORMAT = "msg"

df_sens = labels_df[(labels_df["extension"] == FORMAT) & (labels_df["sensitive"] == True)]
df_non_sens = labels_df[(labels_df["extension"] == FORMAT) & (labels_df["sensitive"] == False)]

In [17]:
df_sens

Unnamed: 0,filename,sensitive,name,extension
4,debitis.msg,True,debitis,msg
5,incidunt-officia.msg,True,incidunt-officia,msg


In [None]:
df_non_sens

Unnamed: 0,filename,sensitive,name,extension
665,mollitia-quo-autem.msg,False,mollitia-quo-autem,msg
666,quaerat-a-quia.msg,False,quaerat-a-quia,msg
667,vel-fugiat.msg,False,vel-fugiat,msg
668,dolorum-voluptate.msg,False,dolorum-voluptate,msg
669,qui-debitis.msg,False,qui-debitis,msg


In [None]:
PATTERN_NEWLINE = re.compile(r"(?:\r\n|\r|\n)")
PATTERN_WHITESPACE = re.compile(r"\s\s+")

In [18]:
def is_sensitive(filename, detector = None): 

    msg = None

    for encoding in ENCODINGS: 
        try:
            msg = extract_msg.Message(filename, overrideEncoding=encoding)
            break
        except Exception:
            pass

    body = (msg.body + msg.subject).lower()
    body = re.sub(PATTERN_NEWLINE, " ", body)
    body = re.sub(PATTERN_WHITESPACE, " ", body)

    recipients = [recipient.name for recipient in msg.recipients]

    msg.close()

    # Check 1: Pair of email and same name in body
    for recipient in recipients: 
        names = recipient.split("@")
        if len(names) > 1: 
            names = names[0].split(".")
            count = sum([name in body for name in names])

            if count > 1: 
                return True

    # Check 2: Check for another clue
    clues = np.array(detector.is_sensitive(body))
    return clues.sum() > 1

In [19]:
detector = SensitiveDataDetector()

In [20]:
np.mean([is_sensitive("../files/" + filename, detector) for filename in df_sens["filename"]])

0 0 1 0


0.5

In [None]:
recipients = json

TypeError: string indices must be integers