In [None]:
import pandas as pd
import re
import json
import csv
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
from src import common

Source of the name-similarity matcher used below: https://stackoverflow.com/questions/6400416/figure-out-if-a-business-name-is-very-similar-to-another-one-python

In [None]:
def to_clean_tokens(firmname):
    """Split a firm name into lower-case alphanumeric tokens without stop words.

    Each whitespace-separated word has every character outside
    ``[0-9a-zA-Z]`` removed and is lower-cased. Words that appear in NLTK's
    English stop-word list, and words left empty by the stripping, are
    dropped.

    Parameters
    ----------
    firmname : str
        Raw company name. Any non-string value (e.g. ``None`` or the ``NaN``
        pandas uses for a missing cell) yields an empty list.

    Returns
    -------
    list of str
        Cleaned tokens in their original order; repeated tokens are kept.
    """
    if not isinstance(firmname, str):
        return []

    # Build the stop-word set once up front instead of re-reading the NLTK
    # word list (and scanning it linearly) for every single token.
    english_stopwords = set(stopwords.words('english'))

    # str.split() without arguments already collapses runs of whitespace, so
    # no explicit double-space squeezing is needed beforehand.
    cleaned_words = (
        re.sub("[^0-9a-zA-Z]+", "", word).lower() for word in firmname.split()
    )
    return [
        token
        for token in cleaned_words
        if token and token not in english_stopwords
    ]


def sequence_uniqueness(tokens, token_frequency_dict):
    """Sum the inverse-square-root frequency weight of every token.

    A token whose count in ``token_frequency_dict`` is ``f`` contributes
    ``1 / f ** 0.5``, so rarely seen tokens weigh more than common ones.
    Every occurrence in ``tokens`` is counted; an empty ``tokens`` gives ``0``.
    """

    def weight(token):
        # Looks the token up directly, so it must be present with a non-zero count.
        return 1 / token_frequency_dict[token] ** 0.5

    return sum(map(weight, tokens))


def name_similarity(name_a, name_b, token_frequency):
    """Score how similar two tokenized names are.

    The score is the uniqueness of the tokens the two names share, divided by
    the geometric mean of each name's own uniqueness (uniqueness as computed
    by ``sequence_uniqueness``). Tokens are compared as sets, so a repeated
    token counts once on both sides of the fraction: ``['johnson', 'johnson']``
    compared with itself scores 1.0 instead of 0.5.

    Parameters
    ----------
    name_a, name_b : iterable of str
        Token sequences of the two names.
    token_frequency : mapping
        Token -> occurrence count. It is indexed directly for every token of
        both names, so each one needs a non-zero count.

    Returns
    -------
    float
        ``0.0`` when either name has no tokens; otherwise a value between 0
        and 1 (up to floating-point rounding), 1 meaning identical token sets.
    """
    a_tokens = set(name_a)
    b_tokens = set(name_b)

    # Normalize with the de-duplicated tokens: the shared-token numerator is
    # computed on sets, so the denominator must be as well or names with a
    # repeated token can never reach a score of 1.
    a_uniq = sequence_uniqueness(a_tokens, token_frequency)
    b_uniq = sequence_uniqueness(b_tokens, token_frequency)
    if a_uniq == 0 or b_uniq == 0:
        return 0.0

    shared_uniq = sequence_uniqueness(a_tokens & b_tokens, token_frequency)
    return shared_uniq / (a_uniq * b_uniq) ** 0.5

    
def build_token_frequency_table(token_lists):
    """Count how often each token occurs across all the given token lists.

    Tokens are converted with ``str`` before counting. Returns a ``Counter``
    mapping token -> number of occurrences.
    """
    frequency = Counter()
    for token_list in token_lists:
        frequency.update(str(token) for token in token_list)
    return frequency


def count_prc_existence(df_fortune, df_prc, threshold=0.7):
    """Count, per Fortune firm, the PRC records whose company name matches it.

    Both name columns are tokenized with ``to_clean_tokens``, a single token
    frequency table is built over the two sources together, and every firm is
    compared against every PRC company with ``name_similarity``.

    Parameters
    ----------
    df_fortune : pandas.DataFrame
        Must contain a ``'firm'`` column with the firm names.
    df_prc : pandas.DataFrame
        Must contain a ``'Company'`` column with the breached company names.
    threshold : float, optional
        Minimum similarity score (inclusive) for a PRC record to count as a
        match. Defaults to 0.7.

    Returns
    -------
    tuple
        ``(prc_existence, token_frequency)``: a dict mapping
        ``common.__hash(firmname)`` to the number of matching PRC records,
        and the token frequency ``Counter`` used for scoring.
    """
    fortune_companies = list(df_fortune['firm'].values)
    prc_companies = list(df_prc['Company'].values)

    fortune_companies_tokenized = [to_clean_tokens(name) for name in fortune_companies]
    prc_companies_tokenized = [to_clean_tokens(name) for name in prc_companies]

    # Frequencies are counted over both sources together so that every token
    # of either side has an entry when the names are scored.
    token_frequency = build_token_frequency_table(
        [*prc_companies_tokenized, *fortune_companies_tokenized]
    )

    prc_existence = {}
    for firmname, firmtokens in zip(fortune_companies, fortune_companies_tokenized):
        # Hash once per firm rather than once per matching record.
        firmhash = common.__hash(firmname)
        prc_existence[firmhash] = sum(
            name_similarity(firmtokens, matchtokens, token_frequency) >= threshold
            for matchtokens in prc_companies_tokenized
        )

    return prc_existence, token_frequency

In [None]:
# Load the PRC breach chronology and the Fortune 500 firm sample.
df_prc = pd.read_csv('../data/dbs/prc_data_breach_chronology.1.13.20.csv')
df_fortune = pd.read_csv('../data/fortune/f500_firm_sample.csv')

# Parse the month/day/year publication date so it can be compared as a date.
df_prc['Date Made Public'] = pd.to_datetime(
    df_prc['Date Made Public'],
    format='%m/%d/%Y',
)

# Keep only the breaches made public on or after 2010-01-01.
df_prc_filtered = df_prc[df_prc['Date Made Public'] >= '2010-01-01']

# The second return value (the token frequency table) is bound to `_`;
# the dev cell at the end of the notebook passes it to name_similarity.
firm_data_breaches, _ = count_prc_existence(
    df_fortune=df_fortune,
    df_prc=df_prc_filtered,
)

In [None]:
# One row per firm hash with its number of matched breach records; keys and
# values of the same dict iterate in the same order, so the columns line up.
df_firm_data_breaches = pd.DataFrame(
    data={
        'firmhash': list(firm_data_breaches),
        'n_data_breaches': list(firm_data_breaches.values()),
    },
)

In [None]:
# Persist the match counts twice: the raw dict as JSON and the table as CSV.
with open(
    file='../data/breaches/prc_firm_data_breach_matches.json',
    mode='w',
) as outstream:
    json.dump(obj=firm_data_breaches, fp=outstream)

df_firm_data_breaches.to_csv(
    path_or_buf='../data/breaches/prc_firm_data_breach_matches.csv',
    index=False,
)

### Development checks

In [None]:
# Spot-check a single pair of token lists. `_` is the token frequency table
# bound by `firm_data_breaches, _ = count_prc_existence(...)` in the
# data-loading cell, so that cell must have been run first.
name_similarity(
    name_a=['apple', 'inc', 'union', 'llc'],
    name_b=['apple'],
    token_frequency=_,
)