In [60]:
import pandas as pd
import numpy as np
import nltk, datetime, re
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz, process
from IPython.display import HTML

In [125]:
#Run notebook in conda env lexnlp
import lexnlp.extract.en.definitions
import lexnlp.extract.en.amounts
import lexnlp.extract.en.regulations



In [25]:
def test_all_options(string1, string2):
    ratio = fuzz.ratio(string1, string2)
    partial_ratio = fuzz.partial_ratio(string1, string2)
    token_sort = fuzz.token_sort_ratio(string1, string2)
    token_set = fuzz.token_set_ratio(string1, string2)
    print(string1 + ', ' + string2 + '\n' +
        'ratio: ' + str(ratio) + '\n'
        'partial_ratio: ' + str(ratio) + '\n'
        'token_sort: ' + str(ratio) + '\n'
        'token_set: ' + str(ratio) + '\n')

In [52]:
test_all_options('Vornado', 'Vornado LLC')

Vornado, Vornado LLC
ratio: 78
partial_ratio: 78
token_sort: 78
token_set: 78



In [120]:
(pd.Series(list(zip(*process.extract('Bosch', ['Bosch', 'BSCH HOME APPLIANCE CORPORATION', 'Vornado Air LLC', '(1/2/3) Bosch: blah blah'])))[1]) > 50).any()

True

In [54]:
pd.set_option('display.max_columns', 0)
pd.set_option('display.max_rows', 0)
pd.set_option('expand_frame_repr', False)

In [17]:
recalls = pd.read_csv('recalls.csv', 
                      dtype={'RecallID':'Int64', 'RecallNumber': str, 'RecallDate': str,
                             'Description': str, 'URL': str, 'Title': str, 'ConsumerContact': str,
                             'LastPublishDate': str, 'Images': 'object', 'SoldAtLabel': str,
                             'Distributors_CompanyID': 'Int64', 'DistributorsSName': str, 
                             'Hazards_HazardType': str, 'Hazards_HazardTypeID': str, 'Hazards_Name': str,
                             'Importers_CompanyID': 'Int64', 'Importers_Name': str, 'Inconjunctions_URL': str,
                             'Injuries_Name': str, 'ManufacturerCountries_Country': str, 
                             'Manufacturers_CompanyID': 'Int64', 'Manufacturers_Name': str, 'ProductUPCs_UPC': str,
                             'Products_CategoryID': 'Int64', 'Products_Description': str, 'Products_Model': str,
                             'Products_Name': str, 'Products_NumberOfUnits': str, 'Products_Type': str,
                             'Remedies_Name': str, 'RemedyOptions_Option': str, 'Retailers_CompanyID': str,
                             'Retailers_Name': str})

In [18]:
#Munge and infill whatever information we can extract from the recall descriptions

##UPCs loaded as strings; clean for whitespace & non-numeric chars
recalls['ProductUPCs_UPC'] = recalls['ProductUPCs_UPC'].str.replace(' |-|\.', '')
recalls = recalls.rename(columns={'ProductUPCs_UPC': 'UPC'})
#Extract unit numbers from string phrases (e.g. "About 35")
num_units = recalls['Products_NumberOfUnits'].str.replace(',', '')
num_units = num_units.str.extract(r'(\d+)', expand=False).astype('float')
recalls['Products_NumberOfUnits'] = num_units
#Extract total number of complaints from the string column
#Parse dates from strings
recalls['RecallDate'] = pd.to_datetime(recalls['RecallDate'])
recalls['LastPublishDate'] = pd.to_datetime(recalls['LastPublishDate'])
#Break the standardized titles into helpful fields; standardized
#titles take the form "[Company] recalls [product] due to [hazard]"
titles = recalls['Title'].str.split('[Announce]?[s]?Recall[s]?[ed]?|Due to', expand=True)
titles = titles.rename(columns={0: 'CompanyShortname', 1: 'ProductsShortname', 2: 'HazardAlt'})
recalls = pd.concat([recalls, titles], axis=1)

In [68]:
#Vectorize over a series of strings
def extract_probable_specifiers(text):
    if pd.isnull(text):
        text=''
    pattern = r"(([0-9A-Z])+[a-z]*([\\-]?[\\.*]?[0-9A-Z]*)*){2,}"
    matches = re.finditer(pattern, text)
    unique_matches = set([match.group() for matchNum, match in enumerate(matches)])
    return list(unique_matches)

In [130]:
# TODO combine vornado and bosch for a test set
reports = v_reports
recalls = v_recalls

#Funnel match
#Preparation: extract possible brands from the fields likely to contain them, for both the reports and the recalls.
#Brands may not be referred to by a consistent name across the two datasets.
brand_from_comments = [report[1] if not isinstance(report, float) else '' for report in reports['Company Comments'].str.split('\\) |:')]
reports['candidate_brand'] = list(zip(reports['Brand'], 
                                           reports['Manufacturer / Importer / Private Labeler Name'],
                                           brand_from_comments))
recalls['candidate_brand'] = list(zip(recalls['CompanyShortname'], recalls['Manufacturers_Name'],
                                     recalls['Distributors_Name'], recalls['Importers_Name']))

#Preparation: extract alphanumeric strings that are likely to be model numbers, serial numbers, or UPCs.
#Recall notices are very unlikely to have the dedicated fields populated, but tend to mention them in the
#text of the recall announcement.
#Reports tend to have this information in the dedicated fields, but as a precaution we also try to pull it
#from the unstructured text.

reports['specifiers'] = [extract_probable_specifiers(report) for report in 
                         [reports['Product Description'] + ' '+ reports['Incident Description']][0]]

recalls['specifiers'] = [extract_probable_specifiers(recall) for recall in recalls['Description']]

#Phase 1
#For each complaint:
#for each candidate brand: 'CompanyShortname', 'Manufacturers_Name', 'Distributors_Name', 'Importers_Name'
#fuzzy match to the possible brands list from each recall notice
#if any matches score > 50, save recall as candidate (add a column that contains a list of probable recall IDs)
#if no matches score > 50, label complaint as "no recall"
possible_matches = []
for i in range(len(reports)):
    report = reports.iloc[i, :]
    match_ids = []
    for c in report['candidate_brand']:
        match_ids += [recalls.iloc[r]['RecallID'] for r in range(len(recalls)) if
                        (pd.Series(list(zip(*process.extract(c, recalls.iloc[r]['candidate_brand'])))[1]) > 50).any()]
    possible_matches.append(list(set(match_ids)))
#Phase 2
#For all complaints that have candidate recall numbers:
#For each candidate recall:
#fuzzy match to possible products using product type from report & product name from recall; retain match if score > 50
#on any
#finally, take the specifiers extracted from the recall description and look for an exact match in the following order:
#Model name or number
#serial number
#UPC
#specifiers extracted from the product description
#speficiers extracted from incident description

In [69]:
descs = [extract_probable_specifiers(dsc) for dsc in reports['Incident Description'].sample(n=10)]

In [131]:
brand_matches

[]

In [469]:
recalls['CompanyShortname'].iloc[0:10]

0            The Thompson’s Company 
1    Boston Warehouse Trading Corp. 
2                      Libbey Glass 
3        Thesaurus Global Marketing 
4            Bosch Thermotechnology 
5                          Hallmark 
6                         BCI Burke 
7                               BMC 
8                          Toysmith 
9               Hillsdale Furniture 
Name: CompanyShortname, dtype: object

In [5]:
reports = pd.read_csv('SPDB/IncidentReports.csv', encoding="ISO-8859-1", dtype='object')

In [72]:
def get_matched_brands(brand, reports, recalls):
    brand_reports = reports[reports['Manufacturer / Importer / Private Labeler Name'].str.contains(brand, case=False) 
                            | reports['Brand'].str.contains(brand, case=False)
                            | reports['Incident Description'].str.contains(brand, case=False)]
    brand_recalls = recalls[recalls['CompanyShortname'].str.contains(brand, case=False) 
                            | recalls['Manufacturers_Name'].str.contains(brand, case=False) 
                            | recalls['Importers_Name'].str.contains(brand, case=False) 
                            | recalls['Distributors_Name'].str.contains(brand, case=False) 
                            | recalls['Retailers_Name'].str.contains(brand, case=False)]
    return (brand_reports, brand_recalls)


In [93]:
v_reports, v_recalls = get_matched_brands('Vornado', reports, recalls)

In [94]:
b_reports, b_recalls = get_matched_brands('Bosch', reports, recalls)