In [218]:
import pandas as pd
import numpy as np
import datetime, re, warnings, string
from fuzzywuzzy import fuzz, process
from pandarallel import pandarallel
from itertools import chain
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

In [2]:
#Run notebook in conda env lexnlp
import lexnlp.extract.en.definitions
import lexnlp.extract.en.amounts
import lexnlp.extract.en.regulations



In [26]:
#Notebook-wide pandas display configuration.
#NOTE(review): pandas documents 0 as "auto-detect the terminal size", which is not
#reliable inside a notebook; None is the documented "no limit" value - confirm intent.
for _option, _value in (
    ('display.max_columns', 0),
    ('display.max_rows', 0),
    ('expand_frame_repr', False),
):
    pd.set_option(_option, _value)

In [357]:
#Incident reports: every column is read as a plain object (string) column.
reports = pd.read_csv('SPDB/IncidentReports.csv', encoding="ISO-8859-1", dtype='object')
#Recalls: explicit dtypes; nullable 'Int64' keeps integer IDs intact when values are missing.
#'Distributors_Name' was previously misspelled 'DistributorsSName', so its dtype was never applied.
recalls = pd.read_csv('recalls.csv', 
                      dtype={'RecallID':'Int64', 'RecallNumber': str, 'RecallDate': str,
                             'Description': str, 'URL': str, 'Title': str, 'ConsumerContact': str,
                             'LastPublishDate': str, 'Images': 'object', 'SoldAtLabel': str,
                             'Distributors_CompanyID': 'Int64', 'Distributors_Name': str, 
                             'Hazards_HazardType': str, 'Hazards_HazardTypeID': str, 'Hazards_Name': str,
                             'Importers_CompanyID': 'Int64', 'Importers_Name': str, 'Inconjunctions_URL': str,
                             'Injuries_Name': str, 'ManufacturerCountries_Country': str, 
                             'Manufacturers_CompanyID': 'Int64', 'Manufacturers_Name': str, 'ProductUPCs_UPC': str,
                             'Products_CategoryID': 'Int64', 'Products_Description': str, 'Products_Model': str,
                             'Products_Name': str, 'Products_NumberOfUnits': str, 'Products_Type': str,
                             'Remedies_Name': str, 'RemedyOptions_Option': str, 'Retailers_CompanyID': str,
                             'Retailers_Name': str})

#Munge and infill whatever information we can extract from the recall descriptions

##UPCs loaded as strings; strip spaces, hyphens and periods.
##regex=True is spelled out because pandas >= 2.0 treats the pattern as a literal by default,
##which would silently turn this cleanup into a no-op.
recalls['ProductUPCs_UPC'] = recalls['ProductUPCs_UPC'].str.replace(r'[ .-]', '', regex=True)
recalls = recalls.rename(columns={'ProductUPCs_UPC': 'UPC'})
#Extract unit numbers from string phrases (e.g. "About 35")
num_units = recalls['Products_NumberOfUnits'].str.replace(',', '', regex=False)
num_units = num_units.str.extract(r'(\d+)', expand=False).astype('float')
recalls['Products_NumberOfUnits'] = num_units
#TODO: extract total number of complaints from the string column (no code for this yet)
#Parse dates from strings
recalls['RecallDate'] = pd.to_datetime(recalls['RecallDate'])
recalls['LastPublishDate'] = pd.to_datetime(recalls['LastPublishDate'])
#Break the standardized titles into helpful fields; standardized
#titles take the form "[Company] recalls [product] due to [hazard]"
#The separator is "Recall", "Recalls" or "Recalled", optionally preceded by
#"Announce(s)" / "Reannounce(s)", or the phrase "Due to". Groups are used instead of
#character classes so that e.g. "Recalled" is consumed whole rather than leaving a stray "d".
title_separator = r'(?:(?:Re-?)?[Aa]nnounces?\s+)?Recall(?:s|ed)?|Due to'
titles = recalls['Title'].str.split(title_separator, expand=True)
#Keep exactly three parts: titles that split into more pieces would otherwise add
#integer-named columns to recalls, and fewer pieces would leave a named column missing.
titles = titles.reindex(columns=range(3))
titles = titles.rename(columns={0: 'CompanyShortname', 1: 'ProductsShortname', 2: 'HazardAlt'})
recalls = pd.concat([recalls, titles], axis=1)
#Only recalls issued after 1 January 2000 are kept (the row index keeps its gaps).
recalls = recalls[recalls['RecallDate'] > pd.to_datetime('01/01/2000')]

In [358]:
def clean_list(str_list):
    """Turn one string or a list of strings into a sorted list of unique stemmed tokens.

    Each non-null string is split on single spaces and lower-cased. A token is dropped when
    it is empty, is an English stop word, or does not start with a letter a-z. Remaining
    tokens have every non-letter character removed and are then stemmed.

    Parameters:
        str_list: a string, or a list of strings (null entries are skipped).

    Returns:
        list of unique cleaned tokens, sorted so the result is reproducible between runs.
    """
    if isinstance(str_list, str):
        str_list = [str_list]
    if not str_list:
        return []
    stemmer = SnowballStemmer("english")
    stop_words = set(stopwords.words('english'))
    non_letters = re.compile('[^a-z]')
    cleaned_tokens = set()
    for text in str_list:
        if pd.isnull(text):
            continue
        for token in text.split(' '):
            token = token.lower()
            if not token or token in stop_words:
                continue
            #NOTE(review): only the first character is tested, so tokens such as "(557)" or
            #"(equipment" are discarded entirely - confirm that this is intended.
            if non_letters.match(token):
                continue
            token = non_letters.sub('', token)
            #Re-check after stripping punctuation: "the," only becomes the stop word "the" here.
            if token in stop_words:
                continue
            cleaned_tokens.add(stemmer.stem(token))
    return sorted(cleaned_tokens)

def clean_candidates(df, info_columns, resulting_category):
    """Add a 'clean_<resulting_category>' column holding cleaned tokens for each row.

    Parameters:
        df: DataFrame to read from; it is modified in place and also returned.
        info_columns: one column name, or an iterable of column names, whose text is cleaned.
        resulting_category: suffix for the new column name.

    Returns:
        df with the new column added.
    """
    #A bare string selects a single column; any other iterable is turned into a list so
    #that .loc always receives a plain list of labels.
    columns = info_columns if isinstance(info_columns, str) else list(info_columns)
    row_values = df.loc[:, columns].fillna('').values.tolist()
    #Reuse df's own index: column assignment aligns on labels, so a fresh 0..n-1 index
    #would misplace (or blank out) values on frames that were filtered beforehand.
    raw_info = pd.Series(row_values, index=df.index)
    df['clean_' + resulting_category] = raw_info.apply(clean_list)
    return df

In [359]:
#Clean and extract products & brands
reports = clean_candidates(reports, 'Product Type', 'product')
reports = clean_candidates(reports, ('Brand', 'Manufacturer / Importer / Private Labeler Name'), 'brand')
recalls = clean_candidates(recalls, ('Products_Name', 'ProductsShortname'), 'product')
recalls = clean_candidates(recalls, ('Manufacturers_Name', 'Importers_Name', 'CompanyShortname'), 'brand')

#Extract specifiers
#Preparation: extract alphanumeric strings that are likely to be model numbers, serial numbers, or UPCs.
#Recall notices are very unlikely to have the dedicated fields populated, but tend to mention them in the
#text of the recall announcement.
#Reports tend to have this information in the dedicated fields, but as a precaution we also try to pull it
#from the unstructured text.
#NOTE: extract_probable_specifiers is defined in a later cell; that cell must be run first.

#fillna('') keeps a report's text when only one of the two descriptions is present
#(NaN + ' ' + text is NaN, which previously discarded the text that was there).
reports['specifiers'] = [extract_probable_specifiers(report) for report in 
                         reports['Product Description'].fillna('') + ' ' + reports['Incident Description'].fillna('')]

recalls['specifiers'] = [extract_probable_specifiers(recall) for recall in recalls['Description']]

In [77]:
#Vectorize over a series of strings
# TODO can we recognize ranges of serial numbers ("between NF830 and NF 960")
# TODO can we de-stem serial numbers with x-fillers? (NF687xxxx)
def extract_probable_specifiers(text):
    """Return the distinct substrings of text that look like model/serial/UPC codes.

    A candidate must contain at least two uppercase letters or digits; it may also contain
    lowercase letters and hyphen, period, asterisk or backslash characters (e.g. "SF550").

    Parameters:
        text: the string to scan; a null value is treated as an empty string.

    Returns:
        list of unique matched substrings, in no particular order.
    """
    source = '' if pd.isnull(text) else text
    pattern = r"(([0-9A-Z])+[a-z]*([\\-]?[\\.*]?[0-9A-Z]*)*){2,}"
    found = {hit.group() for hit in re.finditer(pattern, source)}
    return list(found)

In [157]:
#reports = pd.concat([v_reports, b_reports], axis=0, sort=False)
#recalls = pd.concat([v_recalls, b_recalls], axis=0, sort=False)

#Funnel match
#Preparation: extract possible brands from the fields likely to contain them, for both the reports and the recalls.
#Brands may not be referred to by a consistent name across the two datasets.
def prepare_fields(reports, recalls):
    """Add 'candidate_brand' and 'specifiers' columns to both frames.

    Parameters:
        reports: incident reports; reads 'Company Comments', 'Brand',
            'Manufacturer / Importer / Private Labeler Name', 'Product Description'
            and 'Incident Description'.
        recalls: recall notices; reads 'CompanyShortname', 'Manufacturers_Name',
            'Distributors_Name', 'Importers_Name' and 'Description'.

    Returns:
        (reports, recalls) - the same objects, modified in place.
    """
    def _brand_from_comment(parts):
        #parts is the split comment, or a missing value when there was no comment.
        #The company name is the second piece of "(date) Company: text"; comments
        #without a delimiter split into a single piece and yield no brand.
        if hasattr(parts, '__len__') and not isinstance(parts, str) and len(parts) > 1:
            return parts[1]
        return ''

    comment_parts = reports['Company Comments'].str.split('\\) |:')
    brand_from_comments = [_brand_from_comment(parts) for parts in comment_parts]
    reports['candidate_brand'] = list(zip(reports['Brand'], 
                                               reports['Manufacturer / Importer / Private Labeler Name'],
                                               brand_from_comments))
    reports['candidate_brand'] = reports['candidate_brand'].apply(lambda x: list(set(x)))
    recalls['candidate_brand'] = list(zip(recalls['CompanyShortname'], recalls['Manufacturers_Name'],
                                         recalls['Distributors_Name'], recalls['Importers_Name']))
    recalls['candidate_brand'] = recalls['candidate_brand'].apply(lambda x: list(set(x)))

    #Preparation: extract alphanumeric strings that are likely to be model numbers, serial numbers, or UPCs.
    #Recall notices are very unlikely to have the dedicated fields populated, but tend to mention them in the
    #text of the recall announcement.
    #Reports tend to have this information in the dedicated fields, but as a precaution we also try to pull it
    #from the unstructured text.

    #fillna('') keeps the text of whichever description is present; NaN + ' ' + text is NaN.
    report_text = reports['Product Description'].fillna('') + ' ' + reports['Incident Description'].fillna('')
    reports['specifiers'] = [extract_probable_specifiers(report) for report in report_text]

    recalls['specifiers'] = [extract_probable_specifiers(recall) for recall in recalls['Description']]
    
    return reports, recalls

#Phase 1
#For each complaint:
#for each candidate brand: 'CompanyShortname', 'Manufacturers_Name', 'Distributors_Name', 'Importers_Name'
# - Fuzzy match to the possible brands list from each recall notice
# - If any matches score > threshold, save recall as candidate (add a column that contains a list of probable recall IDs)
# - If no matches score > threshold, label complaint as "no recall"
#Phase 2
#For all complaints that have candidate recall numbers:
#For each candidate recall:
# - Fuzzy match to possible products using product type from report & product name from recall
# - Retain match if score > threshold on any
#Finally, take the specifiers extracted from the recall description and look for an exact match in the following order:
# - Model name or number
# - Serial number
# - UPC
# - Specifiers extracted from the product description

# TODO badly need to optimize; convert to map() instead of nested fors; collapse brands together,
# https://github.com/nalepae/pandarallel still too slow
# consider looking for the phrase "not recalled" or "no recall" in any of the text
# add consensus measures
# todo the associated report numbers do exist, in a few cases

def _usable_strings(values):
    """Return the non-empty strings in values; a bare string counts as a one-item list."""
    if isinstance(values, str):
        values = [values]
    elif not hasattr(values, '__iter__'):
        #A scalar missing value (NaN/None) where a list was expected.
        return []
    return [value for value in values if isinstance(value, str) and value]


def _recall_id_from_specifiers(candidate_recalls, report_specs):
    """Return the RecallID of the candidate whose specifiers occur in report_specs, else None.

    A candidate matches when any of its extracted specifiers is a case-insensitive substring
    of any report specifier. If several candidates match, a warning is issued and the
    smallest RecallID is returned.
    """
    report_specs = [spec.lower() for spec in _usable_strings(report_specs)]
    hits = candidate_recalls['specifiers'].apply(
        lambda recall_specs: any(spec.lower() in report_spec
                                 for spec in recall_specs
                                 for report_spec in report_specs)).astype(bool)
    matched_ids = candidate_recalls.loc[hits, 'RecallID']
    if len(matched_ids) > 1:
        warnings.warn('More than one "unique" match found')
        return matched_ids.min()
    if len(matched_ids) == 1:
        return matched_ids.values[0]
    return None


def find_match(report, recalls, threshold=60):
    """Funnel-match one incident report to a recall; return its RecallID, or 0 for no match.

    Phase 1 keeps recalls where any report brand fuzzy-matches any recall brand.
    Phase 2 keeps those whose 'Products_Name' fuzzy-matches the report's 'Product Type'.
    Finally the recall's extracted specifiers are looked up in the report's
    'Model Name or Number', 'Serial Number' and 'UPC' fields and, failing that, in the
    specifiers extracted from the report text.

    Parameters:
        report: one report row (Series or mapping). Brands are read from
            'cleaned_brand_candidates' when that key exists, otherwise from 'candidate_brand'.
        recalls: DataFrame with 'RecallID', 'candidate_brand', 'Products_Name', 'specifiers'.
        threshold: fuzzy scores must be strictly greater than this to count as a match.

    Returns:
        the matching RecallID, or 0 when no recall matches.
    """
    #No visible cell creates 'cleaned_brand_candidates'; prepare_fields creates 'candidate_brand'.
    brand_key = ('cleaned_brand_candidates' if 'cleaned_brand_candidates' in report
                 else 'candidate_brand')
    report_brands = _usable_strings(report[brand_key])

    #Phase 1: brand match. Missing brands are dropped first because the scorer only accepts
    #strings. Iterating the two columns directly avoids building a row object per comparison.
    match_ids = set()
    for recall_id, recall_brands in zip(recalls['RecallID'], recalls['candidate_brand']):
        recall_brands = _usable_strings(recall_brands)
        if any(fuzz.token_set_ratio(report_brand, recall_brand) > threshold
               for report_brand in report_brands
               for recall_brand in recall_brands):
            match_ids.add(recall_id)
    if not match_ids:
        return 0

    candidate_recalls = recalls[recalls['RecallID'].isin(list(match_ids))]

    #Phase 2: product match. Scores are computed row by row so the mask lines up with
    #candidate_recalls; process.extract returns results sorted by score, which would pair
    #the scores with the wrong rows.
    product_type = report['Product Type']
    has_product_type = isinstance(product_type, str) and bool(product_type)
    product_scores = np.array(
        [fuzz.token_set_ratio(product_type, name)
         if has_product_type and isinstance(name, str) and name else 0
         for name in candidate_recalls['Products_Name']],
        dtype=float)
    candidate_recalls = candidate_recalls[product_scores > threshold]
    if candidate_recalls.empty:
        return 0

    #Dedicated identifier fields are tried first, then the specifiers pulled from free text.
    reported_specs = [report[field] for field in ['Model Name or Number', 'Serial Number', 'UPC']]
    definite_match = _recall_id_from_specifiers(candidate_recalls, reported_specs)
    if definite_match is not None:
        return definite_match
    bag_match = _recall_id_from_specifiers(candidate_recalls, report['specifiers'])
    return 0 if bag_match is None else bag_match

In [196]:
#Perform a fuzzy-character match and a word-based match.
#If either the fuzzy string match passes a given threshold
#or the word-based match finds words in common, return True.
def fuzzy_match(reference_string, comparison_strings, threshold=80):
    """Return True when reference_string matches any of comparison_strings.

    Two checks are made: a fuzzy token-set score strictly above threshold against any
    comparison string, or at least one stemmed word shared with the comparison strings.

    Parameters:
        reference_string: the string to look for; null or empty never matches.
        comparison_strings: a string or an iterable of strings; null entries are ignored.
        threshold: minimum (exclusive) fuzzy score for the character-level check.

    Returns:
        True if either check succeeds, otherwise False.
    """
    if pd.isnull(reference_string) or not reference_string:
        return False
    #A bare string would otherwise be iterated character by character, and a scalar
    #missing value cannot be iterated at all.
    if isinstance(comparison_strings, str):
        comparison_strings = [comparison_strings]
    elif comparison_strings is None or isinstance(comparison_strings, float):
        return False
    candidates = [candidate for candidate in comparison_strings if not pd.isnull(candidate)]
    if not candidates:
        return False
    #Fuzzy string match
    scored = process.extract(reference_string, candidates, limit=len(candidates),
                             scorer=fuzz.token_set_ratio)
    if any(score > threshold for _, score in scored):
        return True
    #Whole-word match. split() without an argument never yields empty tokens; with
    #split(' ') two strings that both contain a double space "share" the empty word.
    stemmer = SnowballStemmer("english")
    comp_words = {stemmer.stem(word) for candidate in candidates for word in candidate.split()}
    ref_words = {stemmer.stem(word) for word in reference_string.split()}
    return bool(ref_words & comp_words)

#Provided 
def matches_on_field(reference_string, search_set, comparison_column, threshold = 80):
    """Return the RecallIDs of the rows in search_set whose comparison_column matches.

    Parameters:
        reference_string: the string to look for; null or empty returns [].
        search_set: DataFrame with a 'RecallID' column and the column named by comparison_column.
        comparison_column: name of the column holding the strings to compare against.
        threshold: fuzzy-score threshold passed on to fuzzy_match.

    Returns:
        list of RecallID values, in row order, for which fuzzy_match is truthy.
    """
    if pd.isnull(reference_string) or not reference_string:
        return []
    #threshold is forwarded explicitly; it used to be accepted here but never used.
    return [recall_id
            for recall_id, candidates in zip(search_set['RecallID'], search_set[comparison_column])
            if fuzzy_match(reference_string, candidates, threshold=threshold)]

def find_matches(reports, recalls):
    """Map each distinct report product category to the RecallIDs that match it.

    Reads reports['cleaned_product_category'] and compares every distinct value against
    the 'clean_product_candidates' column of recalls via matches_on_field.

    Returns:
        dict of {product category: list of RecallIDs}.
    """
    matches_by_product = {}
    for product_category in set(reports['cleaned_product_category']):
        matches_by_product[product_category] = matches_on_field(
            product_category, recalls, 'clean_product_candidates')
    return matches_by_product

In [199]:
#batches = 
#Match the first report's product category against the first 100 recalls and save the result.
product_matches = find_matches(reports[0:1], recalls[0:100])
#Each ID list is wrapped in a Series so that categories with different numbers of matches
#fit in one frame (shorter columns are padded with NaN); a dict of unequal-length lists
#makes pd.DataFrame raise. dtype='object' keeps the IDs as integers in the CSV.
pm = pd.DataFrame({category: pd.Series(recall_ids, dtype='object')
                   for category, recall_ids in product_matches.items()})
pm.to_csv('product_category_matches.csv')

In [195]:
#Spot check: match the first report against the recalls whose product name mentions "computer".
rep = reports.iloc[:1]
rec = recalls.loc[recalls['Products_Name'].str.contains('computer', case=False, na=False)]
find_matches(rep, rec)

{'computers equipment and electronic games': [8593,
  8504,
  8253,
  8226,
  7974,
  7960,
  7946,
  7883,
  7772,
  6688,
  6684,
  6649,
  6521,
  6465,
  6370,
  1754,
  1741,
  1718,
  6408,
  1416,
  1147,
  1029,
  1001,
  684,
  324,
  278,
  238,
  195,
  100,
  6036,
  4997,
  4427,
  3689,
  3559,
  3558,
  3467,
  3319,
  3301,
  3256,
  2959,
  2883,
  2759,
  2448,
  2431,
  2419,
  2361,
  2352,
  2177,
  2049,
  2039,
  2018,
  1864,
  128,
  468,
  3027]}

In [71]:
#Preview the first 50 incident reports.
reports.head(50)

Unnamed: 0,Report No.,Report Date,Sent to Manufacturer / Importer / Private Labeler,Publication Date,Category of Submitter,Product Description,Product Category,Product Sub Category,Product Type,Product Code,Manufacturer / Importer / Private Labeler Name,Brand,Model Name or Number,Serial Number,UPC,Date Manufactured,Manufacturer Date Code,Retailer,Retailer State,Purchase Date,Purchase Date Is Estimate,Incident Description,City,State,ZIP,Location,(Primary) Victim Severity,(Primary) Victim's Gender,My Relation To The (Primary) Victim,(Primary) Victim's Age (years),Submitter Has Product,Product Was Damaged Before Incident,Damage Description,Damage Repaired,Product Was Modified Before Incident,Have You Contacted The Manufacturer,If Not Do You Plan To,Answer Explanation,Company Comments,Associated Report Numbers
0,20191204-EBF70-2147376194,12/4/2019,12/30/2019,1/13/2020,Consumer,ERT Bluebird SF550 Mobile computer and charger,Electronics,"Computers, Monitors, Projectors",Computers (Equipment and Electronic Games) (557),557,Bluebird USA Inc.,Bluebird,SF550,SF550A4LAISHGQ821,,,,provided by medical research company Advanced ...,Idaho,11/1/2019,,My moms house caught fire at 2 AM November 28t...,Nampa,Idaho,83686,Home/Apartment/Condominium,"Incident, No Injury",Female,My Parent,78,No,No,,,No,No,,,,
1,20191121-F50B8-2147376424,11/21/2019,12/19/2019,1/13/2020,Consumer,Black mold,Kitchen,Cookware & Tableware,Tableware & Accessories (excluding drinking gl...,474,"Zak Designs, Inc.",,,,,,,,,,,The product is uses to drink out of for kids i...,,Unspecified,,Other,"Incident, No Injury",Unspecified,Self,,No,,,,,Yes,,They really didn't have much to say for me to ...,,
2,20191123-CC04B-2147376395,11/23/2019,12/19/2019,1/13/2020,Consumer,Daniel Tiger's Neighborhood Friend Daniel Tige...,Toys & Children,Toys,"Dolls, Plush Toys, and Action Figures (1394)",1394,"JAKKS Pacific, Inc.",Daniel Tiger's Neighborhood Friend Daniel Tige...,,,,,,Target,,11/20/2015,,My 25-month-old son had a soft plush Daniel Ti...,Harleysville,Pennsylvania,19438,Home/Apartment/Condominium,"Incident, No Injury",Male,My Child,2,Yes,No,,,No,Yes,,I emailed the manufacturer that I found on the...,,
3,20191204-94835-2147376198,12/4/2019,12/27/2019,1/10/2020,Consumer,The beige Malm dresser recalled by IKEA. 4 ful...,"Furniture, Furnishings & Decorations",Furniture,"Desks, Chests, Bureaus or Buffets (604)",604,"IKEA North America Services, LLC",MALM,12882,,,3/1/2009,,IKEA Frisco,Texas,3/4/2009,Yes,Our son had turned 3 a few months prior to the...,Hickory Creek,Texas,75065,Home/Apartment/Condominium,"Incident, No Injury",Male,My Child,3,Yes,No,,,No,No,,We still have the product but have taken it ap...,"(12/31/2019) IKEA North America Services, LLC:...",
4,20191204-0A592-2147376200,12/4/2019,12/27/2019,1/10/2020,Consumer,7.5 ft Uptown LED Pre-Lit Tree with 700 color ...,"Furniture, Furnishings & Decorations",Seasonal Decorations,Artificial Christmas Trees (1701),1701,The Home Depot,Home Accents Holiday,W14N0126,,'3053903870,5/1/2019,05012019_2003186,Home Depot,,11/2/2019,,christmas tree foot switch overheated and melt...,McKinney,Texas,75069,Home/Apartment/Condominium,"Incident, No Injury",Female,My Parent,72,Yes,No,,,No,No,Yes,,(1/2/2020) The Home Depot: Home Depot takes pr...,
5,20191202-C4385-2147376238,12/2/2019,12/27/2019,1/10/2020,Consumer,25 Pack of plug and play (or so it was adverti...,"Furniture, Furnishings & Decorations",Indoor Lighting,Light Bulbs (627),627,Viva Global LLC,Life LED,QT8-120-22W,,,,,EBAY,,2/13/2018,,"On 2/13/18, I purchased a 25 Pack of plug and ...",Felton,California,95018,Other,"Incident, No Injury",Male,Unspecified,,Yes,No,,,No,Yes,,email response - Sorry to hear that but this i...,,
6,20191204-FA85B-2147376205,12/4/2019,12/27/2019,1/10/2020,Consumer,Ravin Crossbow R9,Sports and Recreation,Other,"Archery (Activity, Apparel or Equipment) (1235)",1235,Ravin Crossbows,Ravin Crossbows,R9,,,,2017,Lancaster Archery Supply,Pennsylvania,8/19/2017,,Consumer was using a Ravin R9 crossbow with th...,Chest Springs,Pennsylvania,16624,Home/Apartment/Condominium,"Injury, Emergency Department Treatment Received",Male,"My Client, Patient, Student, Etc. (professiona...",49,Yes,No,,,No,Yes,,,,
7,20191129-DCD12-2147376302,12/4/2019,12/27/2019,1/10/2020,Consumer,Specifications:\r\nModel Number:\tHH1012T\r\nS...,Home Maintenance and Structures,"Heating, Ventilation & Air Conditioning",Portable Electric Heaters (348),348,"eheat, Inc.",ENVI,HH1012T,328553,,9/3/2015,QC/2015,EHEAT,,12/17/2015,,Model Number:\tHH1012T\r\nSerial Number: 3285...,Chester,Massachusetts,01011,Home/Apartment/Condominium,"Incident, No Injury",Unknown,Self,,Yes,No,,,No,Yes,,THIS PRODUCT WAS STILL IN USE AS OF THE DATE ...,,
8,20191202-183F6-2147376256,12/2/2019,12/27/2019,1/10/2020,Consumer,Crock Pot\r\nHas New England Patriots logo on it,Kitchen,Appliances,Slow Cookers (268),268,"Sunbeam Products, Inc. d/b/a Jarden Consumer ...",Crock-Pot,SCCPNFL600,don't know,'48894 05733,,don't know,"don't know, was raffle prize",,1/1/2018,Yes,"As our crock pot was cooling off, I heard a lo...",Waltham,Massachusetts,02451,Home/Apartment/Condominium,"Incident, No Injury",Unknown,Self,,Yes,No,,,No,Yes,,"We're hoping for a new lid, so we haven't toss...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [30]:
#Preview the first rows of the recalls table.
recalls.head()

Unnamed: 0,RecallID,RecallNumber,RecallDate,Description,URL,Title,ConsumerContact,LastPublishDate,Images,SoldAtLabel,Distributors_CompanyID,Distributors_Name,Hazards_HazardType,Hazards_HazardTypeID,Hazards_Name,Importers_CompanyID,Importers_Name,Inconjunctions_URL,Injuries_Name,ManufacturerCountries_Country,Manufacturers_CompanyID,Manufacturers_Name,ProductUPCs_UPC,Products_CategoryID,Products_Description,Products_Model,Products_Name,Products_NumberOfUnits,Products_Type,Remedies_Name,RemedyOptions_Option,Retailers_CompanyID,Retailers_Name
0,8726,20056,2020-01-14T00:00:00,This recall involves Thompson’s WaterSeal wate...,https://www.cpsc.gov/Recalls/2020/The-Thompson...,The Thompson’s Company Recalls Aerosol Waterpr...,The Thompson’s Company toll-free at 888-304-37...,2020-01-14T00:00:00,[{'URL': 'https://www.cpsc.gov/s3fs-public/Cap...,,,,,,The contents of the cans can react with the pa...,,,,The Thompson’s Company received approximately ...,United States,,"The Thompson’s Company, of Cleveland, Ohio",,,,,Thompson’s® WaterSeal® Waterproofing Wood Prot...,"About 852,000",,Consumers should immediately stop using Thomps...,Refund,,"Lowe’s Home Improvement, Walmart, Menards, Hom..."
1,8723,20050,2020-01-09T00:00:00,This recall involves Holiday Travel mugs with ...,https://www.cpsc.gov/Recalls/2020/Boston-Wareh...,Boston Warehouse Trading Corp. Recalls Holiday...,Boston Warehouse Trading Corp. toll-free at 88...,2020-01-09T00:00:00,[{'URL': 'https://www.cpsc.gov/s3fs-public/1_4...,,,,,,The mugs are mislabeled as microwave safe. If ...,,"Meijer Distribution Inc., of Grand Rapids, Mich.",,The firm has received one report of sparks whe...,China,,"Boston Warehouse Trading Corp., of Norwood, Mass.",,,,,Holiday Travel Mugs,"About 2,400",,Consumers should immediately stop using the re...,Refund,,Exclusively at Meijer stores nationwide from O...
2,8724,20051,2020-01-09T00:00:00,This recall involves the Libbey Glass 33.5 oz....,https://www.cpsc.gov/Recalls/2020/Libbey-Glass...,Libbey Glass Recalls Milk Bottles Due to Lacer...,Libbey Glass at 800-982-7063 between 8 a.m. an...,2020-01-09T00:00:00,[{'URL': 'https://www.cpsc.gov/s3fs-public/1_4...,,,,,,"The bottles can break unexpectedly during use,...",,"Libbey Glass Inc., of Toledo, Ohio",,None Reported,China,,,,,,,33.5 oz. Milk Bottles,"About 44,300",,Foodservice establishments and customers shoul...,Refund,,Libby sold the recalled bottles to various foo...
3,8725,20712,2020-01-09T00:00:00,This recall involves Little Bambino 4 in 1 can...,https://www.cpsc.gov/Recalls/2020/Thesaurus-Gl...,Thesaurus Global Marketing Recalls Tricycles D...,Little Bambino toll-free at 866-633-8202 from ...,2020-01-09T00:00:00,[{'URL': 'https://www.cpsc.gov/s3fs-public/1_4...,,,,,,Paint on the canopy’s frame contains levels of...,,"Thesaurus Global Marketing Inc., of Doral, FL",,None reported,China,,,,,,,Little Bambino 4 in 1 canopy children’s tricycles,About 370,,Consumers should immediately stop using the re...,Refund,,Amazon.com from October 2018 through June 2019...
4,8721,20046,2019-12-20T00:00:00,This recall involves Buderus brand GB125-35 oi...,https://www.cpsc.gov/Recalls/2020/Bosch-Thermo...,Bosch Thermotechnology Recalls Buderus Boilers...,Bosch Thermotechnology at 800-323-1943 from 8 ...,2019-12-20T00:00:00,[{'URL': 'https://www.cpsc.gov/s3fs-public/Scr...,,,"Bosch Thermotechnology Corp., of Watertown, Mass.",,,"The siphon can become blocked, leading to a de...",,"Bosch Thermotechnology Corp., of Watertown, Mass.",,None reported in the U.S.,Germany,,"Bosch Thermotechnik GmbH, of Germany",,,,,Buderus GB125-35 oil-condensing boilers,About 170,,Consumers should immediately contact Bosch for...,Repair,,Wholesale distributors and installed by indepe...


In [497]:
def get_matched_brands(brand, reports, recalls, regex=True):
    """Return the reports and recalls that mention brand, ignoring case.

    Parameters:
        brand: text to search for.
        reports: incident reports; searched in 'Manufacturer / Importer / Private Labeler Name',
            'Brand' and 'Incident Description'.
        recalls: recall notices; searched in 'CompanyShortname', 'Manufacturers_Name',
            'Importers_Name', 'Distributors_Name' and 'Retailers_Name'.
        regex: when True (the default) brand is interpreted as a regular expression;
            pass False for brand names containing characters such as "(" or "+".

    Returns:
        (brand_reports, brand_recalls) - the matching rows of each frame.
    """
    def _mentions_brand(frame, columns):
        #na=False makes missing cells count as "no match"; without it the mask contains
        #NaN and cannot be relied on for boolean indexing.
        mask = pd.Series(False, index=frame.index)
        for column in columns:
            mask = mask | frame[column].str.contains(brand, case=False, na=False, regex=regex)
        return mask

    brand_reports = reports[_mentions_brand(reports, ['Manufacturer / Importer / Private Labeler Name',
                                                      'Brand',
                                                      'Incident Description'])]
    brand_recalls = recalls[_mentions_brand(recalls, ['CompanyShortname',
                                                      'Manufacturers_Name',
                                                      'Importers_Name',
                                                      'Distributors_Name',
                                                      'Retailers_Name'])]
    return (brand_reports, brand_recalls)


In [574]:
#Add the 'candidate_brand' and 'specifiers' columns to both frames.
reports, recalls = prepare_fields(reports, recalls)

In [576]:
#pandarallel only adds DataFrame.parallel_apply once initialize() has been called;
#no other visible cell calls it, so a fresh kernel would fail on the next line.
pandarallel.initialize()
#Label every report with its matching RecallID (0 means no recall was found).
reports['labels'] = reports.parallel_apply(lambda row: find_match(row, recalls, threshold=80), axis=1)

KeyboardInterrupt: 