In [137]:
import pandas as pd
import numpy as np
import nltk, datetime, re
from nltk.corpus import stopwords

In [125]:
#Run notebook in conda env lexnlp
import lexnlp.extract.en.definitions
import lexnlp.extract.en.amounts
import lexnlp.extract.en.regulations



In [3]:
# Allow up to 999 columns in DataFrame displays so wide frames are not elided.
pd.options.display.max_columns = 999

In [234]:
# Explicit dtypes for recalls.csv: ID columns use pandas' nullable 'Int64' dtype,
# 'Images' is read as 'object', and every other listed column is read as text.
# NOTE(review): 'DistributorsSName' breaks the '<Entity>_Name' pattern of its
# siblings -- possibly a typo for 'Distributors_Name'; verify against the CSV header.
# NOTE(review): 'Retailers_CompanyID' is read as text while the other *_CompanyID
# columns are 'Int64' -- confirm this is intentional.
nullable_int_columns = [
    'RecallID', 'Distributors_CompanyID', 'Importers_CompanyID',
    'Manufacturers_CompanyID', 'Products_CategoryID',
]
text_columns = [
    'RecallNumber', 'RecallDate', 'Description', 'URL', 'Title', 'ConsumerContact',
    'LastPublishDate', 'SoldAtLabel', 'DistributorsSName',
    'Hazards_HazardType', 'Hazards_HazardTypeID', 'Hazards_Name',
    'Importers_Name', 'Inconjunctions_URL', 'Injuries_Name',
    'ManufacturerCountries_Country', 'Manufacturers_Name', 'ProductUPCs_UPC',
    'Products_Description', 'Products_Model', 'Products_Name',
    'Products_NumberOfUnits', 'Products_Type',
    'Remedies_Name', 'RemedyOptions_Option', 'Retailers_CompanyID', 'Retailers_Name',
]
recall_dtypes = {column: 'Int64' for column in nullable_int_columns}
recall_dtypes.update({column: str for column in text_columns})
recall_dtypes['Images'] = 'object'

recalls = pd.read_csv('recalls.csv', dtype=recall_dtypes)

In [239]:
#This works reasonably well: it seems to produce few false positives, but it will also
#miss codes embedded in unusual sentences like "the UPC code for this product is XXXXX"
def identify_probable_upc(text):
    """Find token runs in ``text`` that look like a mention of a UPC.

    The text is tokenized and POS-tagged with NLTK, then chunked with a regular
    expression grammar that matches: one or more nouns (NN/NNP), optional ':'
    tokens, optional verbs (VBD/VBZ), one or more cardinal numbers (CD), and
    optionally a conjunction followed by one more cardinal number.

    Args:
        text: Free-text description. Non-string values (e.g. ``None`` or the
            NaN that pandas uses for a missing description) are treated as
            empty text.

    Returns:
        A list of matched chunks; each chunk is a list of ``(token, pos_tag)``
        tuples. Empty when nothing matches or ``text`` is not a string.
    """
    # Missing descriptions arrive as NaN/None and cannot be tokenized. Checking
    # the type explicitly replaces a bare ``except:`` that hid every other error.
    if not isinstance(text, str):
        return []

    # Punctuation and the possessive "s" left over from tokenization are dropped
    # before tagging.
    ignored_tokens = {"’", ',', '.', '”', '“', 's'}
    tokens = [token for token in nltk.word_tokenize(text) if token not in ignored_tokens]
    if not tokens:
        # Nothing to tag or chunk.
        return []

    tagged = nltk.pos_tag(tokens)
    grammar = "UPC: {<NN|NNP>+<:>*<VBD|VBZ>*<CD>+<CC>?<CD>?}"
    parser = nltk.RegexpParser(grammar)
    return [tree.leaves() for tree in parser.parse(tagged).subtrees()
            if tree.label() == 'UPC']

def validate_upc(candidates, valid_lengths=(8, 12, 13)):
    """Keep the cardinal-number tokens that are plausible UPC codes.

    Args:
        candidates: Iterable of chunks as returned by ``identify_probable_upc``;
            each chunk is a sequence of ``(token, pos_tag)`` pairs.
        valid_lengths: Accepted code lengths after hyphens are removed.
            Defaults to 8, 12 and 13 digits.

    Returns:
        A flat list of code strings with hyphens stripped, in encounter order.
        A token is kept only if it is tagged ``'CD'``, has one of the accepted
        lengths once hyphens are removed, and consists solely of ASCII digits.
        The digit check also rejects tokens such as ``'1,234,567,890'`` or
        ``'$1234567'``, which a letters/dot/slash blacklist would let through.
    """
    digits_only = re.compile(r'[0-9]+')
    upcs = []
    for chunk in candidates:
        for token, tag in chunk:
            if tag != 'CD':
                continue
            # Hyphens are the only separator tolerated inside a code.
            code = token.replace('-', '')
            if len(code) in valid_lengths and digits_only.fullmatch(code):
                upcs.append(code)
    return upcs

In [257]:
#Munge and infill whatever information we can extract from the recall descriptions

#UPCs are loaded as strings; strip spaces, hyphens and periods.
#regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a
#literal string by default.
provided_upcs = recalls['ProductUPCs_UPC'].str.replace(r'[ .-]', '', regex=True)
#Extract UPCs from each description and merge them with the provided UPC (if any)
inferred_upcs = [validate_upc(identify_probable_upc(d)) for d in recalls['Description']]
merged_upcs = []
for inferred, provided in zip(inferred_upcs, provided_upcs):
    row_upcs = list(inferred)
    #Skip missing (NaN) or empty provided values and codes already found in the text
    if pd.notna(provided) and provided and provided not in row_upcs:
        row_upcs.append(provided)
    merged_upcs.append(row_upcs)
recalls['ProductUPCs_UPC'] = merged_upcs
#rename() returns a new frame and needs columns= to target column labels, so the
#result is assigned back. NOTE(review): later cells must refer to this column as 'UPC'.
recalls = recalls.rename(columns={'ProductUPCs_UPC': 'UPC'})
#Extract unit numbers from string phrases (e.g. "About 35"). Only the first run of
#digits is kept, so a phrase like "1.5 million" would yield 1.
num_units = recalls['Products_NumberOfUnits'].str.replace(',', '', regex=False)
num_units = num_units.str.extract(r'(\d+)', expand=False).astype('float')
recalls['Products_NumberOfUnits'] = num_units
#Parse dates from strings
recalls['RecallDate'] = pd.to_datetime(recalls['RecallDate'])
recalls['LastPublishDate'] = pd.to_datetime(recalls['LastPublishDate'])
#Break the standardized titles into helpful fields; standardized
#titles take the form "[Company] Recalls [product] Due to [hazard]".
#The split pattern is case-sensitive and the pieces keep their surrounding spaces.
titles = recalls['Title'].str.split('Recall[s]|Due to', expand=True)
titles = titles.rename(columns={0: 'CompanyShortname', 1: 'ProductsShortname', 2: 'HazardAlt'})
recalls = pd.concat([recalls, titles], axis=1)

In [None]:
#ItemNumber: {<NNP><:|\#>*<JJ>}
#(The line above is presumably a draft chunk grammar for item numbers -- not used yet.)
#Spot-check the UPC extraction on a random sample of descriptions. The seed is fixed
#so the same rows are drawn on every run, and the sample size is capped at the number
#of available rows so a small input does not raise.
descriptions = recalls['Description']
samples = descriptions.sample(n=min(100, len(descriptions)), random_state=0)
candidates = [identify_probable_upc(s) for s in samples]
second_pass = [validate_upc(c) for c in candidates]

In [58]:
# Keep only the incident reports whose 'Model Name or Number' field is filled in.
has_model = reports.loc[reports['Model Name or Number'].notna()]
# Only the last expression of a cell is displayed, so this count is not shown.
len(has_model)
has_model.head(5)

Unnamed: 0,Report No.,Report Date,Sent to Manufacturer / Importer / Private Labeler,Publication Date,Category of Submitter,Product Description,Product Category,Product Sub Category,Product Type,Product Code,Manufacturer / Importer / Private Labeler Name,Brand,Model Name or Number,Serial Number,UPC,Date Manufactured,Manufacturer Date Code,Retailer,Retailer State,Purchase Date,Purchase Date Is Estimate,Incident Description,City,State,ZIP,Location,(Primary) Victim Severity,(Primary) Victim's Gender,My Relation To The (Primary) Victim,(Primary) Victim's Age (years),Submitter Has Product,Product Was Damaged Before Incident,Damage Description,Damage Repaired,Product Was Modified Before Incident,Have You Contacted The Manufacturer,If Not Do You Plan To,Answer Explanation,Company Comments,Associated Report Numbers
0,20191204-EBF70-2147376194,12/4/2019,12/30/2019,1/13/2020,Consumer,ERT Bluebird SF550 Mobile computer and charger,Electronics,"Computers, Monitors, Projectors",Computers (Equipment and Electronic Games) (557),557,Bluebird USA Inc.,Bluebird,SF550,SF550A4LAISHGQ821,,,,provided by medical research company Advanced ...,Idaho,11/1/2019,,My moms house caught fire at 2 AM November 28t...,Nampa,Idaho,83686,Home/Apartment/Condominium,"Incident, No Injury",Female,My Parent,78.0,No,No,,,No,No,,,,
3,20191204-94835-2147376198,12/4/2019,12/27/2019,1/10/2020,Consumer,The beige Malm dresser recalled by IKEA. 4 ful...,"Furniture, Furnishings & Decorations",Furniture,"Desks, Chests, Bureaus or Buffets (604)",604,"IKEA North America Services, LLC",MALM,12882,,,3/1/2009,,IKEA Frisco,Texas,3/4/2009,Yes,Our son had turned 3 a few months prior to the...,Hickory Creek,Texas,75065,Home/Apartment/Condominium,"Incident, No Injury",Male,My Child,3.0,Yes,No,,,No,No,,We still have the product but have taken it ap...,"(12/31/2019) IKEA North America Services, LLC:...",
4,20191204-0A592-2147376200,12/4/2019,12/27/2019,1/10/2020,Consumer,7.5 ft Uptown LED Pre-Lit Tree with 700 color ...,"Furniture, Furnishings & Decorations",Seasonal Decorations,Artificial Christmas Trees (1701),1701,The Home Depot,Home Accents Holiday,W14N0126,,'3053903870,5/1/2019,05012019_2003186,Home Depot,,11/2/2019,,christmas tree foot switch overheated and melt...,McKinney,Texas,75069,Home/Apartment/Condominium,"Incident, No Injury",Female,My Parent,72.0,Yes,No,,,No,No,Yes,,(1/2/2020) The Home Depot: Home Depot takes pr...,
5,20191202-C4385-2147376238,12/2/2019,12/27/2019,1/10/2020,Consumer,25 Pack of plug and play (or so it was adverti...,"Furniture, Furnishings & Decorations",Indoor Lighting,Light Bulbs (627),627,Viva Global LLC,Life LED,QT8-120-22W,,,,,EBAY,,2/13/2018,,"On 2/13/18, I purchased a 25 Pack of plug and ...",Felton,California,95018,Other,"Incident, No Injury",Male,Unspecified,,Yes,No,,,No,Yes,,email response - Sorry to hear that but this i...,,
6,20191204-FA85B-2147376205,12/4/2019,12/27/2019,1/10/2020,Consumer,Ravin Crossbow R9,Sports and Recreation,Other,"Archery (Activity, Apparel or Equipment) (1235)",1235,Ravin Crossbows,Ravin Crossbows,R9,,,,2017,Lancaster Archery Supply,Pennsylvania,8/19/2017,,Consumer was using a Ravin R9 crossbow with th...,Chest Springs,Pennsylvania,16624,Home/Apartment/Condominium,"Injury, Emergency Department Treatment Received",Male,"My Client, Patient, Student, Etc. (professiona...",49.0,Yes,No,,,No,Yes,,,,


In [260]:
# Load the incident reports without dtype inference (every column is read as
# 'object'); the file is decoded as ISO-8859-1.
reports = pd.read_csv(
    'SPDB/IncidentReports.csv',
    dtype='object',
    encoding="ISO-8859-1",
)

In [117]:
# Show the column labels of the incident-report table.
reports.columns

Index(['Report No.', 'Report Date',
       'Sent to Manufacturer / Importer / Private Labeler', 'Publication Date',
       'Category of Submitter', 'Product Description', 'Product Category',
       'Product Sub Category', 'Product Type', 'Product Code',
       'Manufacturer / Importer / Private Labeler Name', 'Brand',
       'Model Name or Number', 'Serial Number', 'UPC', 'Date Manufactured',
       'Manufacturer Date Code', 'Retailer', 'Retailer State', 'Purchase Date',
       'Purchase Date Is Estimate', 'Incident Description', 'City', 'State',
       'ZIP', 'Location', '(Primary) Victim Severity',
       '(Primary) Victim's Gender', 'My Relation To The (Primary) Victim',
       '(Primary) Victim's Age (years)', 'Submitter Has Product',
       'Product Was Damaged Before Incident', 'Damage Description',
       'Damage Repaired', 'Product Was Modified Before Incident',
       'Have You Contacted The Manufacturer', 'If Not Do You Plan To',
       'Answer Explanation', 'Company Comments

In [261]:
# Number of rows in the incident-report table.
reports.shape[0]

41542

In [265]:
# Count the recalls issued after the earliest incident report.
# Series.min() replaces the builtin min(): it is vectorised and skips missing or
# unparseable dates (NaT), whereas builtin min() can return NaT depending on where
# it sits in the column, which would make every comparison below False.
earliest_report = pd.to_datetime(reports['Report Date']).min()
int((pd.to_datetime(recalls['RecallDate']) > earliest_report).sum())

2535

In [270]:
# Earliest incident-report date. Series.min() is vectorised and ignores NaT,
# unlike the builtin min(), whose result depends on where a NaT sits in the column.
pd.to_datetime(reports['Report Date']).min()

Timestamp('2011-03-11 00:00:00')

In [271]:
# Show the last five incident reports.
reports.tail(5)

Unnamed: 0,Report No.,Report Date,Sent to Manufacturer / Importer / Private Labeler,Publication Date,Category of Submitter,Product Description,Product Category,Product Sub Category,Product Type,Product Code,Manufacturer / Importer / Private Labeler Name,Brand,Model Name or Number,Serial Number,UPC,Date Manufactured,Manufacturer Date Code,Retailer,Retailer State,Purchase Date,Purchase Date Is Estimate,Incident Description,City,State,ZIP,Location,(Primary) Victim Severity,(Primary) Victim's Gender,My Relation To The (Primary) Victim,(Primary) Victim's Age (years),Submitter Has Product,Product Was Damaged Before Incident,Damage Description,Damage Repaired,Product Was Modified Before Incident,Have You Contacted The Manufacturer,If Not Do You Plan To,Answer Explanation,Company Comments,Associated Report Numbers
41537,20110311-DBB63-2147481650,3/11/2011,,4/1/2011,Consumer,"Pampers Swaddlers New Baby with Dry Max, Size 1-2",Baby,Nursery Equipment & Supplies,Diapers (1512),1512,The Procter & Gamble Company,Pampers,"Swaddlers New Baby with Dry Max, Size 1-2",0363U017630008,,,,Sam's Club,Indiana,3/1/2011,,"Since he was born two months ago, we have been...",Lebanon,Indiana,46052.0,Home/Apartment/Condominium,"Injury, First Aid Received by Non-Medical Prof...",Male,My Child,0.0,No,No,,,No,Yes,,I will be writing a letter to the company foll...,(4/1/2011) The Procter & Gamble Company: Thank...,
41538,20110311-E518D-1170349,3/11/2011,,4/1/2011,Consumer,Airzone Pop stick. Bright/ lime green with bla...,Toys & Children,Toys,Pogo Sticks (1310),1310,BRAVO SPORTS,Airzone,Pop stick,unknown,,9/24/2010,054CH,Walmart Stores Inc.,Arkansas,2/24/2011,Yes,My nine year old daughter was using her Airzon...,Santa Maria,California,93455.0,Home/Apartment/Condominium,"Injury, First Aid Received by Non-Medical Prof...",Female,My Child,9.0,Yes,No,,No,No,No,No,,(3/31/2011) BRAVO SPORTS: Thank you for transm...,
41539,20110313-2E80A-2147481520,3/13/2011,,4/1/2011,Consumer,Beige colored dog toy which has a bone shaped ...,Toys & Children,Toys,"Dolls, Plush Toys, and Action Figures (1394)",1394,Target,Target,087011014,F16805009,,5/1/2009,05/09,Gift from Target,California,,,"The cord which attaches the ""remote"" to the wa...",Moraga,California,94556.0,Home/Apartment/Condominium,"Incident, No Injury",Female,My Child,4.0,Yes,No,,,No,No,No,,,
41540,20110313-26730-2147481529,3/13/2011,,4/1/2011,Consumer,Toro CCR 3650 Snowblower,Yard & Garden,Gardening & Landscaping,"Snow Throwers, Blowers (1406)",1406,The Toro Company,Toro,38518,220008051,,,Not Known,Home Depot,Ohio,2/1/2003,Yes,"A few weeks ago, we started experiencing a ver...",,United States,,Home/Apartment/Condominium,"No Incident, No Injury",Unspecified,,,Yes,No,,,No,No,Yes,I plan on contacting a Toro authorized service...,,
41541,20110311-6B7FC-2147481644,3/11/2011,,4/1/2011,Consumer,Harman Oakwood Freestanding wood stove. Manufa...,Home Maintenance and Structures,"Heating, Ventilation & Air Conditioning",Coal or Wood-burning Stoves (367),367,HARMAN STOVE COMPANY,Harman,Oakwood,,,2/1/2010,,Wrights Chimney and Stove,North Carolina,6/1/2010,Yes,I own a Harman Oakwood freestanding wood burni...,,United States,,Home/Apartment/Condominium,"No Incident, No Injury",Unspecified,,,Yes,No,,,No,No,No,The stove weighs 500 pounds. It isn't going an...,,
