In [5]:
import pandas as pd
import numpy as np
import datetime, re, warnings, string, os, spacy
from fuzzywuzzy import fuzz, process
from pandarallel import pandarallel
from itertools import chain
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

In [10]:
#Notebook-wide pandas display configuration, applied one option at a time.
#NOTE(review): 0 is presumably meant as "no limit"; the pandas docs describe
#None as the unlimited value -- confirm the intended setting.
for option_name, option_value in (
    ('display.max_columns', 0),
    ('display.max_rows', 0),
    ('expand_frame_repr', False),
):
    pd.set_option(option_name, option_value)

In [6]:
#Incident reports: every column is read as a plain object (string) column.
reports = pd.read_csv('SPDB/IncidentReports.csv', encoding="ISO-8859-1", dtype='object')
#Recalls: identifiers use the nullable 'Int64' dtype, everything else is text.
#The distributor name key is spelled 'Distributors_Name' to agree with the
#column name used when cleaning brands further down; the previous spelling
#('DistributorsSName') named no column, so the dtype was never applied.
#NOTE(review): 'Retailers_CompanyID' is read as str while the other
#*_CompanyID columns are 'Int64' -- confirm that this is intentional.
recalls = pd.read_csv('recalls.csv', 
                      dtype={'RecallID':'Int64', 'RecallNumber': str, 'RecallDate': str,
                             'Description': str, 'URL': str, 'Title': str, 'ConsumerContact': str,
                             'LastPublishDate': str, 'Images': 'object', 'SoldAtLabel': str,
                             'Distributors_CompanyID': 'Int64', 'Distributors_Name': str, 
                             'Hazards_HazardType': str, 'Hazards_HazardTypeID': str, 'Hazards_Name': str,
                             'Importers_CompanyID': 'Int64', 'Importers_Name': str, 'Inconjunctions_URL': str,
                             'Injuries_Name': str, 'ManufacturerCountries_Country': str, 
                             'Manufacturers_CompanyID': 'Int64', 'Manufacturers_Name': str, 'ProductUPCs_UPC': str,
                             'Products_CategoryID': 'Int64', 'Products_Description': str, 'Products_Model': str,
                             'Products_Name': str, 'Products_NumberOfUnits': str, 'Products_Type': str,
                             'Remedies_Name': str, 'RemedyOptions_Option': str, 'Retailers_CompanyID': str,
                             'Retailers_Name': str})

#Munge and infill whatever information we can extract from the recall descriptions

##UPCs loaded as strings; strip spaces, hyphens and periods.
##regex=True is spelled out because Series.str.replace treats the pattern as a
##literal string by default on pandas >= 2.0, which would turn this into a no-op.
recalls['ProductUPCs_UPC'] = recalls['ProductUPCs_UPC'].str.replace(r'[ \-.]', '', regex=True)
recalls = recalls.rename(columns={'ProductUPCs_UPC': 'UPC'})
#Extract unit numbers from string phrases (e.g. "About 35"): drop thousands
#separators, then keep the first run of digits as a float (NaN when absent).
num_units = recalls['Products_NumberOfUnits'].str.replace(',', '', regex=False)
num_units = num_units.str.extract(r'(\d+)', expand=False).astype('float')
recalls['Products_NumberOfUnits'] = num_units
#TODO: extracting the total number of complaints is not implemented here.
#Parse dates from strings
recalls['RecallDate'] = pd.to_datetime(recalls['RecallDate'])
recalls['LastPublishDate'] = pd.to_datetime(recalls['LastPublishDate'])
#Break the standardized titles into helpful fields; standardized
#titles take the form "[Company] recalls [product] due to [hazard]"
#NOTE(review): the bracketed groups in this pattern are character classes, so
#each one matches at most a single character (not the words "Re"/"Announces"),
#and 'Due to' is case-sensitive -- confirm this splits real titles as intended.
#Titles that split into more than three pieces add extra integer-named columns.
titles = recalls['Title'].str.split('[Re]?[A]?[a]?[nnounce]?[s]?Recall[s]?[ed]?|Due to', expand=True)
titles = titles.rename(columns={0: 'CompanyShortname', 1: 'ProductsShortname', 2: 'HazardAlt'})
recalls = pd.concat([recalls, titles], axis=1)
#Keep only recalls dated after 1 January 2000. This leaves gaps in the index.
recalls = recalls[recalls['RecallDate'] > pd.to_datetime('01/01/2000')]

In [5]:
#Extract any sequence of characters likely to be a specifier such as
#a model number or serial number from a string.
def extract_probable_specifiers(text):
    """Return the unique, lower-cased specifier-like substrings of ``text``.

    Parameters
    ----------
    text : str or missing value
        Free text to scan. Anything ``pd.isnull`` reports as missing yields
        an empty result.

    Returns
    -------
    list of str
        Lower-cased matches, de-duplicated after lower-casing and sorted so
        the result is deterministic.
    """
    if pd.isnull(text):
        return []
    #The pattern requires at least two repetitions of a group that starts with
    #an upper-case letter or digit, so every match holds two or more of them.
    pattern = r"(([0-9A-Z])+[a-z]*([\\-]?[\\.*]?[0-9A-Z]*)*){2,}"
    #Lower-case before de-duplicating so "AB12" and "Ab12" collapse to one entry.
    unique_matches = {match.group().lower() for match in re.finditer(pattern, text)}
    return sorted(unique_matches)

#Given a list of strings, tokenize each string on spaces, strip
#non-alphabetic characters, remove the provided excluded words,
#and stem. Return the unique cleaned tokens as one sorted list.
def clean_list(str_list, excluded_words):
    """Tokenize, filter and stem a string or a list of strings.

    Parameters
    ----------
    str_list : str or list of str
        Text to clean. A bare string is treated as a one-element list;
        missing entries (per ``pd.isnull``) are skipped.
    excluded_words : iterable of str
        Lower-case words to drop. A token is dropped when either its
        unstemmed form or its stem is in this collection.

    Returns
    -------
    list of str
        Unique stemmed tokens in sorted order; ``[]`` for empty input.
    """
    if isinstance(str_list, str):
        str_list = [str_list]
    if not str_list:
        return []
    #Set membership is O(1); the caller passes a list of several hundred words.
    excluded = set(excluded_words)
    stemmer = SnowballStemmer("english")
    non_alpha = re.compile('[^a-z]')
    raw_tokens = chain.from_iterable(
        text.split(' ') for text in str_list if not pd.isnull(text))
    cleaned_tokens = set()
    for raw_token in raw_tokens:
        token = raw_token.lower()
        #Tokens whose first character is not a-z are discarded outright.
        #NOTE(review): this also drops words with leading punctuation,
        #e.g. '(gas' -- confirm that is intended.
        if non_alpha.match(token):
            continue
        token = non_alpha.sub('', token)
        #Test the unstemmed word first: the exclusion list holds unstemmed words,
        #so checking only the stem lets through any word whose stem differs.
        if not token or token in excluded:
            continue
        stemmed = stemmer.stem(token)
        if stemmed and stemmed not in excluded:
            cleaned_tokens.add(stemmed)
    return sorted(cleaned_tokens)

#Construct a list of tokens to exclude from the cleaned strings
#and tokenize the provided column(s).
def clean_candidates(df, info_columns, resulting_category):
    """Add a ``'clean_<resulting_category>'`` column of cleaned tokens to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text columns. It is modified in place and returned.
    info_columns : str or sequence of str
        Column name(s) whose text is pooled per row before cleaning.
    resulting_category : str
        Suffix of the new column name.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new column, each cell holding the
        token list produced by ``clean_list`` for that row.
    """
    states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
              "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
              "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
              "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
              "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY", "USA",
              "Alabama","Alaska","Arizona","Arkansas","California","Colorado",
              "Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
              "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
              "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
              "Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York",
              "North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
              "Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
              "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]
    #NOTE(review): multi-word names such as "new york" can never equal a single
    #space-delimited token, so they currently exclude nothing.
    states = [state.lower() for state in states]
    companies = ['inc', 'llc', 'corp', 'corporation', 'co', 'ltd', 'company', 'international', 'consolidated',
                 'incorporated', ]
    excluded_words = frozenset(states + companies + stopwords.words('english'))
    #A bare column name would select a Series; wrap it so rows are always lists.
    columns = [info_columns] if isinstance(info_columns, str) else list(info_columns)
    #Carry df's own index: with a fresh 0..n-1 index the assignment below aligns
    #on labels and scrambles (or blanks) rows of any frame that was filtered.
    raw_info = pd.Series(df.loc[:, columns].fillna('').values.tolist(), index=df.index)
    cleaned_tokens = raw_info.apply(clean_list, excluded_words=excluded_words)
    df['clean_' + resulting_category] = cleaned_tokens
    return df

In [6]:
#Clean and extract products & brands
reports = clean_candidates(reports, ('Product Type', 'Product Description'), 'product')
#The trailing comma makes this a one-element tuple; without it the parentheses
#are just grouping and a bare string is passed.
reports = clean_candidates(reports, ('Manufacturer / Importer / Private Labeler Name',), 'brand')
#Order-independent key for a brand: its cleaned tokens, sorted and '_'-joined.
reports['brand_key'] = reports['clean_brand'].apply(lambda x: '_'.join(sorted(x)))
recalls = clean_candidates(recalls, ('Products_Name', 'ProductsShortname'), 'product')
recalls = clean_candidates(recalls, ('Manufacturers_Name', 'Importers_Name', 'Distributors_Name', 'CompanyShortname'), 'brand')

#Extract specifiers
#Fill missing text before concatenating: NaN + str is NaN, so a report missing
#either description would otherwise lose the specifiers found in the other one.
report_text = reports['Product Description'].fillna('') + ' ' + reports['Incident Description'].fillna('')
reports['specifiers'] = [extract_probable_specifiers(text) for text in report_text]

recalls['specifiers'] = [extract_probable_specifiers(recall) for recall in recalls['Description']]

In [3]:
#Run the 'en_core_web_sm' spaCy model over one sample recall description and
#display the entities (doc.ents) it reports.
sample_description = (
    "This recall involves Sears Kenmore stainless steel slide-in ranges "
    "with gas cooktops and electric ovens. "
    "Model number 790.42603xxx with serial numbers ranging from "
    "AF42500601 through AF43000916 and model number 790.42613xxx "
    "with serial numbers ranging from AF42500541 through AF43103647 "
    "are included."
)
nlp = spacy.load('en_core_web_sm')
doc = nlp(sample_description)
doc.ents

(Sears, 790.42603xxx, 790.42613xxx, AF42500541)

In [7]:
#Perform a word-based match and a fuzzy-character match.
#Return True only when BOTH the token sets share a word and the fuzzy
#string score passes the given threshold.
#NOTE(review): if "either test is enough" was the intent, the final
#combination must be an `or`; the code has always required both.
def fuzzy_match(reference_strings, comparison_strings, threshold=80):
    """Decide whether two token collections refer to the same thing.

    Parameters
    ----------
    reference_strings : list of str
        Tokens of the reference; joined with spaces for the fuzzy query.
    comparison_strings : iterable of str
        Candidate tokens; missing entries are ignored. A scalar missing
        value (None/NaN) is treated as "no candidates".
    threshold : int, default 80
        A fuzzy score must be strictly greater than this to count.

    Returns
    -------
    bool
        True when at least one token is shared AND at least one candidate
        scores above ``threshold`` with ``fuzz.token_set_ratio``.
    """
    #A whole missing cell (e.g. NaN) is not iterable; treat it as no candidates.
    if pd.api.types.is_scalar(comparison_strings) and pd.isnull(comparison_strings):
        return False
    candidates = [candidate for candidate in comparison_strings if not pd.isnull(candidate)]
    if not candidates:
        return False
    #Whole-word match. Checked first because it is cheap and, since both tests
    #must pass, a miss here decides the result without running the scorer.
    if not set(reference_strings).intersection(candidates):
        return False
    #Fuzzy string match: score every candidate against the joined reference.
    scored = process.extract(' '.join(reference_strings), candidates,
                             limit=len(candidates),
                             scorer=fuzz.token_set_ratio)
    return any(score > threshold for _, score in scored)

#Compare a reference string to all of the strings in the comparison column of
#a dataframe.
def matches_on_field(reference_string, search_set, comparison_column, threshold = 80):
    """Return the RecallIDs of the rows in ``search_set`` that match a key.

    Parameters
    ----------
    reference_string : str or pandas.Series
        '_'-joined tokens to look for. For a Series, its first value is used.
    search_set : pandas.DataFrame
        Must contain 'RecallID' and ``comparison_column``.
    comparison_column : str
        Column of ``search_set`` holding each row's token list.
    threshold : int, default 80
        Forwarded to ``fuzzy_match``.

    Returns
    -------
    list
        'RecallID' values of the matching rows, in row order; ``[]`` when the
        reference is missing or empty.
    """
    if isinstance(reference_string, pd.Series):
        reference_string = reference_string.values[0]
    if pd.isnull(reference_string) or not reference_string:
        return []
    reference_tokens = reference_string.split('_')
    #Walk the two columns in lockstep instead of rebuilding a row Series per
    #position with iloc, and pass the caller's threshold through.
    row_pairs = zip(search_set['RecallID'], search_set[comparison_column])
    return [recall_id for recall_id, comparison_tokens in row_pairs
            if fuzzy_match(reference_tokens, comparison_tokens, threshold=threshold)]

#Look for matches between reports and candidate recalls based on a column.
def match_candidates(matching_reports, key_column, recalls, recall_candidate_column, label):
    """Add a ``'<label>_ids'`` column listing matching recall ids per report.

    All rows of ``matching_reports`` are treated as one group: the key and
    the candidate recall ids are read from the first row only.

    Parameters
    ----------
    matching_reports : pandas.DataFrame
        Non-empty group of reports; needs ``key_column``,
        ``recall_candidate_column`` and ``'clean_<label>'``.
    key_column : str
        Column holding the group key; an empty key means nothing to match.
    recalls : pandas.DataFrame
        Needs 'RecallID' and ``'clean_<label>'``.
    recall_candidate_column : str
        Column holding the list of candidate RecallIDs for the group.
    label : str
        Field to compare, e.g. 'product'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``matching_reports`` with ``'<label>_ids'`` added; each cell
        is a (possibly empty) list of RecallIDs.
    """
    #Work on a copy so the chunk handed in by groupby/apply is not mutated.
    matching_reports = matching_reports.copy()
    ids_column = label + '_ids'
    first_report = matching_reports.iloc[0]
    label_key = first_report.loc[key_column]
    candidate_recall_ids = first_report.loc[recall_candidate_column]
    if not label_key or not isinstance(candidate_recall_ids, list):
        #No usable candidates: every report gets an empty id list. Storing the
        #group size here would make it look like a RecallID downstream.
        matching_reports[ids_column] = pd.Series(
            [[] for _ in range(len(matching_reports))], index=matching_reports.index)
        return matching_reports
    candidate_recalls = recalls[recalls['RecallID'].isin(candidate_recall_ids)]
    matching_reports[ids_column] = matching_reports['clean_'+label].apply(
        lambda tokens: matches_on_field('_'.join(tokens), candidate_recalls, 'clean_'+label))
    return matching_reports

#Look for a nearly-matching specifier in the candidate recalls from the specifiers in
#the reports.
def match_specifiers(report, recalls, candidate_column, spec_column='specifiers', threshold=80):
    """Pick the candidate recall whose specifiers match the report's.

    Parameters
    ----------
    report : pandas.Series (one report row)
        Needs 'Model Name or Number', 'Serial Number', 'UPC' and
        ``candidate_column``.
    recalls : pandas.DataFrame
        Needs 'RecallID' and ``spec_column``.
    candidate_column : str
        Field of ``report`` holding a list of candidate RecallIDs or a single
        non-zero integer id. Anything else (e.g. '' or NaN) means no candidates.
    spec_column : str, default 'specifiers'
        Column of ``recalls`` holding each recall's list of specifier strings.
    threshold : int, default 80
        A ``fuzz.token_set_ratio`` score must be strictly above this to match.

    Returns
    -------
    int
        The matching RecallID, the smallest one (after a warning) when several
        match, or 0 when nothing matches.
    """
    spec_fields = ('Model Name or Number', 'Serial Number', 'UPC')
    #str() guards against specifier fields that were not loaded as text.
    reported_specs = [str(report[field]).lower() for field in spec_fields
                      if not pd.isnull(report[field])]
    candidates = report[candidate_column]
    if isinstance(candidates, (list, tuple, set)):
        candidates = list(candidates)
    elif (isinstance(candidates, (int, np.integer))
          and not isinstance(candidates, bool) and candidates):
        candidates = [candidates]
    else:
        #'' / NaN / None / 0: membership tests on these raise or mean nothing.
        candidates = []
    if not candidates or not reported_specs:
        return 0
    candidate_recalls = recalls[recalls['RecallID'].isin(candidates)]
    if len(candidate_recalls) == 0:
        return 0

    def has_matching_spec(recall_specs):
        #True when any reported specifier scores above threshold against any
        #of this recall's specifiers.
        if not isinstance(recall_specs, (list, tuple, set)) or not recall_specs:
            return False
        choices = list(recall_specs)
        return any(score > threshold
                   for reported in reported_specs
                   for _, score in process.extract(reported, choices,
                                                   scorer=fuzz.token_set_ratio))

    reported_spec_matches = candidate_recalls[spec_column].apply(has_matching_spec)
    definite_match = candidate_recalls['RecallID'][reported_spec_matches]

    if len(definite_match) > 1:
        warnings.warn('More than one "unique" match found')
        return definite_match.min()
    elif len(definite_match) > 0:
        return definite_match.values[0]
    else:
        return 0

In [8]:
#Label the data with the matching recall.
pandarallel.initialize()

#Step 1: for every distinct brand key, find the recalls whose brand matches.
brand_keys = pd.Series(sorted(set(reports['brand_key'])))
brand_candidates = pd.DataFrame({
    'brand_key': brand_keys,
    'brand_candidates': brand_keys.parallel_apply(matches_on_field,
                                                  search_set=recalls,
                                                  comparison_column='clean_brand'),
})

#Step 2: attach those candidates to each report via its brand key; reports
#left without a value get '' in place of NaN.
brands = reports.join(brand_candidates.set_index('brand_key'), on='brand_key')
brands['brand_candidates'] = brands['brand_candidates'].fillna(value='')

#Step 3: per brand group, narrow the brand candidates down by product.
product_match_kwargs = dict(key_column='brand_key',
                            recalls=recalls,
                            recall_candidate_column='brand_candidates',
                            label='product')
prod_candidates = (brands
                   .groupby('brand_key', as_index=False)
                   .parallel_apply(match_candidates, **product_match_kwargs))

#Step 4: per report, pick the recall whose specifiers match, then save.
prod_candidates['labels'] = prod_candidates.parallel_apply(match_specifiers,
                                                           axis=1,
                                                           recalls=recalls,
                                                           candidate_column='product_ids')
prod_candidates.to_csv('labeled_data.csv')

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.






































































































































