In [None]:
# Jack Morgan
# CSCI182
# 00001360442

# Install required packages
import sys
!{sys.executable} -m pip install requests
!{sys.executable} -m pip install beautifulsoup4
!{sys.executable} -m pip install apyori
!{sys.executable} -m pip install fuzzywuzzy
!{sys.executable} -m pip install python-Levenshtein


In [32]:
'''
Generates URLs for scraping
Uses the product directory on CPID
Saves links to 'product_links.txt'
'''

import re
import requests
from bs4 import BeautifulSoup

# Generate URLs for every search page.
# Search pages are organized by Product Type.

# Template URL, filter term is replaced for each letter A-Z (and 0-9)
base_url = "https://www.whatsinproducts.com"
url_template = "https://www.whatsinproducts.com/types/index/TypeLanguage.search:search/TypeLanguage.filter:{}/TypeLanguage.lang_id:1"
search_filters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0-9']

# Seconds to wait for the server before giving up on a request.
# Without a timeout, requests.get() can block forever on an unresponsive server.
request_timeout = 30

# Request each search page, scrape available product categories and gather individual product URLs
product_categories = list()
product_links = list()
for search_filter in search_filters:

    links_on_page = 0

    # Make request
    url = url_template.format(search_filter)
    print("Requesting: {}".format(url))
    try:
        r = requests.get(url, timeout=request_timeout)
    except requests.RequestException as err:
        # Connection problem or timeout: report it and move on to the next page
        print("\tREQUEST FAILED\n\tURL: {}\n\tERROR: {}\n".format(url, err))
        continue
    print("Status Code: {}\n".format(r.status_code))

    # Ensure page properly responds
    if r.status_code != 200:
        print("\tREQUEST FAILED\n\tURL: {}\n\tSTATUS: {}\n".format(url, r.status_code))
        continue

    # Create BS object for page
    # NOTE(review): no parser is named, so bs4 uses whichever one is installed;
    # consider pinning one once the choice has been confirmed against the site.
    soup = BeautifulSoup(r.text)

    # Product Type names are held in divs with class "list_category_name evenHead/oddHead"
    for prod_type in soup.find_all("div", {"class": ["list_category_name oddHead", "list_category_name evenHead"]}):
        # get_text() also copes with divs that hold more than one child node,
        # where .string would be None
        product_categories.append(prod_type.get_text().strip())

    # Product results are stored in divs with name "result"
    for prod_result in soup.find_all("div", {"name": "result"}):

        # Find links in results (searched directly on the tag, no re-parse needed)
        for link in prod_result.find_all("a"):

            # Anchors without an href carry no product URL
            href = link.get("href")
            if href is None:
                continue

            # Get href, removing html tags
            prod_link = base_url + href.replace("<p>", "").replace("</p>", "")
            # Remove all other whitespace
            prod_link = "".join(prod_link.split())

            # Remove unneeded links
            # Old or discontinued products will have "color:#a9a9a9" in the href, greying out the link
            # Manufacturer links are not needed at this step
            if "color:#a9a9a9" in prod_link or "manufacturers" in prod_link:
                continue

            # Some links are noisy and contain tags
            # Source: https://stackoverflow.com/questions/8784396/how-to-delete-the-words-between-two-delimiters
            if "<" in prod_link:
                prod_link = re.sub(r'<.+?>', '', prod_link)

            links_on_page += 1
            product_links.append(prod_link)
    print("{} Links on page {}".format(links_on_page, search_filter))

# Drop duplicate links
product_links = set(product_links)

print("{} product links found\nWriting to 'product_links.txt'".format(len(product_links)))

# Write in sorted order so repeated runs produce the same file
with open("product_links.txt", "w") as f:
    for link in sorted(product_links):
        f.write(link)
        f.write("\n")

Requesting: https://www.whatsinproducts.com/types/index/TypeLanguage.search:search/TypeLanguage.filter:A/TypeLanguage.lang_id:1


KeyboardInterrupt: 

In [33]:
'''
Creates a file which translates CAS numbers to chemical names
Based on CPID data
Output written to 'cas.csv'
'''

import requests
from bs4 import BeautifulSoup

# Find all chemical names and make a file to translate CAS ID to chemical names

# Template URL, set up in same way as product scraping
base_url = "https://www.whatsinproducts.com/chemicals/index/Chemical.filter:{}/Chemical.lang_id:1"
search_filters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0-9']

# Seconds to wait for the server before giving up on a request
request_timeout = 30

# CAS ID -> Chemical Name
cas_dict = dict()
total_cas = 0

# Make requests
for search_filter in search_filters:
    url = base_url.format(search_filter)
    try:
        r = requests.get(url, timeout=request_timeout)
    except requests.RequestException as err:
        # Connection problem or timeout: skip this page instead of aborting the run
        print("Requesting '{}', FAILED: {}".format(search_filter, err))
        continue
    print("Requesting '{}', Status {}".format(search_filter, r.status_code))

    # Create BS object
    soup = BeautifulSoup(r.text)

    # Pages without a results container have nothing to collect
    results = soup.find('div', {'class': 'rslt_prd_hldr'})
    if results is None:
        continue

    for row in results.find_all('tr', {'class': 'odd'}):
        cas_cell = row.find('td', {'class': 'brnd_srch_manuf'})
        name_link = row.find('a')
        if cas_cell is None or name_link is None:
            continue

        # Get CAS ID, remove hyphens and surrounding whitespace
        cas = cas_cell.text.replace("-", "").strip()

        # Keep only purely numeric IDs; int() raises ValueError for anything
        # else, including the empty string
        try:
            int(cas)
        except ValueError:
            continue

        # Commas are removed from the name so the hand-written CSV stays two columns wide
        cas_dict[cas] = name_link.text.replace(',', '').replace('\n', '').strip().lower()
        total_cas += 1

# Write to file
with open('cas.csv', 'w') as csvfile:
    csvfile.write("CAS,CHEM\n")
    for key, val in cas_dict.items():
        csvfile.write("{},{}\n".format(key, val))
print("Written to cas.csv")
    

Requesting 'A', Status 200
Requesting 'B', Status 200
Requesting 'C', Status 200
Requesting 'D', Status 200
Requesting 'E', Status 200
Requesting 'F', Status 200
Requesting 'G', Status 200
Requesting 'H', Status 200
Requesting 'I', Status 200
Requesting 'J', Status 200
Requesting 'K', Status 200
Requesting 'L', Status 200
Requesting 'M', Status 200
Requesting 'N', Status 200
Requesting 'O', Status 200
Requesting 'P', Status 200
Requesting 'Q', Status 200
Requesting 'R', Status 200
Requesting 'S', Status 200
Requesting 'T', Status 200
Requesting 'U', Status 200
Requesting 'V', Status 200
Requesting 'W', Status 200
Requesting 'X', Status 200
Requesting 'Y', Status 200
Requesting 'Z', Status 200
Requesting '0-9', Status 200
Written to cas.csv


In [34]:
'''
Creates a directory of CAS numbers for harmful substances
Based off NIOSH data
Output written to 'cas_info.csv'
'''

import requests
from bs4 import BeautifulSoup

# Scrape "Pocket Guide" site for CAS numbers considered dangerous

# Seconds to wait for the server before giving up on the request
request_timeout = 30

# Make request
r = requests.get("https://www.cdc.gov/niosh/npg/npgdcas.html", timeout=request_timeout)
# Stop here on an error response rather than overwriting cas_info.csv
# with whatever an error page happens to contain
r.raise_for_status()

# Create soup object
soup = BeautifulSoup(r.text)

cas = list()
# The first row is the header, so start from the second one
for row in soup.find_all('tr')[1:]:
    cells = row.find_all("td")
    name_link = row.find('a')
    # Rows without data cells or without a chemical link cannot be used
    if not cells or name_link is None:
        continue

    # .string is None when the tag does not hold exactly one piece of text
    cas_text = cells[0].string
    name_text = name_link.string
    if cas_text is None or name_text is None:
        continue

    # Assemble CAS number from string
    # Some strings will have text and hyphens
    cas_string = ''.join(c for c in cas_text if c in '0123456789')
    # A cell without any digit holds no CAS number
    if not cas_string:
        continue

    # Commas are removed from the name so the hand-written CSV stays two columns wide
    chem_name = name_text.strip().replace(",", "")
    cas.append([cas_string, chem_name])

# Write to file
with open("cas_info.csv", "w") as f:
    for cas_row in cas:
        f.write("{}\n".format(','.join(cas_row)))
        

In [None]:
'''
Generates a CSV file with headers, use before scraping individual product information
Output written to 'prod_details.csv'
'''

import csv

# Create product CSV file

# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it some platforms write an extra "\r" on every row.
with open("prod_details.csv", "w", newline='') as csvfile:
    fieldnames = ['Name', 'Categories', 'Usage', 'Form', 'Manufacturer', 'Chemicals', 'URL']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # Only the header row is written here; product rows are appended later
    writer.writeheader()

In [None]:
'''
Main scraping script
Gathers information on products, read from 'product_links.txt'
Appends specified fields to 'prod_details.csv'
'''

import csv
import requests
from bs4 import BeautifulSoup

# Use product URLs to scrape information for each individual product
# Desired Fields:
# Product Name, Categories (| separated), Usage, Form, Manufacturer, Chemicals (| separated)

fieldnames = ['Name', 'Categories', 'Usage', 'Form', 'Manufacturer', 'Chemicals', 'URL']

# Seconds to wait for the server before giving up on a request
request_timeout = 30

# Open remaining product links
# Each line is stripped so the trailing newline does not end up in the
# request URL or in the CSV; blank lines are ignored
with open("product_links.txt", "r") as f:
    urls = [line.strip() for line in f if line.strip()]

for i, url in enumerate(urls):
    # Fields stay empty when the page does not provide them
    data = {
        "Name": '',
        "Categories": '',
        "Usage": '',
        "Form": '',
        "Manufacturer": '',
        "Chemicals": '',
        "URL": url
    }

    # Make request
    print("URL {}/{}".format(i + 1, len(urls)))
    try:
        r = requests.get(url, timeout=request_timeout)
    except requests.RequestException:
        # Connection problem or timeout: handled like a non-200 response below
        r = None
    print("-"*40)

    # Catch sites which fail or don't respond with 200
    if r is None or r.status_code != 200:
        print("ERROR, SITE WRITTEN TO 'ERRORS.TXT'")
        with open("errors.txt", "a") as e:
            e.write(url)
            e.write("\n")
        continue

    # Create BeautifulSoup object
    soup = BeautifulSoup(r.text)

    # Find product name
    name_tag = soup.find('h1', {'class': 'srch_hd green_hd'})
    if name_tag is not None:
        data["Name"] = name_tag.text.strip().lower().replace(",", "")

    # Find product categories from product page
    categories = list()
    for category_div in soup.find_all('div', {"class": "breadcrumbs"}):
        # Split dirty string on double colon delimiter
        # Ex: Home Maintenance :: Additive :: paint
        for category in str(category_div.text).split('::'):
            # Remove L+R whitespace, convert to lowercase and remove commas
            categories.append(category.strip().lower().replace(",", ""))

    # Remove duplicates and join on pipe; dict.fromkeys keeps first-seen order
    # so the column is the same on every run
    data["Categories"] = "|".join(dict.fromkeys(categories))

    # Find usage and form data
    for row in soup.find_all('div', {'class': 'brand_clasi_feild'}):
        # Iterate through the lines of the div
        for text_line in row.text.split("\n"):
            text_line = text_line.strip()
            for field in ("Usage", "Form"):
                if text_line.startswith(field):
                    # The value is the text between the first and second colon
                    parts = text_line.split(":")
                    data[field] = parts[1].strip() if len(parts) > 1 else ""
                    break

    # Get manufacturer
    manuf_div = soup.find('div', {'class': 'manuf_add'})
    manuf_name = manuf_div.find('strong') if manuf_div is not None else None
    if manuf_name is not None:
        data["Manufacturer"] = manuf_name.text.strip()

    # Get chemical contents
    chemicals = list()
    for chem in soup.find_all('td', {'class': 'brnd_srch_cas_no'}):
        cas = chem.text.strip().replace("-", "")
        # Keep only numeric CAS IDs; int() raises ValueError for anything else
        try:
            cas = int(cas)
        except ValueError:
            continue
        chemicals.append(str(cas))
    data["Chemicals"] = "|".join(chemicals).strip()

    # Write product data to csv
    # Appending after every product keeps finished rows on disk if a later
    # page fails; newline='' hands line-ending control to the csv module
    with open("prod_details.csv", "a", newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow(data)
    

In [35]:
'''
Implementation of the Apriori Algorithm on ingredient sets
'''

import csv
from apyori import apriori

# Find frequent chemical itemsets using Apriori Algorithm

# Read prod_details.csv to create list of lists
itemsets = list()  # One list of CAS ids per product
products = dict()  # Product name -> list of CAS ids
with open("prod_details.csv", "r", newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        chems = row['Chemicals'].lower().split("|")
        products[row['Name']] = chems
        itemsets.append(list(chems))

# Read CAS data to translate names
total_cas = list()
cas_dict = dict()
with open("cas.csv", "r") as f:
    next(f, None)  # Skip header row
    for line in f:
        fields = line.split(',')
        cas_id = str(int(fields[0]))
        total_cas.append(cas_id)
        # Strip the line ending so names print on a single line
        cas_dict[cas_id] = fields[1].strip()

# Read CSV of dangerous CAS numbers
# A set keeps the membership tests below constant-time
with open("cas_info.csv", "r") as f:
    hazards = {line.split(',')[0] for line in f}

# Associate CAS number with '- hazardous' or empty string
cas_danger = dict()
for cas in total_cas:
    cas_danger[cas] = ' - Hazardous' if cas in hazards else ''

min_sup = 0.025
min_conf = 0.40
total_rules = 0
final_rules = list()  # Not populated by this cell

print("Creating association rules...")
association_results = list(apriori(itemsets, min_support=min_sup, min_confidence=min_conf, min_length=2))

filtered = ['1', '7732185']  # Filters perfumes, water

for i, item in enumerate(association_results):
    # First index of the inner list
    # Contains base item and add item
    pair = item[0]
    items = [x for x in pair if x not in filtered]
    # Remove single item and unknown results
    if len(items) <= 1:
        continue

    # NOTE(review): the left/right split below follows the iteration order of
    # item[0], while confidence and lift are read from item[2][0]; confirm
    # against apyori's record layout that these describe the same rule.
    r_side_ids = items[1:]
    r_side_names = list()
    for r_side_item in r_side_ids:
        # Fall back to the raw CAS id when cas.csv has no name for it
        r_side_name = cas_dict.get(r_side_item, r_side_item).strip()
        r_side_tag = cas_danger.get(r_side_item, ' - Hazardous' if r_side_item in hazards else '')
        r_side_names.append("{}{}\n".format(r_side_name, r_side_tag))

    # Find products containing all of these chemicals
    assoc_prods = [
        name for name, prod_chems in products.items()
        if all(chem in prod_chems for chem in items)
    ]

    print("Rule: " + cas_dict.get(items[0], items[0]) + " -> " + " | ".join(r_side_names))
    # Second index of the inner list
    print("Support: " + str(item[1] * 100))

    # Third index of the inner list holds the rule statistics;
    # its first entry carries confidence (index 2) and lift (index 3)
    print("Confidence: " + str(item[2][0][2] * 100))
    print("Lift: " + str(item[2][0][3]))
    print("Product Examples:\n{}".format("\n".join(assoc_prods[:5])))
    print("=====================================")
    total_rules += 1

print("Total rules: {}".format(total_rules))
print("MinSup: {}%".format(min_sup * 100))
print("MinConf: {}%".format(min_conf * 100))

Creating association rules...
Rule: propane
 -> butane - Hazardous

Support: 6.061039239511114
Confidence: 95.17396184062851
Lift: 11.449517627792204
Product Examples:
tag body spray for men spin it
sprayway marvelous mango metered air freshener sw-116 aerosol-05/25/2015
rust-oleum stops rust gloss protective enamel antique white aerosol-05/10/2017
rust-oleum stops rust outdoor metallic burnished brass aerosol-11/15/2016
krylon covermaxx rust protection paint & primer gloss gum drop-9122 aerosol-10/28/2016
Rule: titanium dioxide
 -> calcium carbonate (limestone) - Hazardous

Support: 2.5016081766850116
Confidence: 49.857549857549856
Lift: 3.4704327366018908
Product Examples:
aqua mix grout colorant all colors-07/14/2017
sherwin-williams harmony interior acrylic latex zero voc eg-shel extra white b09w01051-03/02/2018
national gypsum kal kote f veneer plaster
sherwin-williams eminence high performance ceiling paint flat bright white a27w02815-03/02/2018
ge groov exterior-interior caulk m

In [36]:
'''
Implementation of the cosine similarity metric
Reads URLs from 'urls.txt'
Current batch of URLs is of the 10 products labelled as exfoliants
URLS MUST BE COPIED FROM PROD_DETAILS.CSV
'''

import csv
import itertools
import math

# Load everything needed to compare the listed products pairwise:
# the URLs, the CAS vocabulary, the hazardous CAS ids and one 0/1 vector per product

# URLs of the products to compare, one per line
with open('urls.txt', 'r') as f:
    urls = [raw.strip() for raw in f.readlines()]

# CAS vocabulary (one vector dimension per entry) and CAS -> chemical name lookup
cas = []
cas_dict = {}
with open('cas.csv', 'r') as f:
    for line_no, raw in enumerate(f.readlines()):
        # The first line is the header
        if line_no == 0:
            continue
        fields = raw.split(',')
        cas_key = str(int(fields[0]))
        cas.append(cas_key)
        # The name is stored exactly as read, line ending included
        cas_dict[cas_key] = fields[1]

# First column of cas_info.csv: CAS ids flagged as hazardous
with open('cas_info.csv', 'r', newline='') as f:
    hazardous = [raw.split(',')[0] for raw in f.readlines()]

# Per-URL lookups built from prod_details.csv
cas_vectors = {}  # URL -> 0/1 vector over `cas`
prod_name = {}    # URL -> product name
prod_chems = {}   # URL -> list of CAS ids
with open('prod_details.csv', 'r', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
        row_url = row['URL'].strip()

        # Only products named in urls.txt are of interest
        if row_url in urls:
            prod_name[row_url] = row['Name']

            prod_cas = row["Chemicals"].split('|')
            prod_chems[row_url] = prod_cas

            # 1 where the product contains the chemical, 0 otherwise
            cas_vectors[row_url] = [1 if chem in prod_cas else 0 for chem in cas]

# Every unordered pair of URLs
url_pairs = list(itertools.combinations(urls, 2))

# Function for cosine similarity
def cos_sim(x, y):
    """Return the angle in degrees between the vectors *x* and *y*.

    A smaller angle means the vectors are more alike: 0 for vectors pointing
    the same way, 90 for vectors with nothing in common. The vectors are
    expected to have the same length; extra trailing components of the
    longer one do not contribute to the dot product.

    Returns 90 when either vector has zero length, since the angle is
    undefined in that case.
    """
    # Dot product of x and y
    x_dot_y = sum(a * b for a, b in zip(x, y))

    # Squared vector lengths
    sq_len_x = sum(val**2 for val in x)
    sq_len_y = sum(val**2 for val in y)

    if not sq_len_x or not sq_len_y:
        return 90

    # A single square root over the product loses less precision than
    # multiplying two separately rounded roots
    cosine = x_dot_y / math.sqrt(sq_len_x * sq_len_y)

    # Rounding can still leave the ratio a hair outside [-1, 1], which
    # math.acos rejects with ValueError, so clamp it first
    cosine = max(-1.0, min(1.0, cosine))

    return math.degrees(math.acos(cosine))

# Score every URL pair; keys are positions in url_pairs
results = {
    index: cos_sim(cas_vectors[first], cas_vectors[second])
    for index, (first, second) in enumerate(url_pairs)
}

# Ascending order puts the smallest angle, i.e. the most similar pair, first
sort_results = sorted(results.items(), key=lambda entry: entry[1])

# Print each pair with its angle and the ingredient list of both products
for result_n, (pair_index, angle) in enumerate(sort_results, start=1):
    prod_name_list = []
    result_chems = []
    for url in url_pairs[pair_index]:
        prod_name_list.append(prod_name[url])

        contents = []
        for cas_id in prod_chems[url]:
            # Products without ingredient data have a single empty id
            if not cas_id:
                continue
            cas_str = cas_dict[cas_id]
            if cas_id in hazardous:
                # Move the line ending behind the hazard marker
                cas_str = cas_str.strip() + " - Hazardous\n"
            contents.append(cas_str)
        result_chems.append(contents)

    print("{}.".format(result_n))
    print("SIMILARITY ANGLE: {}deg\n".format(round(angle, 2)))
    print("Product 1: {}".format(prod_name_list[0]))
    print("Contents:\n{}".format("".join(result_chems[0])))
    print("+++")
    print("Product 2: {}".format(prod_name_list[1]))
    print("Contents:\n{}".format("".join(result_chems[1])))
    print("-"*100)
    

1.
SIMILARITY ANGLE: 44.85deg

Product 1: olay fresh effects bead me up exfoliating cleanser-02/13/2015
Contents:
acetic acid 2-chloro- sodium salt (1:1) reaction products with 45-dihydro-2-undecyl-1h-imidazole-1-ethanol and sodium hydroxide
sodium myristoyl sarcosinate
sodium trideceth sulfate
citric acid
water
sodium lauroamphoacetate
glycerin - Hazardous
sorbitol
acrylates copolymer
citric acid
polyethylene
fragrance(s)/perfume(s)
titanium dioxide - Hazardous
propylene glycol
peg-120 methyl glucose trioleate
peg-100
disodium edta
sodium ascorbyl phosphate
camellia sinensis extract
niacinamide (vitamin b)
panthenol
tocopheryl acetate
ferric ammonium ferrocyanide
methylchloroisothiazolinone
methylisothiazolinone

+++
Product 2: olay fresh effects oots deep pore clean plus exfoliating scrub essence of honeysuckle and white tea-03/10/2015
Contents:
acetic acid 2-chloro- sodium salt (1:1) reaction products with 45-dihydro-2-undecyl-1h-imidazole-1-ethanol and sodium hydroxide
sodium myris

In [37]:
'''
Test of fuzzy matching on company names and product categories
Fortunately the names listed in CPID are clean and consistent, so this ended up being somewhat useless
'''

import csv
import itertools
from fuzzywuzzy import fuzz

# Fuzzy-match manufacturer names, then category labels, against each other

# Collect every manufacturer and every category label from the product table
companies = []
categories = []
with open('prod_details.csv', 'r', newline='') as f:
    for row in csv.DictReader(f):
        companies.append(row['Manufacturer'])
        categories.extend(row['Categories'].split('|'))

# Manufacturers: every unordered pair of distinct names
companies = set(companies)
man_pairs = list(itertools.combinations(companies, 2))

# Print the pairs whose similarity ratio is above the threshold, one per line
similar_companies = []
for first, second in man_pairs:
    if fuzz.ratio(first, second) > 80:
        similar_companies.append(str((first, second)))
print('\n'.join(similar_companies))

# Categories: same procedure
categories = set(categories)
cat_pairs = list(itertools.combinations(categories, 2))

similar_categories = []
for first, second in cat_pairs:
    if fuzz.ratio(first, second) > 80:
        similar_categories.append(str((first, second)))
print('\n'.join(similar_categories))

('Eclectic Products, Inc.', 'Elmers Products, Inc.')
('DAP Products, Inc.', 'Avon Products, Inc.')
('DAP Products, Inc.', 'PMS Products Inc.')
('DAP Products, Inc.', 'Fomo Products, Inc.')
('DAP Products, Inc.', '303 Products, Inc.')
('PPG Industries', 'TR Industries')
('Avon Products, Inc.', 'Fomo Products, Inc.')
('Avon Products, Inc.', '303 Products, Inc.')
('Avon Products, Inc.', 'Bonide Products, Inc.')
('Avon Products, Inc.', 'Beaumont Products, Inc.')
('American Formulating & Manufacturing (AFM)', 'American Formulating & Manufacturing')
('GoJo Industries, Inc.', 'Scotwood Industries, Inc.')
('GoJo Industries, Inc.', 'CRC Industries, Inc.')
('GoJo Industries, Inc.', 'Medo Industries, Inc.')
('Howard Products, Inc.', 'Method Products, Inc.')
('Howard Products, Inc.', 'Cramer Products, Inc.')
('Howard Products, Inc.', 'Bonide Products, Inc.')
('Envirocon Technologies, Inc.', 'Fiberlock Technologies, Inc.')
('Envirocon Technologies, Inc.', 'Virox Technologies Inc.')
('Roebic Laborat

In [39]:
'''
Prints general information about common manufacturers, 
what kind of products they make, how many of their products contain harmful ingredients
'''

import csv

# Perform basic analysis on company information
# For the sake of simplicity, only the top 10 companies (by product count) are
# profiled, and 15 companies are listed at each end of the hazard ranking

# Read in product information
with open('prod_details.csv', 'r', newline='') as f:
    reader = csv.DictReader(f)

    # Fields of interest
    categories = list()  # Elements in form [company, [categories]]
    products = list()  # Elements in form [company, [ingredients]]

    for row in reader:
        categories.append([row['Manufacturer'], row['Categories'].split('|')])
        products.append([row['Manufacturer'], row['Chemicals'].split('|')])

# Companies with the most products on the market
company_dict = dict()
for manufacturer, _ in products:
    company_dict[manufacturer] = company_dict.get(manufacturer, 0) + 1
# Sort on most common
sort_results = sorted(company_dict.items(), key=lambda x: x[1], reverse=True)
# Print first 10 results
top_companies = list()
for name, count in sort_results[:10]:
    print("{} -> {} products on the market".format(name, count))
    top_companies.append(name)

print("-"*40)

# Common product categories for prolific companies
category_dict = {company: dict() for company in top_companies}

# Create counts for each product category in each company
for manufacturer, product_categories in categories:
    if manufacturer not in category_dict:
        continue
    counts = category_dict[manufacturer]
    for category in product_categories:
        counts[category] = counts.get(category, 0) + 1

# Display results
# Products can have multiple categories
for company, counts in category_dict.items():
    sort_results = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    print(company)
    for category, count in sort_results[:10]:
        print("{} -> {} products".format(category, count))
    print("-"*40)

# Companies most/least likely to make products containing harmful chemicals
# First collect the chemicals considered harmful; a set keeps the
# per-ingredient membership test below constant-time
with open('cas_info.csv', 'r', newline='') as f:
    hazardous = {line.split(',')[0] for line in f}

# Products per manufacturer were already counted above
total_prods = dict(company_dict)

# Count, per manufacturer, the products with at least one hazardous chemical
hazard_prods = dict.fromkeys(total_prods, 0)
for manufacturer, chemicals in products:
    if any(chem in hazardous for chem in chemicals):
        hazard_prods[manufacturer] += 1

# Proportion of products with harmful chemicals per company
proportions = list()
for key in total_prods:
    # Only consider companies with 50+ products
    if total_prods[key] >= 50:
        proportions.append((key, hazard_prods[key] / total_prods[key]))

# Sort, then print the 15 highest proportions
sort_results = sorted(proportions, key=lambda x: x[1], reverse=True)
for result in sort_results[:15]:
    print("{}\t-\t{}% of products contain harmful chemicals".format(result[0], round(result[1]*100, 2)))
print("-"*40)
# Flip the ranking and print the 15 lowest proportions
sort_results.reverse()
for result in sort_results[:15]:
    print("{}\t-\t{}% of products contain harmful chemicals".format(result[0], round(result[1]*100, 2)))
print("-"*40)


Procter & Gamble Co. -> 1461 products on the market
SC Johnson, Inc. -> 834 products on the market
Reckitt, Inc. (Reckitt Benckiser) -> 547 products on the market
Unilever -> 484 products on the market
Clorox Company, The -> 298 products on the market
Sherwin-Williams Company -> 279 products on the market
Dial Corporation, The -> 247 products on the market
Glidden Co., The -> 226 products on the market
Rust-Oleum Corp. -> 213 products on the market
Church & Dwight Co., Inc. -> 208 products on the market
----------------------------------------
Procter & Gamble Co.
personal care -> 975 products
inside the home -> 439 products
hair care -> 345 products
personal cleanliness -> 255 products
laundry -> 218 products
men's products -> 200 products
detergent -> 163 products
fabric -> 131 products
conditioner -> 122 products
air freshener -> 120 products
----------------------------------------
SC Johnson, Inc.
inside the home -> 702 products
air freshener -> 253 products
cleaner -> 222 product