In [None]:
import pandas as pd
import json
from pprint import pprint
# Let pandas display up to 2000 characters per cell so long text columns are not truncated.
pd.options.display.max_colwidth = 2000

### Find all the names of the 34 unsafe ingredients

In [None]:
# Load the unsafe-ingredient table and preview the two name columns of its first row.
df_unsafe = pd.read_csv('unsafe.csv')
print(df_unsafe.shape)
df_unsafe[['ingredient', 'other_names']].head(1)

In [None]:
# Build lookup tables of ingredient names.
# A dict is a hash table: it maps a key to a value and answers "is this key
# present?" quickly. We only care about the keys (the names); every value is True.
# The tables are used later to check whether a word from an Amazon product
# description is a known unsafe-ingredient name.

import re

# Every primary ingredient name plus each of its comma-separated alternative names.
unsafe_name_hash = {}
for ingredient, other_names in zip(df_unsafe['ingredient'], df_unsafe['other_names']):
    all_names = [ingredient, *other_names.split(', ')]
    unsafe_name_hash.update(dict.fromkeys(all_names, True))
print(len(unsafe_name_hash))

# Normalized variant: lower-case, with every non-word character removed,
# e.g. "Chuan-wu" becomes "chuanwu".
# (\w is letters, digits and underscore; \W is anything NOT in the \w set.)
unsafe_name_hash_normalized = {
    re.sub(r'\W+', '', raw_name.lower()): True
    for raw_name in unsafe_name_hash
}
print(len(unsafe_name_hash_normalized))
print(list(unsafe_name_hash_normalized)[:10])

In [None]:
# Find the name made of the most whitespace-separated parts.
# For example, "chuan wu" has two parts, so its size is 2.

# Number of parts for every known name (iterating a dict iterates its keys).
name_sizes = {candidate: len(candidate.split()) for candidate in unsafe_name_hash}

# max() returns the first name that reaches the largest size; "" when there are no names.
longest_name = max(name_sizes, key=name_sizes.get, default="")
longest_name_len = name_sizes.get(longest_name, 0)
if longest_name_len == 0:
    # No name has any parts at all: report the empty name.
    longest_name = ""
print(f"longest name: {longest_name}")
print(f"num of parts: {longest_name_len}")

### Learning time: JSON (Amazon data is represented in this format)

- construct a sort of complicated dictionary
- save the dictionary to a json file or string
- load a dictionary from a json file or string

### instructions - run the instructions here
```
student = {"id":123, 
          "name": {"first_name": "Joe", "last_name": "Smith"},
          "courses": [
              {"name": "Biology", "teacher": "Charles Darwin"},
              {"name": "Physics", "teacher": "Albert Einstein"},
              {"name": "Math", "teacher": "Carl Friedrich Gauss"}
          ]}

type(student)

from pprint import pprint
pprint(student)
```

```
import json
s = json.dumps(student)

type(s)

o = json.loads(s)

type(o)
```

```
json.dump(student, open('/tmp/a.json', 'w'))

student2 = json.load(open('/tmp/a.json', 'r'))

```

In [None]:
# Peek at the first record of the Amazon sample file: print the first 300
# characters of the raw line, then try to parse the line as JSON.
# NOTE(review): the following cell parses the same file with ast.literal_eval,
# presumably because the records are not strict JSON -- confirm.
import json
amazon_health_json_fpath = "../data/amazon_sample_aconite.json"
asin_desc = {}
# Iterate the file lazily inside `with` so only the first line is read and the
# file handle is always closed (readlines() would load the whole file first).
with open(amazon_health_json_fpath) as amazon_file:
    for line in amazon_file:
        print(line[:300])
        record = json.loads(line)
        pprint(record)
        break

In [None]:
import ast   # ast: abstract syntax tree

asin_desc = {}
amazon_health_json_fpath = "../data/amazon_sample_aconite.json"

# Parse only the first line of the file as a Python literal and pretty-print it.
# `record` stays defined afterwards so the next cells can inspect its fields.
# The file is iterated lazily inside `with`, so a single line is read and the
# handle is always closed (readlines() would load the whole file first).
with open(amazon_health_json_fpath) as amazon_file:
    for line in amazon_file:
        record = ast.literal_eval(line)
        pprint(record)
        break

In [None]:
# Title field of the first record parsed in the previous cell.
record['title']

In [None]:
# Description field of the same record.
record['description']

In [None]:
# Categories field of the same record (a later cell indexes it as categories[0]).
record['categories']

<img src="http://ecx.images-amazon.com/images/I/413Tm3QXjBL._SY300_.jpg">

### download file meta_Health_and_Personal_Care.json

from: https://textmining.ischool.syr.edu/share/kelly/

save it to: firstdonoharm/data      (you may need to create a new folder "data")

In [None]:
import ast   # ast: abstract syntax tree

amazon_health_json_fpath = "../data/amazon_sample_aconite.json"
# amazon_health_json_fpath = "../data/meta_Health_and_Personal_Care.json"

# Map each product's ASIN to "<title> <description>", the text that is later
# scanned for unsafe-ingredient names. Each line of the file is parsed as one
# Python-literal record.
# A record without 'asin', 'categories', 'title' or 'description' raises KeyError.
asin_description = {}
# Stream the file line by line inside `with`: the handle is always closed and
# the whole file is never held in memory at once.
with open(amazon_health_json_fpath) as amazon_file:
    for line in amazon_file:
        record = ast.literal_eval(line)
        asin = record['asin']
        categories = record['categories']
        # Optional filter (disabled); for simplicity it only looks at the first category list:
        #     if not "Vitamins & Dietary Supplements" in categories[0]:
        #         continue

        title = record['title']
        desc = record['description']
        print(f"[TITLE] {title}\n[ASIN] {asin}\n[LINK] https://www.amazon.com/dp/{asin}\n\n{desc}\n\n")
        asin_description[asin] = title + " " + desc

### modify the above part to avoid the error 
```
missing_cnt = 0
for line in open(amazon_health_json_fpath).readlines():
    record = ast.literal_eval(line)
    asin = record['asin']
    categories = record['categories']
    if not "Vitamins & Dietary Supplements" in categories[0]:
        continue
        
    try:
        title = record['title']
        desc = record['description']
        asin_description[asin] = title + " " + desc
    except KeyError:
        missing_cnt += 1

print(f'missing_cnt: {missing_cnt}')
```

Also, add `%%time` at the beginning of a cell to find out how long the cell takes to execute


In [None]:
def find_unsafe_ingredient_name_in_amazon_text(desc, unsafe_name_hash, max_name_parts=1):
    """Return the unsafe-ingredient names that occur in a product text.

    The text is lower-cased and every non-word character is replaced by a
    space, then it is split into words. Each run of 1..max_name_parts
    consecutive words is joined without separators and looked up in
    unsafe_name_hash, so with max_name_parts=2 the text "Chuan-Wu" matches the
    normalized key "chuanwu".

    Parameters:
        desc: product text (e.g. title plus description).
        unsafe_name_hash: container of normalized names (lower-case, non-word
            characters removed) that supports the `in` operator.
        max_name_parts: largest number of consecutive words combined into one
            candidate name. The default of 1 checks single words only; values
            below 1 are treated as 1.

    Returns:
        List of distinct matched names, in order of first occurrence.
    """
    words = re.sub(r'\W', ' ', desc.lower()).split()
    max_name_parts = max(1, int(max_name_parts))

    found_names = {}  # dict keeps insertion order and drops duplicates
    for start in range(len(words)):
        for size in range(1, max_name_parts + 1):
            if start + size > len(words):
                break  # not enough words left for a candidate of this size
            candidate = ''.join(words[start:start + size])
            if candidate in unsafe_name_hash:
                found_names[candidate] = True
    return list(found_names)


# For every product, collect the unsafe-ingredient names found in its text and
# keep only the products with at least one match (ASIN -> list of names).
asin_unsafe_names_found = {}
for asin, desc in asin_description.items():
    unsafe_names_found = find_unsafe_ingredient_name_in_amazon_text(desc, unsafe_name_hash_normalized)
    # Fixed: this previously tested the misspelled `unsafe_unsafe_names_found`,
    # which raised NameError.
    if unsafe_names_found:
        asin_unsafe_names_found[asin] = unsafe_names_found

print(len(asin_unsafe_names_found))

In [None]:
# Display the whole ASIN -> matched-names mapping built in the previous cell.
asin_unsafe_names_found

In [None]:
# Turn the ASIN -> matched-names mapping into a two-column table and save it as CSV.
asins, names = list(asin_unsafe_names_found), list(asin_unsafe_names_found.values())
data = dict(asin=asins, unsafe_names=names)

df_amazon = pd.DataFrame(data)
df_amazon.to_csv('/tmp/a.csv', index=False)
print(df_amazon.shape)
df_amazon.head()

# HOMEWORK: 

Use the large Amazon data file you downloaded to rerun the code above and generate a CSV file,
then take a look at the results. They will probably need some improvement.
What can we do to improve the results?

### Generate an HTML page that contains links to relevant Amazon product page

For example, for the above product whose ASIN is B00008CMQ2,
we can construct a link: https://www.amazon.com/dp/B00008CMQ2

### Need more processing for special HTML entities

For example, in the original display
```
1. "&gt;" is ">"
2. "&egrave;" is "è" (which will be converted to letter 'e' after removing accent)
```

After HTML rendering:

1. "&gt;" is ">"
2. "&egrave;" is "è"


In [None]:
import html

# html.unescape converts HTML character references (here &egrave;) into the
# corresponding Unicode characters.
s = 'Bulbif&egrave;re'
s = html.unescape(s)
print(s)

In [None]:
import unidecode

# unidecode transliterates Unicode text to plain ASCII; it is applied here to
# the string `s` produced by the previous cell.
s = unidecode.unidecode(s)
print(s)

In [None]:
# `df` was never defined in this notebook (only `df_unsafe` is), so this cell
# failed on a fresh kernel. Work on a copy of the loaded table: the raw frame
# stays untouched and re-running the cell always gives the same result.
df = df_unsafe.copy()
# Decode HTML entities, then strip accents, in every other_names string.
df['other_names'] = df['other_names'].apply(lambda s: unidecode.unidecode(html.unescape(s)))
# NOTE(review): row 28 is presumably an entry that contains an HTML entity -- confirm.
df.iloc[28]['other_names']

### Generate an HTML page for easy browsing and exploration

A third way to make a string: a multi-line string with 
"""
your 
multi-line
str 
here
"""

In [None]:
# Page header: inline CSS plus the opening tag of the results table.
out = """
<style>
body {width: 960px; margin:auto; margin-top:10px; font-family:arial}
a {text-decoration: none; font-size:120%; white-space:nowrap}
table {border-collapse: collapse}
td {border-right: 1px solid #eee}
</style>
<table cellpadding=8>
"""

import re

# One table row per ingredient: running number, linked ingredient name,
# side effects, and alternative names.
# NOTE(review): values are inserted into the HTML as-is (no escaping) -- confirm
# the source columns never contain raw "<" or "&".
rows = []
for no, (ingredient, other_names, href, side_effects) in enumerate(
        zip(df.ingredient, df.other_names, df.href, df.side_effects), start=1):
    # Optional cleanup (disabled): remove spaces before commas/periods:
    #     side_effects = re.sub(r'\s+(,|\.)', r'\1', side_effects)
    rows.append(f"""<tr valign=top>
    <td>{no}
    <td align=right><a target=_blank href=\"{href}\">{ingredient}</a>
    <td>{side_effects}
    <td>{other_names}
    </tr>""")

# Join once instead of repeated string concatenation, and close the table.
out += "".join(rows) + "\n</table>\n"

html_outfile = '/tmp/a.html'
# `with` guarantees the page is flushed and the file handle is closed.
with open(html_outfile, 'w') as html_file:
    html_file.write(out)
html_outfile

## Learn Regular Expressions (optional)

check https://developers.google.com/edu/python/regular-expressions
for more on Regular Expressions

In [None]:
# Sample sentence with stray spaces before commas and periods.
s = "Aga is UNSAFE when taken by mouth . It sleepiness , confusion, dizziness  , delirium    , and death   ."
# Plain str.replace removes only ONE space before each comma; a run of several
# spaces gets one space shorter but does not disappear.
s.replace(' ,', ',')

In [None]:
import re  # re: Regular Expression

# ' +,' matches one or more spaces followed by a comma, so the whole run of
# spaces before each comma is removed.
re.sub(r' +,', ',', s)

### meta-characters, need to use \ to get rid of the special meaning
```

. ^ $ * + ? { [ ] \ | ( ) 

```

Their special meaning
```
. => match any character (except new line)
  e.g., a.c matches "abc", "a-c", and "a c"

^ => match the beginning

* => repeat the previous character or a block any times (that is, zero or more times)

+ => repeat the previous character or a block at least once

? => match or not match the previous character or a block

\ => escape the special meaning

| => or

( ) => define a block, or something you want to extract

[ ] => define a set of characters and match any one of them, e.g., [abc] or [a-z]

```

In [None]:
# An unescaped '.' matches ANY character, so this replaces every run of spaces
# plus the character that follows it with '.', not just spaces before periods.
re.sub(r' +.', '.', s)

In [None]:
# Escaping the dot (\.) makes it match a literal period only, so just the
# spaces before periods are removed.
re.sub(r' +\.', '.', s)

In [None]:
# Remove the spaces before either a period or a comma. The group (\.|,)
# captures which punctuation mark was matched and r'\1' puts that same mark
# back; a fixed '.' replacement would turn every matched comma into a period.
re.sub(r' +(\.|,)', r'\1', s)