In [None]:
import pandas as pd
# Let long text cells (such as the comma-separated name lists) show without truncation.
pd.set_option('display.max_colwidth', 2000)

In [None]:
# Load the "other names" table from CSV and preview its first rows.
df_other_names = pd.read_csv('ingredient_other_names.csv')
df_other_names.head()

In [None]:
# First, create a new feature, "num_names", with a lambda:
# each other_names value is one string whose names are separated by ", ".

df_other_names['num_names'] = df_other_names['other_names'].apply(
    lambda names: len(names.split(', '))
)
df_other_names.head()

In [None]:
# What is the largest number of other names in any row?

df_other_names.num_names.max()

In [None]:
# What is the smallest one?

df_other_names.num_names.min()

In [None]:
# What is the average (mean) number of other names?

df_other_names.num_names.mean()

In [None]:
# What about the median?
# For the sequence (3, 3, 4, 4, 4, 5, 5, 6, 50): mean is 9.3, median is 4, mode is 4.
# One extreme value (50) pulls the mean up but leaves the median unchanged.

df_other_names.num_names.median()

In [None]:
# The mode is the most frequent value; mode() returns a Series because there can be ties.
df_other_names.num_names.mode()

### Plot the distribution of num_names

In [None]:
%matplotlib inline

df_other_names['num_names'].hist(bins=100, figsize=(15,5))

In [None]:
# Smoothed density estimate of num_names; the small bw_method keeps the curve close to the data.
df_other_names.num_names.plot.kde(figsize=(10, 5), bw_method=.05)

### Sort a pandas dataframe or table

Sort the above table by the number of other names.

In [None]:
# The five rows with the fewest other names (ascending sort is the default).
df_other_names.sort_values(by='num_names').head(5)

In [None]:
# The three rows with the most other names.
df_other_names.sort_values(by='num_names', ascending=False).head(3)

### Find out the most common words used in the names

We want to calculate the frequency or the occurrences of each word

In [None]:
# A sample comma-separated list of names to practise word counting on.
s = "Abebrodstrae, Adansonia, Adansonia bahoba, Adansonia baobab, Adansonia digitata, Adansonia situla, Adansonia somalensis, Adansonsia sphaerocarpa, Adansonia sulcata, Adansonie d' Afrique, Affenbrotbaum, African Baobab, Afrikaanse Kremetart, Afrikanischer Baobab, Albero Bottiglia, Albero di Mille Anni, Apebroodboom, Apenbroodboom, Arbre a Palabre, Arbre Bouteille, Arbre de Mille Ans, Bao Bap Chau Phi, Baob, Baoba, Baobab Africain, Baobaba, Baobabu, Baobab Afrykanski, Baobab Agaci, Baobab del Africa, Baobab Africain, Baobab Africano, Baobab de Mahajanga, Baobab de Mozambique, Baobab Fruit, Baobab Milk, Baobab of Mahajanga, Baobab Prstnaty, Baobab Seed, Baobab Seed Oil, Baobab Tree, Baobab Wlasciwy, Baobab Yemisi, Baovola, Bawbab, Boab, Boaboa, Boringy, Bottle Tree, Boy, Bozobe, Calebassier du Senegal, Cream-Tartar Tree, Dead Rat Tree, Dton Baobab, Ethiopian Sour Bread, Fruit de Baobab, Graine de Baobab, Gros Mapou, Harilik Ahvileivapuu, Hou Mian Bao Shu, Huile de Graines de Baobab, Imbondeiro, Judas Fruit, Judas Fruit Trees, Kremetart, Kremetartboom, Lait de Baobab, Maymun Ekmegi Agaci, Mboio, Mboy, Monkey Bread Tree, Noce d'Egitto, Pain de Singe, Rainiala, Reniala, Ringy, Sefo, Shagar El Bawbab, Shagar Khubz El Qurud, Sour Gourd, Upside-Down Tree, Vanoa, Vontana"

# Lower-case everything, then turn each ", " separator into a single space
# so that the text becomes one long run of space-separated words.
lowered = s.lower()
s2 = ' '.join(lowered.split(', '))
s2

In [None]:
# split() with no argument splits on whitespace and returns a list of words.
s2.split()

In [None]:
# Total number of words (duplicates included).
len(s2.split())

### A simple example of counting the occurrences of unique elements in a list

- Suppose we have a list ['apple', 'banana', 'orange', 'banana', 'banana', 'apple'].
- How can we find out that there are 3 bananas, 2 apples, and 1 orange?

We will use a data structure called a dictionary (or dict) to do it.

A dict has two parts: key and value. Here, the key is the fruit name, and the value is the number of occurrences of that fruit name.


Setup: ( 0 => zero count )
```
fruit_count = {}
fruit_count['apple'] = 0
fruit_count['banana'] = 0
fruit_count['orange'] = 0
print(fruit_count)
```

When we see an apple, we add 1 to  `fruit_count['apple']`
```
fruit_count['apple'] += 1
print(fruit_count)
```

In general, when we see a fruit, we add 1 to  `fruit_count[fruit]`
```
fruit = 'orange'
fruit_count[fruit] += 1
print(fruit_count)

fruit = 'apple'
fruit_count[fruit] += 1
print(fruit_count)
```

In [None]:
# Play with the above statements, or by uncommenting the following stuff one by one

# fruit_count = {}
# print(fruit_count)

# fruit_count['apple'] = 0
# fruit_count['banana'] = 0
# fruit_count['orange'] = 0
# print(fruit_count)

# fruit_count['apple'] += 1
# print(fruit_count)

# fruit = 'apple'
# fruit_count[fruit] += 1
# print(fruit_count)

# fruit = 'orange'
# fruit_count[fruit] += 1
# print(fruit_count)


In [None]:
# Count how many times each fruit appears in the list.
fruit_cnt = {}
for fruit in ['apple', 'banana', 'pear', 'banana', 'banana', 'apple']:
    # get() returns 0 the first time a fruit is seen, so no separate setup step is needed.
    fruit_cnt[fruit] = fruit_cnt.get(fruit, 0) + 1

fruit_cnt

### Back to our word counting problem

In [None]:
# Same counting idea as with the fruits, applied to every word in s2.
word_freq = {}

for w in s2.split():
    # A word not yet in the dict starts from 0.
    word_freq[w] = word_freq.get(w, 0) + 1

word_freq

### The above printout is sorted alphabetically by word

What if we want to sort word_freq by the freq instead of the word?

In [None]:
# Use the small fruit dict again to explore how a dict can be sorted.
fruit_cnt

In [None]:
# The keys only (the fruit names).
fruit_cnt.keys()

In [None]:
# The values only (the counts).
fruit_cnt.values()

In [None]:
# The (key, value) pairs.
fruit_cnt.items()

In [None]:
# Turn the pairs into a plain list so that they can be indexed.
list(fruit_cnt.items())

In [None]:
# Take the first (key, value) pair; it is a 2-element tuple.
item = list(fruit_cnt.items())[0]
item

In [None]:
# Position 0 of the pair is the key (the fruit name).
item[0]

In [None]:
# Position 1 of the pair is the value (the count).
item[1]

In [None]:
# With no key given, the tuples are compared element by element, so the fruit name decides the order.
sorted(fruit_cnt.items())

In [None]:
# key= tells sorted() what to compare: here the count (position 1 of each pair), smallest first.
sorted(fruit_cnt.items(), key=lambda pair: pair[1])

In [None]:
# The same sort by count, with reverse=True to put the largest count first.
sorted(fruit_cnt.items(), reverse=True, key=lambda pair: pair[1])

### Another example of using the "key" parameter in sorted( )

Sort a list ['apple', 'pear', 'banana'] by the size of the name (how many letters in a name)
```
['apple', 'pear', 'banana']
=> 
['pear', 'apple', 'banana']
```

In [None]:
names = ['apple', 'pear', 'banana']
# Exercise: replace ?? with an expression that gives the length of each name, then uncomment.
# sorted(names, key=lambda name: ??)

### Now back to sorting our word_freq

In [None]:
# List the (word, count) pairs with the most frequent words first.
sorted_word_freq = list(word_freq.items())
sorted_word_freq.sort(key=lambda pair: pair[1], reverse=True)
sorted_word_freq

In [None]:
from IPython.display import Image
# Display an image inline by URL (an Adansonia photo hosted on Wikimedia Commons).
Image(url='https://upload.wikimedia.org/wikipedia/commons/thumb/a/a8/Adansonia_grandidieri04.jpg/180px-Adansonia_grandidieri04.jpg')

### Learn a new Pandas display setting:

pd.options.display.width

In [None]:
# Load the "unsafe by mouth" table and print (plain text, not rich display) its shape and first rows.
df_unsafe = pd.read_csv('unsafe_by_mouth.csv')
print(df_unsafe.shape)
print(df_unsafe.head(3))

In [None]:
# Compare the current print width setting with the length of one printed row.
print(pd.options.display.width)
len('0    aconite  https://www.webmd.com/vitamins/ai/ingredientmono-609/aconite')

In [None]:
# Raise the print width so wide rows are less likely to wrap, then reload and print again.
pd.set_option('display.width', 500)
df_unsafe = pd.read_csv('unsafe_by_mouth.csv')
print(df_unsafe.shape)
print(df_unsafe.head(4))

### Merge df_other_names and df_unsafe

In [None]:
# With no `on=` given, merge joins on the columns the two frames have in common
# and keeps only the rows that match in both (inner join).
df = df_other_names.merge(df_unsafe)
print(df.shape)
df.head(2)

In [None]:
# The three merged rows with the most other names.
df.sort_values(by='num_names', ascending=False).head(3)

### Add the full "side effects" paragraph that we extracted before

The side effects will be shown in the HTML page for easy browsing.

In [None]:
# Load the side-effects table, then show its shape and first rows.
df_side_effects = pd.read_csv('ingredient_side_effects.csv')
print(df_side_effects.shape)
df_side_effects.head(3)

In [None]:
# Attach the side-effects text to the merged table (joined on the shared columns).
df = df.merge(df_side_effects)
df.head(1)

### Need more processing for special HTML entities

For example, in the original display
```
1. "&gt;" is ">"
2. "&egrave;" is "è" (which will be converted to letter 'e' after removing accent)
```

After HTML rendering:

1. "&gt;" is ">"
2. "&egrave;" is "è"


In [None]:
import html

# html.unescape turns HTML entities such as "&egrave;" back into the characters they stand for.
s = html.unescape('Bulbif&egrave;re')
print(s)

In [None]:
import unidecode

# unidecode replaces accented letters with plain ASCII ones (the "è" in s becomes "e").
s = unidecode.unidecode(s)
print(s)

In [None]:
# Clean every other_names string: decode the HTML entities first, then strip the accents.
df['other_names'] = df['other_names'].apply(
    lambda names: unidecode.unidecode(html.unescape(names))
)
# Spot-check one cleaned row.
df['other_names'].iloc[28]

### Generate an HTML page for easy browsing and exploration

A third way to make a string: a multi-line string with 
"""
your 
multi-line
str 
here
"""

In [None]:
import html
import re  # only needed if the optional whitespace clean-up inside the loop is enabled

# Page skeleton: charset declaration, inline CSS, and the opening <table> tag.
# The charset matches the UTF-8 encoding used when the file is written below.
page_head = """
<meta charset="utf-8">
<style>
body {width: 960px; margin:auto; margin-top:10px; font-family:arial}
a {text-decoration: none; font-size:120%; white-space:nowrap}
table {border-collapse: collapse}
td {border-right: 1px solid #eee}
</style>
<table cellpadding=8>
"""

# Build one <tr> per ingredient; enumerate supplies the 1-based row number.
rows = []
for no, (ingredient, other_names, href, side_effects) in enumerate(
        zip(df.ingredient, df.other_names, df.href, df.side_effects), start=1):
    # Optional: remove stray whitespace before commas and periods.
    # side_effects = re.sub(r'\s+(,|\.)', r'\1', side_effects)

    # other_names was HTML-unescaped in an earlier cell, so it may now hold raw
    # "<", ">" or "&"; escape it again before putting it into the page.
    other_names = html.escape(other_names, quote=False)

    rows.append(f"""<tr valign=top>
    <td>{no}
    <td align=right><a target=_blank href="{href}">{ingredient}</a>
    <td>{side_effects}
    <td>{other_names}
    </tr>""")

# Join the rows once and close the table that page_head opened.
out = page_head + ''.join(rows) + '\n</table>\n'

html_outfile = '/tmp/a.html'
# The with-block closes the file, so the page is fully written to disk
# before anyone opens it in a browser.
with open(html_outfile, 'w', encoding='utf-8') as fh:
    fh.write(out)
html_outfile

## Learn Regular Expressions (optional)

check https://developers.google.com/edu/python/regular-expressions
for more on Regular Expressions

In [None]:
# A sentence with messy spacing before commas and periods.
s = "Aga is UNSAFE when taken by mouth . It sleepiness , confusion, dizziness  , delirium    , and death   ."
# str.replace only fixes the exact text " ," (one space + comma); any extra spaces before it remain.
s.replace(' ,', ',')

In [None]:
import re  # re: Regular Expression

# " +" matches one or more spaces, so any run of spaces before a comma is removed.
re.sub(r' +,', ',', s)

### Meta-characters: use \ to remove their special meaning
```

. ^ $ * + ? { [ ] \ | ( ) 

```

Their special meaning
```
. => match any character (except new line)
  e.g., "a.c" matches "abc", "a-c" and "a c"

^ => match the beginning

* => repeat the previous character or a block any number of times (that is, zero or more times)

+ => repeat the previous character or a block at least once

? => make the previous character or a block optional (match it zero or one time)

\ => escape the special meaning

| => or

( ) => define a block, or something you want to extract

[ ] => match any one of the characters listed inside the brackets, e.g., [.,] matches "." or ","

```

In [None]:
# Deliberately wrong: an unescaped "." matches ANY character, so each run of spaces
# plus whatever character follows it is replaced by "." and the sentence is mangled.
re.sub(r' +.', '.', s)

In [None]:
# "\." matches a literal period only, so just the spaces before periods are removed.
re.sub(r' +\.', '.', s)

In [None]:
# Handle "." and "," in one pattern: the group ( ) captures whichever punctuation
# mark follows the spaces, and \1 in the replacement puts that same mark back.
# (A literal '.' as the replacement would also turn every " ," into ".".)
re.sub(r' +(\.|,)', r'\1', s)