In [3]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
from pathlib import Path
import re
import json

In [8]:
# Download and parse the Unicode "Full Emoji List" chart page.
url = "https://unicode.org/emoji/charts/full-emoji-list.html"
# Use the response as a context manager so the HTTP connection is closed
# as soon as the body has been read (the bare urlopen() call leaked it).
with urlopen(url) as page:
    html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")

# Shortcodes were fetched from https://unpkg.com/emojilib@2.4.0/emojis.json,
# which is published by the Emojilib package: https://github.com/muan/emojilib
shortcodes_df = pd.read_json('../emoji_shortcodes.json', orient='index')

# NOTE: despite its name, this holds the full path of the output CSV *file*
# (current working directory + "/emojis.csv"), not a directory.
OUTPUT_DIR = str(Path.cwd()) + "/emojis.csv"

# Every <tr> element found in the parsed page.
table_rows = soup.find_all('tr')

In [9]:
def combine_codes(code):
    r"""Format a single code point token as an escaped ``\UXXXXXXXX`` string.

    Only the text after the last "+" in ``code`` is kept (the whole string
    when there is no "+"). That text is left-padded with "0" up to eight
    characters and prefixed with a literal backslash and "U". Text that is
    already eight characters or longer is not truncated.

    Example: "U+1F600" -> the 10-character string \U0001F600.
    """
    hex_digits = code.rsplit("+", 1)[-1]
    return "\\U" + hex_digits.rjust(8, "0")

In [18]:
# Column-oriented accumulator: one list per output column. The three lists
# must stay index-aligned because they are later turned into a DataFrame.
data = {"code": [],
        "chars": [],
        "name": []}

# Walk every <tr>; rows without "code"/"chars"/"name" cells contribute nothing.
for row in table_rows:
    code_cells = row.find_all(class_="code")
    char_cells = row.find_all(class_="chars")
    name_cells = row.find_all(class_="name")

    # Consume the cells together so one emoji always yields exactly one entry
    # in each column. (Previously the "chars" column was filled from the code
    # cells by mistake, and three independent loops could drift out of sync.)
    for code_cell, char_cell, name_cell in zip(code_cells, char_cells, name_cells):
        # A cell may hold several space-separated code points, e.g.
        # "U+1F468 U+200D U+1F469" -> "\U0001F468\U0000200D\U0001F469".
        emoji_code = "".join(
            combine_codes(part) for part in str(code_cell.text).split(" ")
        )

        emoji_name = str(name_cell.text)
        # Some names start with a "⊛" marker; drop the marker and the single
        # character that follows it (presumably a space). startswith() is also
        # safe on an empty name, where indexing [0] would raise IndexError.
        if emoji_name.startswith("⊛"):
            emoji_name = emoji_name[2:]

        # Names are otherwise kept verbatim (spaces and punctuation included);
        # only the case is normalised.
        data["code"].append(emoji_code)
        data["chars"].append(char_cell.text)
        data["name"].append(emoji_name.lower())

In [19]:
# Build the table from the collected columns ("code", "chars", "name")
# and preview its first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,code,chars,name
0,\U0001F600,,grinning face
1,\U0001F603,,grinning face with big eyes
2,\U0001F604,,grinning face with smiling eyes
3,\U0001F601,,beaming face with smiling eyes
4,\U0001F606,,grinning squinting face
