## Imports

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
from pathlib import Path
import re
import json
import numpy as np
from pathlib import Path

## Variables

In [3]:
url = "https://unicode.org/emoji/charts/full-emoji-list.html"
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")

# shortcodes fetched from https://unpkg.com/emojilib@2.4.0/emojis.json
# from the package Emojilib found here https://github.com/muan/emojilib
shortcodes_df = pd.read_json('../emoji_shortcodes.json', orient='index')

CSV_OUTPUT_DIR = str(Path.cwd()) + "/emojis.csv"
PACKAGE_OUTPUT_DIR = str(Path.cwd()) + "/espanso_package/package.yml"

table_rows = soup.find_all('tr')

## Functions

In [6]:
def combine_codes(code):
    # convert codes to string and convert to correct format
    code_list = code.split("+")
    emoji_code = code_list[-1]
    length = len(emoji_code)
    diff = int(8 - length)
    zeros = str("0" * diff)
    return "\\U" + zeros + emoji_code

In [None]:
def shortcode_cleanup(shortcode):
    shortcode = shortcode.replace("-", " ") # remove - (e.g. see-no-evil_monkey)
    shortcode = shortcode.replace(".", "") # remove . (e.g. mrs._claus)
    shortcode = shortcode.replace(":", "") # replace : (e.g. woman: blond hair)

    return shortcode

In [4]:
def convert_to_spaces(df, convert):
    for index in df.index:
        if convert == True:
            df.loc[index, "shortcode"] = df.loc[index, "shortcode"].replace("_", " ")
        elif convert == False:
            df.loc[index, "shortcode"] = df.loc[index, "shortcode"].replace(" ", "_")

In [2]:
def generate_yaml(df):
    # define yml elements
    matches = "matches:\n"
    trigger = "  - trigger: "
    replace = "\n    replace: "

    body = ""

    # iterate through df and add to output string
    for index, row in df.iterrows():
        name = row["shortcode"]
        code = row["unicode"]

        name = "\":" + name + ":\""
        code = "\"" + code + "\""

        emoji_item = trigger + name + replace + code + "\n"
        body = body + emoji_item

    yml_output = matches + body

    # save to yml file
    with open(PACKAGE_OUTPUT_DIR, "w") as f:
        f.write(yml_output)

## Fetching
Scrape emojis from unicode site and create database

In [7]:
# create dictionary to store values
data = {"unicode": [],
        "chars": [],
        "fullname": []}

# iterate through rows in <tr>
for rows in table_rows:
    # get lists of all the <td> with class of "code" or "name"
    emoji_codes = rows.find_all(class_="code")
    emoji_names = rows.find_all(class_="name")
    emoji_chars = rows.find_all(class_="chars")

    for code in emoji_codes:
        unicode = str(code.text)
        code_list = unicode.split(" ")
        emoji_code = "".join(list(map(combine_codes, code_list)))

        data["unicode"].append(emoji_code)

    for char in emoji_codes:
        data["chars"].append(char.text)

    for name in emoji_names:
        # convert names to strings, remove ⊛ symbols and reformat certain emojis (e.g. family and flags)
        emoji_name = str(name.text)

        if emoji_name[0] == "⊛":
            emoji_name = emoji_name[2:]

        data["fullname"].append(emoji_name.lower())

## Parsing
Process shortcodes from json and convert emojis to unicode characters so that they can be matched to the main dataframe.

In [8]:
shortcodes_dict = {"unicode": [],
                    "shortcode": []}

for row in shortcodes_df.index:
    emoji = shortcodes_df.loc[row, "char"]
    unicode_string = emoji.encode('unicode-escape').decode('ASCII').upper()

    shortcodes_dict["unicode"].append(unicode_string)
    shortcodes_dict["shortcode"].append(row)

shortcode_lookup = pd.DataFrame(data=shortcodes_dict)
shortcode_lookup.head()

Unnamed: 0,unicode,shortcode
0,\U0001F600,grinning
1,\U0001F62C,grimacing
2,\U0001F601,grin
3,\U0001F602,joy
4,\U0001F923,rofl


In [9]:
df = pd.DataFrame(data=data)
df.head()

Unnamed: 0,unicode,chars,fullname
0,\U0001F600,U+1F600,grinning face
1,\U0001F603,U+1F603,grinning face with big eyes
2,\U0001F604,U+1F604,grinning face with smiling eyes
3,\U0001F601,U+1F601,beaming face with smiling eyes
4,\U0001F606,U+1F606,grinning squinting face


## Match Shortcodes
Merge the shortcode lookup dataframe with the complete emoji dataframe by unicode code. Any non-matching emojis without shortcodes will be left NaN and then replaced with the fullname.

In [10]:
combined_df = df.merge(shortcode_lookup, on="unicode", how="left")
combined_df["shortcode"].fillna(combined_df["fullname"], inplace=True)

combined_df.head()

convert_to_spaces(combined_df)

grinning
smiley
smile
grin
laughing
sweat_smile
rofl
joy
slightly_smiling_face
upside_down_face
melting face
wink
blush
innocent
smiling_face_with_three_hearts
heart_eyes
star_struck
kissing_heart
kissing
smiling face
kissing_closed_eyes
kissing_smiling_eyes
smiling face with tear
yum
stuck_out_tongue
stuck_out_tongue_winking_eye
zany
stuck_out_tongue_closed_eyes
money_mouth_face
hugs
hand_over_mouth
face with open eyes and hand over mouth
face with peeking eye
shushing
thinking
saluting face
zipper_mouth_face
raised_eyebrow
neutral_face
expressionless
no_mouth
dotted line face
face in clouds
smirk
unamused
roll_eyes
grimacing
face exhaling
lying_face
shaking face
relieved
pensive
sleepy
drooling_face
sleeping
mask
face_with_thermometer
face_with_head_bandage
nauseated_face
vomiting
sneezing_face
hot
cold
woozy
dizzy_face
face with spiral eyes
exploding_head
cowboy_hat_face
partying
disguised face
sunglasses
nerd_face
monocle
confused
face with diagonal mouth
worried
slightly_frowning_

## Generate Espanso Yaml Package

In [None]:
generate_yaml(combined_df)