### This notebook outlines the steps involved to scrape the list of emojis and their associated terms/ descriptions from the emoji cheat sheet into a csv dataset.

In [2]:
from urllib.request import Request, urlopen
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import numpy as np
import time

In [3]:
site = "https://www.webfx.com/tools/emoji-cheat-sheet/"
hdr = {'User-Agent': 'Mozilla/5.0'}
bookpage = requests.get(site)
soup = BeautifulSoup(bookpage.text, "html.parser")
#print(soup.prettify())

In [4]:
soup.find_all('span', class_ ='name')

[<span class="name" data-alternative-name="classy, bow, face, formal, fashion, suit, magic, circus">bowtie</span>,
 <span class="name" data-alternative-name="happy, cheerful, face, joy, funny, haha, laugh, like">smile</span>,
 <span class="name" data-alternative-name="smile, guy, happy, cheerful, smiling">simple_smile</span>,
 <span class="name" data-alternative-name="lol, funny, happy, joy, satisfied, haha, face, glad">laughing</span>,
 <span class="name" data-alternative-name="face, smile, happy, flushed, crush, embarrassed, shy, joy">blush</span>,
 <span class="name" data-alternative-name="funny, face, happy, joy, haha">smiley</span>,
 <span class="name" data-alternative-name="face, blush, massage, happiness">relaxed</span>,
 <span class="name" data-alternative-name="face, smile, mean, prank, smug, sarcasm">smirk</span>,
 <span class="name" data-alternative-name="heart, love, face, like, affection, valentines, infatuation, crush">heart_eyes</span>,
 <span class="name" data-alternati

In [5]:
names = soup.find_all('span', class_ ='name')

In [6]:
# total number of emojis in the cheat sheet
len(names)

877

In [7]:
names[0].get_text()

'bowtie'

In [30]:
names[293]

<span class="name">waxing_gibbous_moon</span>

In [25]:
names[876].get_text()

'shipit'

In [8]:
# https://pypi.org/project/emoji/
import emoji

In [9]:
#syntax to print emojis
print(emoji.emojize('Python is :thumbs_up:'))

Python is 👍


In [10]:
text = "Python is "+ ":"+ "thumbs_up"+ ":"

In [11]:
text

'Python is :thumbs_up:'

In [12]:
print(emoji.emojize(text))

Python is 👍


In [13]:
# terms associated with the first emoji in the cheat sheet
soup.find('span', class_='name')['data-alternative-name']

'classy, bow, face, formal, fashion, suit, magic, circus'

In [14]:
desc = soup.find('span', class_='name')['data-alternative-name']

In [15]:
#split the string of words into a list of words
words = [x.strip() for x in desc.split(',')]

In [16]:
words

['classy', 'bow', 'face', 'formal', 'fashion', 'suit', 'magic', 'circus']

In [17]:
len(words)

8

In [18]:
# terms associated with the second emoji in the cheat sheet
soup.find_all('span', class_='name')[1]['data-alternative-name']

'happy, cheerful, face, joy, funny, haha, laugh, like'

In [40]:
#running a loop to scrape the list of emojis and their associated terms/ descriptions into a dataset
emojis = []
k = 0
startTime = time.time()
for j in range(0,len(names)):
    try:
        desc = soup.find_all('span', class_='name')[j]['data-alternative-name']
        words = [x.strip() for x in desc.split(',')]
    except:
        words = ['N.A']
    for i in range(0,len(words)):
        uid = j
        index = k
        name = names[j].get_text()
        wd = words[i]
        k = k+1
        emojis.append((uid,index,name,wd))
    time.sleep(0.1)
    #print(j)
endTime = time.time()
elapsedTime = endTime - startTime
print("Elapsed Time (in seconds) = %s" % elapsedTime)

Elapsed Time (in seconds) = 115.70929074287415


In [34]:
len(emojis)

3673

In [41]:
df = pd.DataFrame(np.array(emojis))

In [42]:
df.columns = ['id','index','name','desc']

In [43]:
df.head()

Unnamed: 0,id,index,name,desc
0,0,0,bowtie,classy
1,0,1,bowtie,bow
2,0,2,bowtie,face
3,0,3,bowtie,formal
4,0,4,bowtie,fashion


In [51]:
df.describe()

Unnamed: 0,id,index,name,desc
count,3673,3673,3673,3673
unique,877,3673,877,1214
top,475,2234,mortar_board,nature
freq,11,1,11,101


In [87]:
#saving the data to a csv file
df.to_csv('emojis.csv')

### Generating a random emoji

In [39]:
import random
random.seed(123)

In [83]:
#generate a random number between 0 and 876
rn = int(random.uniform(0,876))

In [84]:
#read in the scraped dataset hosted on github
url = 'https://raw.githubusercontent.com/hxchua/datadoubleconfirm/master/datasets/emojis.csv'
df = pd.read_csv(url, error_bad_lines=False)
a = list(df[df['id']==rn]['name'].drop_duplicates())
##a = list(df[df['id']==str(rn)]['name'].drop_duplicates())

In [85]:
a[0]

'radio'

In [86]:
text = "Python is "+ ":"+ a[0] + ":"
text
print(emoji.emojize((text), use_aliases=True))

Python is 📻


In [77]:
print(emoji.emojize('Python is :astonished:', use_aliases=True))

Python is 😲


In [75]:
print(emoji.emojize('Python is :thumbsup:', use_aliases=True))

Python is 👍


In [89]:
print(emoji.emojize('Python is :cool:', use_aliases=True))

Python is 🆒


In [96]:
emoji.emojize((":"+a[0]+":"), use_aliases=True)

'📻'

### Exploratory data analysis

In [99]:
# most common description word in descending order
df['desc'].value_counts().head(10)

nature            101
animal             82
blue-square        68
food               55
face               55
vehicle            41
transportation     36
japanese           29
like               29
time               28
Name: desc, dtype: int64

In [104]:
# list of emojis associated with food
list(df[df['desc']=='food']['name'])

['corn',
 'ice_cream',
 'blowfish',
 'custard',
 'grapes',
 'chestnut',
 'strawberry',
 'icecream',
 'doughnut',
 'watermelon',
 'peach',
 'lemon',
 'cherries',
 'pineapple',
 'tangerine',
 'banana',
 'bread',
 'lollipop',
 'sweet_potato',
 'pear',
 'candy',
 'chocolate_bar',
 'eggplant',
 'honey_pot',
 'cookie',
 'cake',
 'apple',
 'green_apple',
 'shaved_ice',
 'tomato',
 'bread',
 'melon',
 'rice_cracker',
 'ramen',
 'sushi',
 'fish_cake',
 'hamburger',
 'bento',
 'fries',
 'meat_on_bone',
 'fried_shrimp',
 'rice_ball',
 'curry',
 'rice',
 'fish',
 'pizza',
 'dango',
 'stew',
 'baby_bottle',
 'egg',
 'fishing_pole_and_fish',
 'oden',
 'egg',
 'poultry_leg',
 'spaghetti']

In [108]:
# print list of emojis associated with food
food = list(df[df['desc']=='food']['name'])
for i in range(0,len(food)):
    print(emoji.emojize((":"+food[i]+":"), use_aliases=True),end="")

🌽🍨🐡🍮🍇🌰🍓🍦🍩🍉🍑🍋🍒🍍🍊🍌🍞🍭🍠🍐🍬🍫🍆🍯🍪🍰🍎🍏🍧🍅🍞🍈🍘🍜🍣🍥🍔🍱🍟🍖🍤🍙🍛🍚🐟🍕🍡🍲🍼🍳🎣🍢🍳🍗🍝

In [110]:
# print list of emojis associated with girl
girl = list(df[df['desc']=='girl']['name'])
for i in range(0,len(girl)):
    print(emoji.emojize((":"+girl[i]+":"), use_aliases=True),end="")

👸💇🙆👵💆💄💁🙋👙🙎💃🙅🎀👶🙍

In [111]:
# print list of emojis associated with boy
boy = list(df[df['desc']=='boy']['name'])
for i in range(0,len(boy)):
    print(emoji.emojize((":"+boy[i]+":"), use_aliases=True),end="")

👶🙇👱👲

In [115]:
# list of emojis associated with woman
df[df['desc']=='woman']

Unnamed: 0,id,index,name,desc
672,138,672,haircut,woman
682,141,682,girl,woman
626,128,626,no_good,woman
2147,487,2147,lipstick,woman
630,129,630,information_desk_person,woman
2180,497,2180,bikini,woman
643,133,643,person_frowning,woman
668,137,668,massage,woman
612,125,612,dancer,woman
640,132,640,person_with_pouting_face,woman


In [114]:
# list of emojis associated with woman
df[df['desc']=='man']

Unnamed: 0,id,index,name,desc
1472,321,1472,santa,man
710,147,710,person_with_blond_hair,man
648,134,648,bow,man
728,151,728,cop,man
565,119,565,runner,man
677,140,677,boy,man


In [116]:
girl = list(df[df['desc']=='woman']['name'])
for i in range(0,len(girl)):
    print(emoji.emojize((":"+girl[i]+":"), use_aliases=True),end="")

💇👧🙅💄💁👙🙍💆💃🙎🙋👸🚺

In [117]:
boy = list(df[df['desc']=='man']['name'])
for i in range(0,len(boy)):
    print(emoji.emojize((":"+boy[i]+":"), use_aliases=True),end="")

🎅👱🙇👮🏃👦

In [121]:
# print list of emojis associated with a list of terms
desclist = ['boy','girl','man','woman','men','women']
for i in desclist:
    cc = list(df[df['desc']==i]['name'])
    for j in range(0,len(cc)):
        print(emoji.emojize((":"+cc[j]+":"), use_aliases=True),end="")
    print()

👶🙇👱👲
👸💇🙆👵💆💄💁🙋👙🙎💃🙅🎀👶🙍
🎅👱🙇👮🏃👦
💇👧🙅💄💁👙🙍💆💃🙎🙋👸🚺
👴
👵👘👯🙆
