# Web Scraping

## Emojipedia

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import time, sleep
from IPython.display import clear_output
from random import randint

In [2]:
base_url = "https://emojipedia.org"

In [3]:
categories = ["people", "nature", "food-drink", "activity", "travel-places", "objects", "symbols", "flags"]

In [4]:
# Return parsed web page
def parsed_page(url):
    response = requests.get(url)
    
    # Send a warning if Response code isn't 200
    #if response.status_code != 200:
    #    warn(f'Request for url : {url} has code: {response.status_code}')
    content = response.content
    parser = BeautifulSoup(content, 'html.parser')
    return parser

In [5]:
parser = parsed_page("https://emojipedia.org/grinning-face-with-big-eyes/")

In [6]:
base_code_url = "https://emojipedia.org/emoji/" # url with specific code description

# Function scaping needed information on 1 page
def one_page_scraping(url, base_code_url):    
    parser = parsed_page(url)
    # Get the symbol
    emoji_symbol = parser.select("h1 .emoji")[0].text
    # Get only the name
    emoji_name = parser.find("h1").text[1:].strip()
    # Get the description (without the last paragraph saying when the emoji was added to the library)
    emoji_description = ""
    emoji_descr_list = parser.select(".description p")
    for paragraph in emoji_descr_list[:-1]: # Selection all but the last paragraph
        #print(paragraph.text)
        emoji_description += paragraph.get_text(strip=True).replace('\xa0','')
    
    # Get the emoji_code
    parser_code = parsed_page(base_code_url + emoji_symbol)
    emoji_code_list = parser_code.select(".emoji-detail td")
    if len(emoji_code_list)<3:
        emoji_code = ""
    else:
        emoji_code = emoji_code_list[3].text
    
    
    emoji_attr = {"emoji_symbol": emoji_symbol, "emoji_name": emoji_name, "emoji_code":emoji_code, 
                   "emoji_description":emoji_description }

    return emoji_attr

In [7]:
# We'll control the rate at which our requests are sent to avoid our IP address to be banned
start_time = time()
request_num = 0

# Exploring the different categories
list_emoji_attr = [] # we'll use it to add keep the different dictionnaries for each emoji
for cat in categories:
    print("We are scraping the emojis of category : ", cat)
    clear_output(wait=True)
    
    cat_url = base_url + "/" + cat
    # get the page
    parser = parsed_page(cat_url)
   
    # Get the list of url's of individual emoji pages for current category
    href_list = parser.select(".emoji-list a")
    
    # Going through all emojis of current categorie
    for href in href_list:  
        request_num += 1 # Updating resquests counter
        
        emoji_url = href.get('href')
        emoji_full_url = base_url + emoji_url
        emoji_attr = one_page_scraping(emoji_full_url, base_code_url)
        
        # Every 20 requests do a break of a few seconds
        if request_num%20 == 0:
            sleep(randint(3,15))
            elapsed_time = time() - start_time
            print(f'Request: {request_num}; Frequency: {request_num/elapsed_time}')
            clear_output(wait=True)
            
        list_emoji_attr.append(emoji_attr)
        
# Converting list of dictionaries to a pandas DataFrame
emoji_pedia_df = pd.DataFrame(list_emoji_attr)      

Request: 1760; Frequency: 0.594117586235585


In [8]:
print(emoji_pedia_df.shape)
emoji_pedia_df.sample(50)

(1764, 4)


Unnamed: 0,emoji_symbol,emoji_name,emoji_code,emoji_description
1513,🇦🇸,🇸 Flag: American Samoa,"U+1F1E6, U+1F1F8","The flag for American Samoa, which may show as..."
939,🚜,Tractor,U+1F69C,"A tractor, may be used by a farmer or other oc..."
1410,0️⃣,️⃣ Keycap Digit Zero,"U+30, U+FE0F, U+20E3",The Keycap Digit Zero emoji is akeycap sequenc...
1670,🇳🇫,🇫 Flag: Norfolk Island,"U+1F1F3, U+1F1EB","The flag forNorfolk Island, which may show as ..."
668,🥨,Pretzel,U+1F968,The heart-shaped twist of a hard or soft golde...
865,🏕️,️ Camping,"U+1F3D5, U+FE0F","A campsite featuring atentand atree, and perha..."
461,🐺,Wolf,U+1F43A,"The face of a wolf, a howling canine that hunt..."
83,😩,Weary Face,U+1F629,"A yellow face with closed eyes, furrowed brows..."
1550,🇨🇳,🇳 Flag: China,"U+1F1E8, U+1F1F3","The flag forChina, which may show as the lette..."
406,🥼,Lab Coat,U+1F97C,A white coat worn in a science laboratory.


In [10]:
emoji_pedia_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1764 entries, 0 to 1763
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   emoji_symbol       1764 non-null   object
 1   emoji_name         1764 non-null   object
 2   emoji_code         1764 non-null   object
 3   emoji_description  1764 non-null   object
dtypes: object(4)
memory usage: 55.2+ KB


In [12]:
emoji_pedia_df.sample(5)

Unnamed: 0,emoji_symbol,emoji_name,emoji_code,emoji_description
1274,🔔,Bell,U+1F514,"A gold or yellow bell, which iscommonly used o..."
1291,🚫,Prohibited,U+1F6AB,A red circle with a diagonal line through the ...
440,⛑️,️ Rescue Worker’s Helmet,"U+26D1, U+FE0F","A construction helmet, which is worn by thecon..."
184,👴,Old Man,U+1F474,"An elderly man gray(grey) hair, and visible wr..."
933,🚖,Oncoming Taxi,U+1F696,"A yellow taxicab,iconically seen inNew York Ci..."


 ### Looking at where some data is missing

In [24]:
# The cleaning will be done on a copy of the original df
emoji_df = emoji_pedia_df.copy()

#### 1. Emoji_code

In [25]:
# Assess
emoji_df[emoji_df["emoji_code"]==""]

Unnamed: 0,emoji_symbol,emoji_name,emoji_code,emoji_description
1408,#️⃣,️⃣ Keycap Number Sign,,"A hash key,sometimes referred to as a pound ke..."


Looking on emojipedia the symbol found for this emoji = U+23, U+FE0F

In [36]:
# Code
emoji_df.iloc[1408,2] = 'U+23, U+FE0F'

In [37]:
# Test
emoji_df[emoji_df["emoji_symbol"]==""]

Unnamed: 0,emoji_symbol,emoji_name,emoji_code,emoji_description


#### 2. emoji_description

In [41]:
# Assess
emoji_df[emoji_pedia_df['emoji_description']==""]

Unnamed: 0,emoji_symbol,emoji_name,emoji_code,emoji_description
153,🧠,Brain,U+1F9E0,
154,🦷,Tooth,U+1F9B7,
411,🧣,Scarf,U+1F9E3,
412,🧤,Gloves,U+1F9E4,
414,🧦,Socks,U+1F9E6,
...,...,...,...,...
1487,🔹,Small Blue Diamond,U+1F539,
1488,🔺,Red Triangle Pointed Up,U+1F53A,
1489,🔻,Red Triangle Pointed Down,U+1F53B,
1493,🔲,Black Square Button,U+1F532,


> Looking a random set of those emojis on Emojipedia, it seems that they don't have a description.
We'll take there name as description.

In [47]:
# Code 
emoji_df['emoji_description'] = emoji_df.apply(lambda row: row.emoji_name if row.emoji_description==""
                                               else row.emoji_description, axis=1)

In [50]:
# Test
print(emoji_df[emoji_df.emoji_description==""])
emoji_df.sample(20)

Empty DataFrame
Columns: [emoji_symbol, emoji_name, emoji_code, emoji_description]
Index: []


Unnamed: 0,emoji_symbol,emoji_name,emoji_code,emoji_description
684,🥙,Stuffed Flatbread,U+1F959,"A flatbread, like pita, stuffed withlettuce,to..."
1037,💎,Gem Stone,U+1F48E,"A jewel or gem, as set into aring. Depicted in..."
1202,🗿,Moai,U+1F5FF,"Amoai, one of the famed, giant stone statues o..."
1020,🎏,Carp Streamer,U+1F38F,"Japanesekoinobori, decorative, carp-shaped win..."
1238,🕜,One-Thirty,U+1F55C,One-Thirty
565,🌼,Blossom,U+1F33C,A flower that has blossomed. Depicted as a flo...
184,👴,Old Man,U+1F474,"An elderly man gray(grey) hair, and visible wr..."
1098,💶,Euro Banknote,U+1F4B6,"A banded stack of euro banknotes, the paper cu..."
1002,🦼,Motorized Wheelchair,U+1F9BC,A wheelchair with amotor. Distinct from♿ Wheel...
11,😊,Smiling Face with Smiling Eyes,U+1F60A,"A yellow face with smiling eyes and a broad, c..."


In [52]:
# Saving the dataframe as a csv file
emoji_df.to_csv("emojipedia_df_02.csv")