# I. Simple Text to Emoji

In [1]:
import pandas as pd
import numpy as np
import spacy 
from helper_functions import *

In [2]:
# Load spaCy's large English pipeline; the model package must be installed first
# (see the download command in the next cell).
nlp = spacy.load('en_core_web_lg')

In [3]:
# One-time setup: uncomment and run to install the model loaded above.
#!python -m spacy download en_core_web_lg

### A. Reading a text document through Spacy

In [3]:
# Read the sample page inside a context manager so the file handle is closed
# deterministically. The encoding is explicit because the text contains
# typographic apostrophes (assumes the file is saved as UTF-8).
with open("Books/aladdin_page.txt", encoding="utf-8") as book_file:
    aladdin = book_file.read()

# Run the spaCy pipeline over the raw text; `file` is the processed document
# that the following cells iterate over.
file = nlp(aladdin)

In [4]:
aladdin

'THE STORY OF ALADDIN AND HIS MAGICAL LAMP\nThere once lived, in one of the large and rich cities of China, a tailor, named Mustapha. He was very poor. He could hardly, by his daily labor, maintain himself and his family, which consisted only of his wife and a son.\n\nHis son, who was called Aladdin, was a very careless and idle fellow. He was disobedient to his father and mother, and would go out early in the morning and stay out all day, playing in the streets and public places with idle children of his own age.\n\nWhen he was old enough to learn a trade, his father took him into his own shop, and taught him how to use his needle; but all his father’s endeavors to keep him to his work were vain, for no sooner was his back turned than he was gone for that day. Mustapha chastised him; but Aladdin was incorrigible, and his father, to his great grief, was forced to abandon him to his idleness, and was so much troubled about him that he fell sick and died in a few months.\n\n\n\nAladdin, 

In [5]:
file

THE STORY OF ALADDIN AND HIS MAGICAL LAMP
There once lived, in one of the large and rich cities of China, a tailor, named Mustapha. He was very poor. He could hardly, by his daily labor, maintain himself and his family, which consisted only of his wife and a son.

His son, who was called Aladdin, was a very careless and idle fellow. He was disobedient to his father and mother, and would go out early in the morning and stay out all day, playing in the streets and public places with idle children of his own age.

When he was old enough to learn a trade, his father took him into his own shop, and taught him how to use his needle; but all his father’s endeavors to keep him to his work were vain, for no sooner was his back turned than he was gone for that day. Mustapha chastised him; but Aladdin was incorrigible, and his father, to his great grief, was forced to abandon him to his idleness, and was so much troubled about him that he fell sick and died in a few months.



Aladdin, who was no

### B. Sentence Tokenization

In [6]:
# Print every sentence spaCy segmented from the document, prefixed with its index.
for num, sentence in enumerate(file.sents):
    print(f'{num}:', sentence)

0: THE STORY OF ALADDIN AND HIS MAGICAL LAMP

1: There once lived, in one of the large and rich cities of China, a tailor, named Mustapha.
2: He was very poor.
3: He could hardly, by his daily labor, maintain himself and his family, which consisted only of his wife and a son.


4: His son, who was called Aladdin, was a very careless and idle fellow.
5: He was disobedient to his father and mother, and would go out early in the morning and stay out all day, playing in the streets and public places with idle children of his own age.


6: When he was old enough to learn a trade, his father took him into his own shop, and taught him how to use his needle; but all his father’s endeavors to keep him to his work were vain, for no sooner was his back turned than he was gone for that day.
7: Mustapha chastised him; but Aladdin was incorrigible, and his father, to his great grief, was forced to abandon him to his idleness, and was so much troubled about him that he fell sick and died in a few mon

### C. Keep only PROPN and NOUN per sentence

In [7]:
# List the nouns and proper nouns of the first five sentences (indices 0-4).
for i, sentence in enumerate(file.sents):
    print()
    # ANSI escape codes switch bold on and off around the header.
    print("\033[1m" + f'Sentence {i} :' + "\033[0m")
    for token in sentence:
        # Keep only tokens whose coarse part-of-speech tag is PROPN or NOUN.
        if token.pos_ in {'PROPN', 'NOUN'}:
            print(token)

    # Stop once sentence 4 has been printed.
    if i > 3:
        break


[1mSentence 0 :[0m
STORY
ALADDIN
LAMP

[1mSentence 1 :[0m
cities
China
tailor
Mustapha

[1mSentence 2 :[0m

[1mSentence 3 :[0m
labor
family
wife
son

[1mSentence 4 :[0m
son
Aladdin
fellow


### D. Loading the emoji_df and quick analysis

In [8]:
# Load the emoji reference table (columns: emoji, name, group, sub_group, codepoints).
# The path is relative, so the notebook must run from its own directory.
emoji_df = pd.read_csv("Saved_Variables/emoji_df.csv")

In [9]:
# Peek at five random rows; the fixed seed keeps the preview identical
# across "Restart & Run All" executions.
emoji_df.sample(5, random_state=42)

Unnamed: 0,emoji,name,group,sub_group,codepoints
3721,✖,multiply,Symbols,math,2716
549,👱🏿‍♀️,"woman: dark skin tone, blond hair",People & Body,person,1F471 1F3FF 200D 2640 FE0F
3531,⛓,chains,Objects,tool,26D3
771,🧏🏼‍♂️,deaf man: medium-light skin tone,People & Body,person-gesture,1F9CF 1F3FC 200D 2642 FE0F
2113,🏄🏽,person surfing: medium skin tone,People & Body,person-sport,1F3C4 1F3FD


In [10]:
emoji_df.shape

(4159, 5)

In [11]:
# Count the rows whose name already appeared earlier in the table
# (duplicated() flags every occurrence after the first one).
emoji_df.loc[emoji_df.duplicated(subset='name')].shape

(864, 5)

In [12]:
# Keep the first row for each distinct name; emoji_df itself is not modified.
emoji_no_dup = emoji_df.drop_duplicates(subset=['name'], keep='first')

In [13]:
emoji_no_dup.shape

(3295, 5)

In [14]:
# Example: all variants whose name starts with 'raised back'
# (str.match anchors the pattern at the beginning of the string).
emoji_no_dup[emoji_no_dup['name'].str.match('raised back')]

Unnamed: 0,emoji,name,group,sub_group,codepoints
168,🤚,raised back of hand,People & Body,hand-fingers-open,1F91A
169,🤚🏻,raised back of hand: light skin tone,People & Body,hand-fingers-open,1F91A 1F3FB
170,🤚🏼,raised back of hand: medium-light skin tone,People & Body,hand-fingers-open,1F91A 1F3FC
171,🤚🏽,raised back of hand: medium skin tone,People & Body,hand-fingers-open,1F91A 1F3FD
172,🤚🏾,raised back of hand: medium-dark skin tone,People & Body,hand-fingers-open,1F91A 1F3FE
173,🤚🏿,raised back of hand: dark skin tone,People & Body,hand-fingers-open,1F91A 1F3FF


> Many emoji names include one of the following skin-tone modifiers:
> - light skin tone
> - medium-light skin tone
> - medium skin tone
> - medium-dark skin tone
> - dark skin tone

In [15]:
# Original plan: keep the light and dark skin tones.
# Count the names mentioning each. NOTE: these are substring searches, so the
# 'light' count also includes the 'medium-light' names and the 'dark' count
# also includes the 'medium-dark' names.
print(emoji_no_dup[emoji_no_dup.name.str.contains('light skin tone')].shape)
print(emoji_no_dup[emoji_no_dup.name.str.contains('dark skin tone')].shape)

(620, 5)
(620, 5)


In [16]:
# Drop the 'medium-light skin tone' variants: build the boolean mask once,
# report how many rows it selects, then keep everything else.
is_medium_light = emoji_no_dup['name'].str.contains('medium-light skin tone')
medium_light = emoji_no_dup[is_medium_light]
print(medium_light.shape)
emoji_condensed = emoji_no_dup[~is_medium_light]

(314, 5)


In [17]:
# Remove every name containing 'light skin tone'. This is a substring match,
# so it also covers the 'medium-light' variants (already removed in the
# previous cell).
# NOTE(review): the original comment said "remove light and dark skin tones",
# but no 'dark skin tone' filter is applied here — confirm the intent.
emoji_condensed = emoji_condensed[~emoji_condensed.name.str.contains('light skin tone')]
print(emoji_condensed.shape)
# This second filter changes nothing: no 'medium-light' names are left at this
# point, which is why both printed shapes are identical.
emoji_condensed = emoji_condensed[~emoji_condensed.name.str.contains('medium-light skin tone')]
print(emoji_condensed.shape)

(2675, 5)
(2675, 5)


In [18]:
# Drop the 'medium skin tone' variants. The count is taken on the
# de-duplicated table; the filter is applied to the running selection.
medium = emoji_no_dup.loc[emoji_no_dup['name'].str.contains('medium skin tone')]
print(medium.shape)
emoji_condensed = emoji_condensed.loc[
    ~emoji_condensed['name'].str.contains('medium skin tone')
]
print(emoji_condensed.shape)

(314, 5)
(2377, 5)


In [19]:
# Drop the 'medium-dark skin tone' variants.
medium_dark = emoji_no_dup[emoji_no_dup.name.str.contains('medium-dark skin tone')]
# A bare `medium_dark.shape` in the middle of a cell is never displayed, so
# print it explicitly, as the medium-light and medium cells do.
print(medium_dark.shape)
emoji_condensed = emoji_condensed[~emoji_condensed.name.str.contains('medium-dark skin tone')]
print(emoji_condensed.shape)

(2087, 5)


In [20]:
# Plain Python list of the emoji names that survived the filtering above.
list_names = list(emoji_condensed['name'])

In [21]:
len(list_names)

2087

### E. Finding Closest Emoji with similarity 

In [22]:
# Look only at the nouns and proper nouns of the first five sentences
# (indices 0-4) and show the emoji that word2emoji returns for each of them
# when given the condensed emoji table.
for i, sentence in enumerate(file.sents):
    print()
    # ANSI escape codes switch bold on and off around the header.
    print("\033[1m" + f'Sentence {i} :' + "\033[0m")
    print(sentence)
    for token in sentence:
        if token.pos_ in {'PROPN', 'NOUN'}:
            closest_emoji = word2emoji(token, emoji_condensed)
            print(token, " --- EMOJI --->  ", closest_emoji)

    # Stop once sentence 4 has been processed.
    if i > 3:
        break


[1mSentence 0 :[0m
THE STORY OF ALADDIN AND HIS MAGICAL LAMP




[W008] Evaluating Token.similarity based on empty vectors.



STORY  --- EMOJI --->   👦
ALADDIN  --- EMOJI --->   🧞
LAMP  --- EMOJI --->   💡

[1mSentence 1 :[0m
There once lived, in one of the large and rich cities of China, a tailor, named Mustapha.
cities  --- EMOJI --->   🚌
China  --- EMOJI --->   📀
tailor  --- EMOJI --->   🪰
Mustapha  --- EMOJI --->   ♌

[1mSentence 2 :[0m
He was very poor.

[1mSentence 3 :[0m
He could hardly, by his daily labor, maintain himself and his family, which consisted only of his wife and a son.


labor  --- EMOJI --->   🔑
family  --- EMOJI --->   👪
wife  --- EMOJI --->   👦
son  --- EMOJI --->   👦

[1mSentence 4 :[0m
His son, who was called Aladdin, was a very careless and idle fellow.
son  --- EMOJI --->   👦
Aladdin  --- EMOJI --->   🧞
fellow  --- EMOJI --->   👨
