# 1.Data Import and Combination

In [1]:
import pandas as pd
import gensim as gs
import nltk
import re
from nltk.corpus import stopwords
import sklearn as sk
import numpy as np
import pickle
from tqdm import tqdm
import os

## 1.1.import dataset

In [2]:
file_name={
    "emo_extraction":"calculate_emo_3",
}

In [3]:
#import data
def open_pkl_data(file_name):
    all_data=dict()
    for each_key in file_name.keys():
        name = file_name[each_key]
        pickle_file = open('DataSet\\'+name+".pkl",mode="rb")
        data = pickle.load(pickle_file)
        pickle_file.close()
        all_data[each_key]=data
    return all_data

In [4]:
emo_path="DataSet\\emo_collection.csv"
emo_df=pd.read_csv(emo_path,sep="|")
#emo_ex = open_pkl_data(file_name)["emo_extraction"]

In [5]:
emo_df

Unnamed: 0,emoji/emoticon,key words
0,😀,face grin grinning face
1,😃,face grinning face with big eyes mouth open smile
2,😄,eye face grinning face with smiling eyes mouth...
3,😁,beaming face with smiling eyes eye face grin s...
4,😆,face grinning squinting face laugh mouth satis...
...,...,...
2279,(　´Д｀)ﾉ(´･ω･`)　ﾅﾃﾞﾅﾃﾞ,"Patting, nade nade\n"
2280,(*ﾟﾉOﾟ)<ｵｵｵｵｫｫｫｫｫｫｫｰｰｰｰｰｲ!,"Calling out, ""Ooooi!""\n"
2281,( ﾟ∀ﾟ)ｱﾊﾊ八八ﾉヽﾉヽﾉヽﾉ ＼ / ＼/ ＼,Evil laugh (literally ahahaHAHA...)\n
2282,（・∀・ ）ヾ(- -；)コラコラ,"Blaming ""now now""\n"


In [6]:
meaning_values=list(emo_df["key words"])

In [7]:
meaning_values[-20:-1]

['Extreme Distaste, meant to appear as an exaggerated grimace\n',
 'Shouting\n',
 'Pretending not to notice, asleep because of boredom\n',
 'Kick\n',
 'Discombobulated\n',
 'Running\n',
 'Happy\n',
 'Happy\n',
 'Shocked\n',
 'Really angry\n',
 '"Do it"\n',
 'Angel\n',
 '"It\'s here", Kitaa!, excitement that something has appeared or happened or "I came".\n',
 'Girlish version of "It\'s here".\n',
 'Erotic stirring, haa haa\n',
 'Patting, nade nade\n',
 'Calling out, "Ooooi!"\n',
 'Evil laugh (literally ahahaHAHA...)\n',
 'Blaming "now now"\n']

# 2.Main Functions

In [8]:
def fit_transform(input_list,method):
    output_list = list()
    for text in tqdm(input_list):
        input_text = str(text)
        new_text = eval(method)(input_text)
        output_list.append(new_text)
    
    return output_list

# 3.Preprocessing

## 3.1. Lower the words

In [9]:
def lower_text(input_text):
    #lower the words
    input_text =input_text.lower()
    return input_text

In [10]:
meaning_values1 = fit_transform(meaning_values,method="lower_text")

100%|██████████████████████████████████████████████████████████████████████████| 2284/2284 [00:00<00:00, 176189.77it/s]


In [11]:
len(meaning_values1)

2284

## 3.2. Seperate the adhered words with space
 - e.g. "you?" ? and you are adhered together, which needs to be split

In [12]:
# functions for assigning emoji, emoticons and punctuations space from each other
def assign_space(input_text):
    #punctuation
    input_text = re.sub(r'[^\d\/\*\:\)\.\?\^\;?\-_\'~!\<\>\=\"#&$%\\\{\}\|\[\]ç\+ω○\@¡éı・…¡\`：）♡ӳ！“”à≧∇≦♂ş≈¬⊄─✔•×ü–₹。ó°ʖ—¶ķñ฿ĺ∑；⏸](\!|\?|\.+)[^\d\/\*\:\)\.\?\^\;?\-_\'~!\<\>\=\"#&$%\\\{\}\|\[\]ç\+ω○\@¡éı・…¡\`：）♡ӳ！“”à≧∇≦♂ş≈¬⊄─✔•×ü–₹。ó°ʖ—¶ķñ฿ĺ∑；⏸]',' \g<0> ' ,input_text)
    #replace the redundant space
    input_text = re.sub(r'[a-zA-Z]+',r' \g<0> ',input_text)
    input_text = re.sub(r'  +',r' ',input_text)
    return input_text

In [13]:
meaning_values2 = fit_transform(meaning_values1,method="assign_space")

100%|███████████████████████████████████████████████████████████████████████████| 2284/2284 [00:00<00:00, 47711.21it/s]


## 3.3. Remove punctuations

In [14]:
def remove_punc(input_text):
    new_sentence = input_text
    new_sentence = re.sub(r'[^ a-zA-Z]',"",new_sentence).strip()
    return new_sentence

In [15]:
meaning_values3 = fit_transform(meaning_values2,method="remove_punc")

100%|██████████████████████████████████████████████████████████████████████████| 2284/2284 [00:00<00:00, 103965.43it/s]


## 3.4. Correcting spell mistake
 - e.g. "nt" is actually "not", "noo" is actually "no"
 - in order to do so, here i use library called pyspellchecker

In [16]:
from spellchecker import SpellChecker
def correct_spell(input_text):
    spell = SpellChecker()
    words = input_text.split()
    new_words=list()
    for word in words:
        word = spell.correction(word)
        new_words.append(word)
    sentence = " ".join(new_words)
    return sentence

In [17]:
meaning_values4 = fit_transform(meaning_values3,method="correct_spell")

100%|██████████████████████████████████████████████████████████████████████████████| 2284/2284 [05:15<00:00,  7.23it/s]


## 3.5. Remove Stopwords made handcrafted

In [18]:
print(stopwords.words('english')) # which is not suitable here, coz e.g. in emotion analysis, not happy is opposite from happy.

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [19]:
stopwords_byhand = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 
                    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 
                    'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 
                    'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 
                    'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 
                    'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 
                    'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 
                    'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 
                    'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 
                    'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 
                    'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 
                    't', 'can', 'will', 'just','now', 'd', 'll', 'm', 'o', 're', 've', 'y']
# remove stopwords except those negative words

In [20]:
def remove_stopwords(input_text):
    words = input_text.split()
    new_words=list()
    for word in words:
        if word in stopwords_byhand:
            continue
        else:
            new_words.append(word)
    sentence = " ".join(new_words)
    return sentence

In [21]:
meaning_values5 = fit_transform(meaning_values4,method="remove_stopwords")

100%|███████████████████████████████████████████████████████████████████████████| 2284/2284 [00:00<00:00, 76239.44it/s]


## 3.6. shorthand words translation

In [22]:
cor_dict= {
    }# according to the not_cov, only some certain words which occured only several times need translation, which is unnecessary

In [23]:
def correct_words_common(input_text):
    'for correcting the words in sentences based on hand-crafted data'
    corrected_sentence = " "+input_text+" "
    for each_key in cor_dict.keys():
        if each_key in input_text:
            rule = " "+each_key+" "
            corrected_sentence = re.sub(rule," "+cor_dict[each_key]+" ",corrected_sentence)
    return corrected_sentence

In [24]:
meaning_values6 = fit_transform(meaning_values5,method="correct_words_common")

100%|██████████████████████████████████████████████████████████████████████████| 2284/2284 [00:00<00:00, 126982.19it/s]


# 5. check data format 
- for further preprocessing based on pre-trained embedding resource

## 5.1. check distinct vocabulary

In [25]:
def distinct_words(dialogs):
    vocab = dict()
    for each_dialog in tqdm(dialogs):
        text = str(each_dialog)
        words_list = text.split(sep=" ")
        for word in words_list:
            if word == '':
                continue
            try:
                vocab[word] +=1
            except KeyError:
                vocab[word] = 1
    
    return vocab

In [26]:
vocab0 = distinct_words(meaning_values)
print("the length of distinct vocabulary of {a} is {b}".format(a="the version 0",b=len(vocab0)))

100%|██████████████████████████████████████████████████████████████████████████| 2284/2284 [00:00<00:00, 457968.75it/s]

the length of distinct vocabulary of the version 0 is 3007





In [27]:
vocab1 = distinct_words(meaning_values1)
print("the length of distinct vocabulary of {a} is {b}".format(a="the version 1",b=len(vocab1)))

100%|██████████████████████████████████████████████████████████████████████████| 2284/2284 [00:00<00:00, 325378.38it/s]

the length of distinct vocabulary of the version 1 is 2940





In [28]:
vocab2 = distinct_words(meaning_values2)
print("the length of distinct vocabulary of {a} is {b}".format(a="the version 2",b=len(vocab2)))

100%|██████████████████████████████████████████████████████████████████████████| 2284/2284 [00:00<00:00, 286280.08it/s]

the length of distinct vocabulary of the version 2 is 2803





In [29]:
vocab3 = distinct_words(meaning_values3)
print("the length of distinct vocabulary of {a} is {b}".format(a="the version 3",b=len(vocab3)))

100%|██████████████████████████████████████████████████████████████████████████| 2284/2284 [00:00<00:00, 568904.94it/s]

the length of distinct vocabulary of the version 3 is 2707





In [30]:
vocab4 = distinct_words(meaning_values4)
print("the length of distinct vocabulary of {a} is {b}".format(a="the version 4",b=len(vocab4)))

100%|██████████████████████████████████████████████████████████████████████████| 2284/2284 [00:00<00:00, 457990.65it/s]

the length of distinct vocabulary of the version 4 is 2686





In [31]:
vocab5 = distinct_words(meaning_values5)
print("the length of distinct vocabulary of {a} is {b}".format(a="the version 5",b=len(vocab5)))

100%|██████████████████████████████████████████████████████████████████████████| 2284/2284 [00:00<00:00, 572577.27it/s]

the length of distinct vocabulary of the version 5 is 2638





In [32]:
vocab6 = distinct_words(meaning_values6)
print("the length of distinct vocabulary of {a} is {b}".format(a="the version 6",b=len(vocab6)))

100%|██████████████████████████████████████████████████████████████████████████| 2284/2284 [00:00<00:00, 567422.28it/s]

the length of distinct vocabulary of the version 6 is 2638





## 5.2. check the percentage of words in data can be processed by GloVe pre-trained data

In [33]:
# import processed words list from glove
GloVe_path="DataSet\\glove.840B.300d_words.pkl"
def open_pkl(path):
    pickle_file = open(path,mode='rb')
    data = pickle.load(pickle_file)
    pickle_file.close()

    return data


In [34]:
glo_words=open_pkl(GloVe_path)

In [35]:
glo_words[:10]

[',', '.', 'the', 'and', 'to', 'of', 'a', 'in', '"', ':']

In [36]:
# build the function for check the coverage of the vocabulary and the text
import operator
def check_coverage(vocab, words_list):
    cov_vocab =0
    num_vocab = len(vocab)
    cov_text = 0
    not_cov = dict()
    not_text = 0
    for word in tqdm(vocab):
        if word in words_list:
            cov_vocab += 1
            cov_text += vocab[word]
        else:
            not_cov[word]=vocab[word]
            not_text += vocab[word]
            pass
    percent_cov_vocab = cov_vocab/num_vocab
    percent_cov_text = cov_text/(cov_text+not_text)
    print("In Embedding Index we have {:.2%} coverage of distinct vocabulary".format(percent_cov_vocab))
    print("And we have {:.2%} coverage of all text".format(percent_cov_text))
    sorted_not_cov = sorted(not_cov.items(),key= operator.itemgetter(1),reverse = True)
    print("The number of words which are not covered in word2vec resource is: {0}".format(len(sorted_not_cov)))
    return sorted_not_cov

In [37]:
# original data
not_cov0 = check_coverage(vocab0, glo_words)

100%|█████████████████████████████████████████████████████████████████████████████| 3007/3007 [00:09<00:00, 322.43it/s]

In Embedding Index we have 89.92% coverage of distinct vocabulary
And we have 84.50% coverage of all text
The number of words which are not covered in word2vec resource is: 303





In [38]:
not_cov0[:10]

[('flag:', 261),
 ('family:', 25),
 ('woman,', 24),
 ('man,', 22),
 ('Laughing,', 20),
 ('out,', 18),
 ('Sad,', 18),
 ('crying\n', 18),
 ('cheeky/playful,', 17),
 ('laugh\n', 15)]

In [39]:
# lower the words
not_cov1 = check_coverage(vocab1, glo_words)

100%|█████████████████████████████████████████████████████████████████████████████| 2940/2940 [00:09<00:00, 296.27it/s]

In Embedding Index we have 89.66% coverage of distinct vocabulary
And we have 84.40% coverage of all text
The number of words which are not covered in word2vec resource is: 304





In [40]:
not_cov1[:20]

[('flag:', 261),
 ('sad,', 31),
 ('family:', 25),
 ('woman,', 24),
 ('man,', 22),
 ('laughing,', 20),
 ('out,', 18),
 ('crying\n', 18),
 ('cheeky/playful,', 17),
 ('embarrassed,', 15),
 ('fish\n', 15),
 ('laugh\n', 15),
 ('keycap:', 13),
 ('frown,', 13),
 ('angry,', 13),
 ('surprise,', 13),
 ('shock,', 13),
 ('nervous,', 13),
 ('skeptical,', 12),
 ('annoyed,', 12)]

In [41]:
# assign space to seperate adhered words
not_cov2 = check_coverage(vocab2, glo_words)

100%|█████████████████████████████████████████████████████████████████████████████| 2803/2803 [00:03<00:00, 800.25it/s]

In Embedding Index we have 98.14% coverage of distinct vocabulary
And we have 96.14% coverage of all text
The number of words which are not covered in word2vec resource is: 52





In [42]:
not_cov2[:20]

[('\n', 301),
 ('⊛', 7),
 ('dogeza', 7),
 ('"\n', 7),
 ('despai', 5),
 ('.\n', 5),
 ('intercardinal', 4),
 ('baltan', 3),
 ('merwoman', 2),
 ('🤷', 2),
 ('",', 2),
 ('!"\n', 2),
 ('".\n', 2),
 ('tichel', 1),
 ('merperson', 1),
 ('vicu', 1),
 ('orthoptera', 1),
 ('ǐ', 1),
 ('molusc', 1),
 ('jeotgarak', 1)]

In [43]:
# remove punctuations
not_cov3 = check_coverage(vocab3, glo_words)

100%|█████████████████████████████████████████████████████████████████████████████| 2707/2707 [00:02<00:00, 976.34it/s]

In Embedding Index we have 99.22% coverage of distinct vocabulary
And we have 99.56% coverage of all text
The number of words which are not covered in word2vec resource is: 21





In [44]:
not_cov3[:20]

[('dogeza', 7),
 ('despai', 5),
 ('intercardinal', 4),
 ('baltan', 3),
 ('merwoman', 2),
 ('tichel', 1),
 ('merperson', 1),
 ('vicu', 1),
 ('orthoptera', 1),
 ('molusc', 1),
 ('jeotgarak', 1),
 ('kuaizi', 1),
 ('hocho', 1),
 ('moyai', 1),
 ('withershins', 1),
 ('aesculapius', 1),
 ('clipperton', 1),
 ('czechia', 1),
 ('eswatini', 1),
 ('deflagged', 1)]

In [45]:
# correct misspell
not_cov4 = check_coverage(vocab4, glo_words)

100%|████████████████████████████████████████████████████████████████████████████| 2686/2686 [00:02<00:00, 1153.91it/s]

In Embedding Index we have 99.52% coverage of distinct vocabulary
And we have 99.81% coverage of all text
The number of words which are not covered in word2vec resource is: 13





In [46]:
not_cov4[:20]

[('intercardinal', 4),
 ('hijaz', 1),
 ('meyerson', 1),
 ('unaco', 1),
 ('orthoptera', 1),
 ('hoylake', 1),
 ('falae', 1),
 ('jeotgarak', 1),
 ('withershins', 1),
 ('aesculapius', 1),
 ('bovet', 1),
 ('clapperton', 1),
 ('eswatini', 1)]

In [47]:
# remove stopwords
not_cov5 = check_coverage(vocab5, glo_words)

100%|████████████████████████████████████████████████████████████████████████████| 2638/2638 [00:02<00:00, 1079.61it/s]

In Embedding Index we have 99.51% coverage of distinct vocabulary
And we have 99.80% coverage of all text
The number of words which are not covered in word2vec resource is: 13





In [48]:
# give shorthand a translation
not_cov6 = check_coverage(vocab6, glo_words)

100%|████████████████████████████████████████████████████████████████████████████| 2638/2638 [00:02<00:00, 1123.17it/s]

In Embedding Index we have 99.51% coverage of distinct vocabulary
And we have 99.80% coverage of all text
The number of words which are not covered in word2vec resource is: 13





# 6. save new dataframe

In [49]:
final_dict = dict()
final_dict["emoji/emoticon"]=list(emo_df["emoji/emoticon"])
final_dict["key words"]=meaning_values6
final_df = pd.DataFrame.from_dict(final_dict)

In [50]:
final_df

Unnamed: 0,emoji/emoticon,key words
0,😀,face grin grinning face
1,😃,face grinning face big eyes mouth open smile
2,😄,eye face grinning face smiling eyes mouth ope...
3,😁,beaming face smiling eyes eye face grin smile
4,😆,face grinning squinting face laugh mouth sati...
...,...,...
2279,(　´Д｀)ﾉ(´･ω･`)　ﾅﾃﾞﾅﾃﾞ,patting made made
2280,(*ﾟﾉOﾟ)<ｵｵｵｵｫｫｫｫｫｫｫｰｰｰｰｰｲ!,calling ooooh
2281,( ﾟ∀ﾟ)ｱﾊﾊ八八ﾉヽﾉヽﾉヽﾉ ＼ / ＼/ ＼,evil laugh literally ahahahaha
2282,（・∀・ ）ヾ(- -；)コラコラ,blaming


In [51]:
path = "DataSet\\emo_collection_glove.csv"
final_df.to_csv(path,sep='|',index=False)