In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
from matplotlib import pyplot as plt

pd.set_option("display.max_rows", 500)

### load data

In [4]:
df_train = pd.read_csv(
    "../twemoji/unfiltered_processed/twemoji_train.csv",
    usecols=["raw_text", "emojis", "emoji_ids", "text_no_emojis"],
)

df_valid = pd.read_csv(
    "../twemoji/unfiltered_processed/twemoji_valid.csv",
    usecols=["raw_text", "emojis", "emoji_ids", "text_no_emojis"],
)

df_test = pd.read_csv(
    "../twemoji/unfiltered_processed/twemoji_test.csv",
    usecols=["raw_text", "emojis", "emoji_ids", "text_no_emojis"],
)

In [5]:
des_df = pd.read_csv("../emoji_embedding/data/processed/emoji_descriptions.csv")

### Preprocess data and generate overviews grouped by emojis and separated by dataset

In [11]:
def get_statistics(_df):
    df = _df.copy()
    isna = df.isna().sum()

    print(isna, "\n")
    df = df.dropna()
    print("dropped nans")

    df["n_word_estimate"] = df.raw_text.apply(lambda x: len(x.split(" ")))
    df["n_characters"] = df.text_no_emojis.apply(lambda x: len(x.split(" ")))
    df.emoji_ids = df.emoji_ids.str[1:-1].str.split(", ")
    df["n_emojis"] = df.emoji_ids.apply(len)
    df["n_supporting_emoji"] = (
        df.emojis.apply(lambda x: len(list(x))) - df.n_emojis
    )  # some emojis just change the color of the main emoji

    ana = df.explode(column="emoji_ids")

    g = ana.groupby(by="emoji_ids")

    g_df = pd.concat(
        [
            g.raw_text.count().rename("n_occurence"),
            g.n_word_estimate.mean().rename("avg_n_words"),
            g.n_characters.mean().rename("avg_n_chars"),
            g.n_emojis.mean().rename("avg_n_emojis"),
            g.n_supporting_emoji.mean().rename("avg_n_supporting_emojis"),
            g.n_supporting_emoji.std().rename("std_n_supporting_emojis"),
        ],
        axis=1,
    ).reset_index()
    g_df.emoji_ids = pd.to_numeric(g_df.emoji_ids)
    g_df = g_df.rename(columns={"emoji_ids": "emoji_id"})

    res = des_df[["emoji_id", "zero_shot", "emoji_char", "emoji_name"]].merge(
        g_df, how="left"
    )
    res.n_occurence = res.n_occurence.fillna(0)
    res = res.sort_values(by="n_occurence", ascending=False)

    return res

In [12]:
%%time
# takes around 40 s
ana_train = get_statistics(df_train)

raw_text             0
emojis            2387
emoji_ids         2624
text_no_emojis     345
dtype: int64 

dropped nans
CPU times: user 39.3 s, sys: 1.54 s, total: 40.8 s
Wall time: 40.8 s


In [13]:
%%time
# takes around 3 s
ana_valid = get_statistics(df_valid)

raw_text            0
emojis            169
emoji_ids         195
text_no_emojis     18
dtype: int64 

dropped nans
CPU times: user 2.89 s, sys: 119 ms, total: 3 s
Wall time: 3 s


In [14]:
%%time
# takes around 3 s
ana_test = get_statistics(df_test)

raw_text            0
emojis            189
emoji_ids         214
text_no_emojis     29
dtype: int64 

dropped nans
CPU times: user 3.29 s, sys: 128 ms, total: 3.42 s
Wall time: 3.42 s


In [42]:
ana_train

Unnamed: 0,emoji_id,zero_shot,emoji_char,emoji_name,n_occurence,avg_n_words,avg_n_chars,avg_n_emojis,avg_n_supporting_emojis,std_n_supporting_emojis
371,371,False,😂,face_with_tears_of_joy,1365322.0,12.008224,12.008224,1.363303,0.021459,0.174552
1407,1407,False,😍,smiling_face_with_heart-eyes,551225.0,10.873101,10.873101,1.675857,0.020610,0.200769
923,923,False,😭,loudly_crying_face,513967.0,11.696105,11.696105,1.448531,0.018704,0.160883
1297,1297,False,❤️,red_heart,506658.0,13.197376,13.197376,1.886791,0.042818,0.259746
367,367,False,🙄,face_with_rolling_eyes,462780.0,11.670928,11.670928,1.243809,0.015340,0.138336
...,...,...,...,...,...,...,...,...,...,...
849,849,False,🧑‍⚖️,judge,0.0,,,,,
851,851,False,🦘,kangaroo,0.0,,,,,
855,855,False,*️⃣,keycap_asterisk,0.0,,,,,
856,856,False,8️⃣,keycap_digit_eight,0.0,,,,,


### analyze what emojis are in which datasets

In [24]:
train_emojis = set(ana_train.loc[ana_train.n_occurence > 0].emoji_id.tolist())
valid_emojis = set(ana_valid.loc[ana_valid.n_occurence > 0].emoji_id.tolist())
test_emojis = set(ana_test.loc[ana_test.n_occurence > 0].emoji_id.tolist())

In [25]:
print("number emojis in train:", len(train_emojis))
print("number emojis in valid:", len(valid_emojis))
print("number emojis in test:", len(test_emojis))

number emojis in train: 1122
number emojis in valid: 1068
number emojis in test: 1064


In [26]:
train_no_valid = train_emojis - valid_emojis
train_no_test = train_emojis - test_emojis
valid_no_train = valid_emojis - train_emojis
valid_no_test = valid_emojis - test_emojis
test_no_train = test_emojis - train_emojis
test_no_valid = test_emojis - valid_emojis
test_no_combined = test_emojis - train_emojis.union(valid_emojis)

In [29]:
print("train_no_valid:", train_no_valid)
print("train_no_test:", train_no_test)
print()
print("valid_no_train:", valid_no_train)
print("valid_no_test:", valid_no_test)
print()
print("test_no_train:", test_no_train)
print("test_no_valid:", test_no_valid)
print("test_no_combined:", test_no_combined)

train_no_valid: {896, 1, 1666, 1285, 1416, 137, 1165, 913, 146, 149, 413, 162, 1187, 1444, 1317, 39, 1066, 817, 1714, 1203, 1077, 183, 696, 1465, 702, 830, 320, 323, 68, 835, 836, 714, 332, 205, 338, 724, 1365, 216, 1626, 732, 1757, 94, 1761, 1762, 743, 879, 1263, 1265, 1775, 1139, 1651, 119, 250, 1151}
train_no_test: {896, 1, 1666, 1285, 1159, 1416, 1165, 1806, 913, 149, 284, 413, 1187, 1444, 1189, 1317, 39, 1066, 817, 1714, 1203, 1459, 1077, 1333, 183, 696, 1465, 702, 320, 323, 68, 324, 200, 332, 338, 724, 1365, 216, 1626, 732, 1372, 94, 1757, 1761, 1762, 743, 1132, 879, 1263, 1265, 1775, 1139, 1267, 1141, 1651, 119, 891, 1151}

valid_no_train: set()
valid_no_test: {324, 1189, 1159, 200, 1132, 1806, 1267, 1459, 1141, 1333, 1372, 891, 284}

test_no_train: set()
test_no_valid: {162, 835, 836, 137, 714, 205, 146, 250, 830}
test_no_combined: set()


In [40]:
# emojis that are not in the test set
ana_train.loc[list(train_no_test)].sort_values(by="n_occurence", ascending=False)

Unnamed: 0,emoji_id,zero_shot,emoji_char,emoji_name,n_occurence,avg_n_words,avg_n_chars,avg_n_emojis,avg_n_supporting_emojis,std_n_supporting_emojis
1132,1132,False,⛎,ophiuchus,81.0,14.185185,14.185185,7.209877,0.037037,0.293447
1159,1159,False,🛂,passport_control,46.0,17.173913,17.173913,5.934783,-0.152174,0.514993
891,891,False,⏮️,last_track_button,22.0,16.045455,16.045455,4.909091,0.272727,0.455842
1267,1267,False,🖨️,printer,18.0,12.277778,12.277778,4.611111,0.055556,0.416176
1139,1139,False,☦️,orthodox_cross,16.0,16.1875,16.1875,6.6875,0.0625,0.442531
413,413,False,🗄️,file_cabinet,14.0,18.142857,18.142857,6.428571,0.0,0.0
1333,1333,False,🤣,rolling_on_the_floor_laughing,11.0,17.909091,17.909091,3.0,0.0,0.0
817,817,False,♾️,infinity,9.0,16.555556,16.555556,4.777778,-0.222222,0.666667
200,200,False,🗜️,clamp,8.0,18.0,18.0,5.75,0.125,0.64087
1141,1141,False,🦉,owl,7.0,16.285714,16.285714,7.285714,0.0,0.0


### further questions

- are emojis distributed the same in all data splits 
- are there any insights when analysing the supporting statistics for ana_train, ana_valid, ana_test 
- which emojis would be best suited for zero shot prediction 
- how many datapoints would we lose if choosing a particular set of emojis for zero shotting. (including the filtering rules)
- what accuracy would a model have that could perfectly predict the top 5 emojis? 
- we could use emojis that are not in train nor in valid and test dataset as zero shot evaluation data, just have to find it on the internet, is that realistic? 
- is removing skin color etc. the right choice or would it make certain tweets meaningless (blm protest tweets)
- what years is the data from and what type of tweets do we have, are there only everyday language tweets, is there a subset of users our tweets are from ? 
- how close to whatsapp chats are the tweets? 