In [None]:
import numpy as np
import pandas as pd

### Create balanced test set

In [None]:
# Full test split, plus the tab-separated file listing the ids of the balanced subset.
df = pd.read_csv("data/twemoji_test_v2.csv")
balanced = pd.read_csv("raw_data/balanced_test_plaintext.txt", sep="\t")

# Left merge keeps every balanced id; ids without a match in `df` get NaN columns.
balanced_df = pd.merge(left=balanced[["id"]], right=df, how="left")

In [None]:
# Missing values per column (balanced ids with no match in the left merge show up here).
balanced_df.isnull().sum()

In [None]:
# The raw file is read as a single column: one line per row, no header.
test_raw = pd.read_csv("raw_data/raw_test.txt", sep="\t", header=None)

# The first space-separated token is taken as the id, the remainder as the tweet.
ids = test_raw[0].apply(lambda line: line.partition(" ")[0])
tweet = test_raw[0].apply(lambda line: line.partition(" ")[2])
test_raw_df = pd.DataFrame({"id": ids, "tweet": tweet})

# Restrict to the balanced ids and index by id (keeping the column) for label lookups.
balanced_raw_df = pd.merge(
    left=balanced[["id"]], right=test_raw_df, how="left"
).set_index("id", drop=False)

In [None]:
# Show the raw tweets for the balanced ids whose `emojis` value is missing.
ids_without_emojis = balanced_df.loc[balanced_df.emojis.isna(), "id"]
balanced_raw_df.loc[ids_without_emojis]

In [None]:
# Drop the rows that have no emoji ids.
# NOTE(review): the inspection cell above filters on `emojis` while this drops on
# `emoji_ids` — confirm both columns are missing for the same rows.
balanced_df = balanced_df.dropna(subset=["emoji_ids"])

In [None]:
# Persist the balanced test set; the DataFrame index is not written.
balanced_df.to_csv(
    "data/twemoji_balanced_test_v2.csv",
    index=False,
)

### Create balanced validation and training sets

In [None]:
def create_balanced_dataset(df, n_samples=10, random_state=None):
    """Draw ``n_samples`` rows for every emoji id that occurs in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``emoji_ids`` column holding either bracketed,
        comma-separated id strings (e.g. ``"[3, 17]"``) or already-parsed
        lists of ints. The column is overwritten IN PLACE with the parsed
        lists; the prevalence cell later in this notebook calls
        ``df_train.emoji_ids.explode()`` on that parsed form.
    n_samples : int, default 10
        Number of rows drawn per emoji id.
    random_state : int, numpy RandomState/Generator or None, default None
        Seed for reproducible sampling. An int is turned into one shared
        random stream for all emojis; anything else is forwarded unchanged
        to ``DataFrame.sample``. ``None`` keeps the unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The per-emoji samples concatenated in order of descending emoji
        frequency. The original row labels are kept in an ``index`` column.
        Emojis contained in fewer than ``n_samples`` rows are sampled with
        replacement, all others without.
    """
    def _parse_emoji_ids(value):
        # "[1, 2]" -> [1, 2]; already-parsed values pass through so the
        # function can be called again on the same frame.
        if isinstance(value, str):
            return [int(token) for token in value[1:-1].split(",")]
        return value

    # Deliberately written back to the caller's frame (see docstring).
    df["emoji_ids"] = df.emoji_ids.apply(_parse_emoji_ids)
    emoji_counts = df.emoji_ids.explode().value_counts()

    # Inverted index built in a single pass: emoji id -> positions of the rows
    # containing it. set() makes a row count once even if it repeats an emoji.
    rows_by_emoji = {}
    for position, ids in enumerate(df.emoji_ids):
        for emoji in set(ids):
            rows_by_emoji.setdefault(emoji, []).append(position)

    # One shared stream for an int seed; otherwise every emoji would be
    # sampled with an identically re-seeded generator.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    samples = []
    for emoji in emoji_counts.index:
        candidates = df.iloc[rows_by_emoji[emoji]]
        # Decide on replacement from the number of candidate ROWS, not from the
        # occurrence count: an emoji repeated within one tweet can occur
        # n_samples times while appearing in fewer than n_samples rows.
        samples.append(
            candidates.sample(
                n_samples,
                replace=len(candidates) < n_samples,
                random_state=rng,
            )
        )

    if not samples:
        # Nothing to sample from; pd.concat([]) would raise.
        return df.iloc[:0].reset_index()

    return pd.concat(samples).reset_index()

In [None]:
%%time
# takes around 1:20 min
# Load the validation split and draw the balanced sample from it.
# Side effect: create_balanced_dataset overwrites df_valid["emoji_ids"] with parsed lists.
df_valid = pd.read_csv("data/twemoji_valid_v2.csv")
balanced_valid = create_balanced_dataset(df_valid)

In [None]:
%%time
# takes around 18min
# Load the training split and draw the balanced sample from it.
# Side effect: create_balanced_dataset overwrites df_train["emoji_ids"] with parsed lists;
# df_train is reused further down for the prevalence computation.
df_train = pd.read_csv("data/twemoji_train_v2.csv")
balanced_train = create_balanced_dataset(df_train)

In [None]:
# Write both balanced splits; the DataFrame index is not written.
for frame, path in (
    (balanced_valid, "data/twemoji_balanced_valid_v2.csv"),
    (balanced_train, "data/twemoji_balanced_train_v2.csv"),
):
    frame.to_csv(path, index=False)

### Compute emoji prevalence in the training set

In [None]:
# Prevalence = occurrences of each emoji id divided by the number of training rows.
# create_balanced_dataset(df_train) normally has already parsed `emoji_ids` into lists
# in place. Parse here as well if the column still holds the raw "[1, 2]" strings, so
# the result does not silently depend on that side effect (explode() leaves plain
# strings untouched and would count whole id combinations instead of single emojis).
train_emoji_ids = df_train.emoji_ids.apply(
    lambda ids: [int(token) for token in ids[1:-1].split(",")] if isinstance(ids, str) else ids
)
prevalence = (
    train_emoji_ids.explode().value_counts() / len(df_train)
).to_frame().reset_index()
prevalence.columns = ["emoji_ids", "prevalence"]

In [None]:
# Persist the prevalence table; the DataFrame index is not written.
prevalence.to_csv(
    "data/twemoji_prevalence.csv",
    index=False,
)