In [1]:
# Emoji dataset

# https://huggingface.co/datasets/arbml/emoji_sentiment_lexicon

In [2]:
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# Load the emoji sentiment lexicon referenced at the top of the notebook.
dataset = load_dataset("arbml/emoji_sentiment_lexicon")

# Turn the 'train' split into a pandas DataFrame and preview the first rows.
df = pd.DataFrame(data=dataset["train"])
print(df.head(n=5))

  Unnamed: 0             Emoji_ID Emoji                    Unicode_Name  \
0          1  b'\xf0\x9f\x98\x82'     😂         Face With Tears Of Joy    
1          2  b'\xf0\x9f\x92\x94'     💔                   Broken Heart    
2          3      b'\xe2\x9d\xa4'     ❤                      Red Heart    
3          4  b'\xf0\x9f\x98\xad'     😭             Loudly Crying Face    
4          5  b'\xf0\x9f\x98\x8d'     😍   Smiling Face With Heart-Eyes    

           Arabic_Name              Class Total_Occurrence  \
0      وجه بدموع الفرح  Facial Expression            25908   
1            قلب مجروح              Heart            18564   
2             قلب احمر              Heart            15876   
3   وجه يبكي بصوت عالي  Facial Expression            12318   
4  وجه مبتسم بعيون قلب  Facial Expression             6815   

  Negativity_Occurrence_N(Negative) Neutrality_Occurrence_N(Neutral)  \
0                              8106                             2638   
1                             17

In [4]:
# List every column name of the loaded lexicon frame before selecting a subset.
print(df.columns)

Index(['Unnamed: 0', 'Emoji_ID', 'Emoji', 'Unicode_Name', 'Arabic_Name',
       'Class', 'Total_Occurrence', 'Negativity_Occurrence_N(Negative)',
       'Neutrality_Occurrence_N(Neutral)', 'Positivity_Occurrence_N(Positive)',
       'Negativite_Probability_P(Negative)', 'Neutral_Probability_P(Neutral)',
       'Positive_Probability_P(Positive)', 'Sentiment_Score_S', 'label'],
      dtype='object')


In [5]:
# Keep only the emoji name and its sentiment label.
# The explicit .copy() makes `df` an independent frame instead of a selection
# derived from the full lexicon, so assigning to its columns in later cells
# does not raise pandas' SettingWithCopyWarning.
df = df[['Unicode_Name', 'label']].copy()

print(df.head())

                     Unicode_Name  label
0         Face With Tears Of Joy       2
1                   Broken Heart       0
2                      Red Heart       2
3             Loudly Crying Face       0
4   Smiling Face With Heart-Eyes       2


In [6]:
# Build the cleaned frame as a NEW object instead of aliasing `df`.
# With `df_clean = df` both names pointed at one object, so an in-place rename
# and label remap also rewrote `df`, and running the cell a second time
# remapped the already-remapped labels (-1 -> NaN, 0 -> -1, 1 -> 0).
# The chain below leaves `df` untouched, so the cell can be re-run safely.
label_to_polarity = {0: -1, 1: 0, 2: 1}

df_clean = (
    df
    # Rename the columns to clean_text / category.
    .rename(columns={'Unicode_Name': 'clean_text', 'label': 'category'})
    .assign(
        # Remap the labels 0/1/2 to -1/0/1; any other value becomes NaN.
        category=lambda frame: frame['category'].map(label_to_polarity),
        # Normalize the names: trim whitespace, lowercase, spaces -> underscores.
        clean_text=lambda frame: (
            frame['clean_text'].str.strip().str.lower().str.replace(' ', '_')
        ),
    )
)

In [7]:
# Preview the first 67 rows of the renamed and relabelled DataFrame.
print(
    "DataFrame after renaming and reassigning values:",
    df_clean.head(67),
    sep="\n",
)

DataFrame after renaming and reassigning values:
                      clean_text  category
0         face_with_tears_of_joy         1
1                   broken_heart        -1
2                      red_heart         1
3             loudly_crying_face        -1
4   smiling_face_with_heart-eyes         1
..                           ...       ...
62                  flushed_face        -1
63                 confused_face        -1
64            see-no-evil_monkey        -1
65                          herb         1
66              person_shrugging         0

[67 rows x 2 columns]


In [8]:
# Write the cleaned frame to CSV without the index column, then confirm on stdout.
df_clean.to_csv(path_or_buf='emoji_sentiment_dataset.csv', index=False)
print("DataFrame exported to 'emoji_sentiment_dataset.csv'")

DataFrame exported to 'emoji_sentiment_dataset.csv'


## Merging the datasets

In [10]:
# Load the two sources to merge.
# NOTE(review): the emoji CSV is read from '../datasets/', while the export
# cell earlier in this notebook writes 'emoji_sentiment_dataset.csv' to the
# working directory. Confirm the file is moved there (or align the two paths),
# otherwise a fresh Restart & Run All will not find it.
emoji_dataset = pd.read_csv('../datasets/emoji_sentiment_dataset.csv')
text_dataset = pd.read_csv('../datasets/Twitter_Data.csv')

# Concatenate the frames vertically: text rows first, then emoji rows, with a
# fresh 0..n-1 index.
df_combined = pd.concat([text_dataset, emoji_dataset], ignore_index=True)

# Show the first rows of the combined DataFrame.
print("Combined DataFrame:")
print(df_combined.head())

# Export the combined DataFrame to a CSV file. The path is defined once so the
# confirmation message always names the file that was actually written (the
# old message reported 'combined_emoji_sentiment_dataset.csv', which is not
# the file created here).
combined_csv_path = 'combined_text_emoji_sentiment_dataset.csv'
df_combined.to_csv(combined_csv_path, index=False)

print(f"Combined DataFrame exported to '{combined_csv_path}'")


Combined DataFrame:
                                          clean_text  category
0  when modi promised “minimum government maximum...      -1.0
1  talk all the nonsense and continue all the dra...       0.0
2  what did just say vote for modi  welcome bjp t...       1.0
3  asking his supporters prefix chowkidar their n...       1.0
4  answer who among these the most powerful world...       1.0
Combined DataFrame exported to 'combined_emoji_sentiment_dataset.csv'
