In [None]:
import os
import pandas as pd

# Needed for word swapping
import nltk
from nltk.corpus import wordnet
from gensim.models import KeyedVectors
import random
from sentence_transformers import SentenceTransformer, util

## SemEval 2017 Dataframe

In [None]:
def map_label(label):
    """Translate a SemEval sentiment string into its integer class id.

    Returns 0 for "negative", 1 for "neutral", 2 for "positive" and
    ``None`` for any other value.
    """
    # The position in the tuple is the class id; plain equality is used so
    # any kind of value can be passed in without raising.
    for class_id, name in enumerate(("negative", "neutral", "positive")):
        if label == name:
            return class_id
    return None

In [None]:
def process_file(file_path):
    """Parse one tab-separated SemEval file into a two-column DataFrame.

    Each line is stripped and split on tabs. Only lines with exactly three
    fields (id, label, tweet) whose label is recognised by ``map_label`` are
    kept; everything else is skipped silently.

    Returns a DataFrame with columns 'Tweet' (text) and 'label' (class id).
    """
    kept_rows = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) != 3:
                continue
            _tweet_id, sentiment, text = fields
            class_id = map_label(sentiment)
            if class_id is None:
                continue
            kept_rows.append((text, class_id))
    return pd.DataFrame({
        'Tweet': [text for text, _ in kept_rows],
        'label': [class_id for _, class_id in kept_rows],
    })

In [None]:
# Folder that holds the SemEval 2017 tab-separated .txt files.
input_dir = 'twitter_data/custom_data/Semeval_2017/'

# Initialise the SemEval 2017 frame as empty before the files are loaded.
semeval2017_data = pd.DataFrame()

In [None]:
# Parse every SemEval 2017 .txt file, then combine the results with a single
# concat: growing a frame inside the loop re-copies all earlier rows on each
# iteration. Building the frame from scratch also keeps this cell idempotent
# (re-running it no longer duplicates rows), and sorting the listing makes the
# row order reproducible because os.listdir returns names in arbitrary order.
semeval2017_frames = [
    process_file(os.path.join(input_dir, file_name))
    for file_name in sorted(os.listdir(input_dir))
    if file_name.endswith('.txt')
]

# pd.concat raises on an empty list, so fall back to an empty frame.
semeval2017_data = (
    pd.concat(semeval2017_frames, ignore_index=True)
    if semeval2017_frames
    else pd.DataFrame()
)

## SemEval 2018 Dataframe

In [None]:
# Folder that holds the SemEval 2018 tab-separated .txt files.
# Note: this rebinds `input_dir`, which earlier pointed at the 2017 folder.
input_dir = 'twitter_data/custom_data/Semeval_2018/'

In [None]:
# Initialise the SemEval 2018 frame as empty before the files are loaded.
semeval2018_data = pd.DataFrame()

In [None]:
def map_emotion_to_sentiment(row):
    """Collapse a row of binary SemEval 2018 emotion flags into a sentiment id.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide the positive and negative emotion columns listed below.
        A flag counts as set when it equals 1.

    Returns
    -------
    int
        2 (positive) when only positive emotions are set, 0 (negative) when
        only negative emotions are set, and 1 (neutral) otherwise, i.e. for
        mixed rows, rows with no listed emotion set, and rows where only
        'surprise' is set.
    """
    positive_emotions = ('joy', 'love', 'optimism', 'trust', 'anticipation')
    negative_emotions = ('anger', 'disgust', 'fear', 'pessimism', 'sadness')

    positive = any(row[emotion] == 1 for emotion in positive_emotions)
    negative = any(row[emotion] == 1 for emotion in negative_emotions)

    # 'surprise' is the neutral emotion, but every path that does not end in
    # positive/negative already yields neutral, so it is not looked up at all.
    if positive and not negative:
        return 2
    if negative and not positive:
        return 0
    return 1

In [None]:
# Read every SemEval 2018 .txt file as tab-separated data, then combine them
# with a single concat: growing a frame inside the loop re-copies all earlier
# rows on each iteration. Building the frame from scratch keeps this cell
# idempotent on re-run, and sorting the listing makes the row order
# reproducible because os.listdir returns names in arbitrary order.
semeval2018_frames = [
    pd.read_csv(os.path.join(input_dir, file_name), sep='\t')
    for file_name in sorted(os.listdir(input_dir))
    if file_name.endswith('.txt')
]

# pd.concat raises on an empty list, so fall back to an empty frame.
semeval2018_data = (
    pd.concat(semeval2018_frames, ignore_index=True)
    if semeval2018_frames
    else pd.DataFrame()
)

In [None]:
# Add a 'label' column by running the emotion-to-sentiment mapper on each row
# (axis=1 passes rows, not columns, to the function).
semeval2018_data['label'] = semeval2018_data.apply(map_emotion_to_sentiment, axis=1)

## SenWave Dataset

In [None]:
# Import the SenWave dataset.
# Convert its emotion labels to positive, neutral, or negative.
# Create new tweets by swapping keywords for synonyms (e.g. "COVID-19" -> "Corona Virus").

In [None]:
def map_emotion_to_sentiment_senwave(row):
    """Collapse a row of binary SenWave emotion flags into a sentiment id.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide the positive and negative emotion columns listed below.
        A flag counts as set when it equals 1.

    Returns
    -------
    int
        2 (positive) when only positive emotions are set, 0 (negative) when
        only negative emotions are set, and 1 (neutral) otherwise, i.e. for
        mixed rows, rows with no listed emotion set, and rows where only
        'Official report' or 'Joking' is set.
    """
    positive_emotions = ('Optimistic', 'Thankful', 'Empathetic')
    negative_emotions = ('Pessimistic', 'Anxious', 'Sad', 'Annoyed', 'Denial')

    positive = any(row[emotion] == 1 for emotion in positive_emotions)
    negative = any(row[emotion] == 1 for emotion in negative_emotions)

    # 'Official report' and 'Joking' are the neutral emotions, but every path
    # that does not end in positive/negative already yields neutral, so they
    # are not looked up at all.
    if positive and not negative:
        return 2
    if negative and not positive:
        return 0
    return 1

In [None]:
# Path to the SenWave CSV file.
input_filepath = 'twitter_data/custom_data/SenWaveDataset/labeledEn.csv'

In [None]:
# Load the SenWave data into a frame.
senwave_df = pd.read_csv(input_filepath)

In [None]:
# Add a 'label' column by running the SenWave emotion-to-sentiment mapper on
# each row (axis=1 passes rows, not columns, to the function).
senwave_df['label'] = senwave_df.apply(map_emotion_to_sentiment_senwave, axis=1)

## Combining Datasets

In [None]:
# Reduce each source to the shared ('Tweet', 'label') columns so the three
# frames can be stacked; .copy() detaches each result from its source frame.
semeval2017_pruned_data = semeval2017_data[['Tweet','label']].copy()
semeval2018_pruned_data = semeval2018_data[['Tweet','label']].copy()
senwave_pruned_data = senwave_df[['Tweet','label']].copy()

In [None]:
# Save the SemEval 2017 subset on its own.
# NOTE(review): no index=False is passed, so the row index is written as an
# extra leading column — confirm downstream readers expect it.
semeval2017_pruned_data.to_csv('twitter_data/custom_data/semeval_2017.csv')

In [None]:
# Stack the three pruned sources into one frame with a fresh 0..n-1 index.
full_covid_sentiment_df = pd.concat([semeval2017_pruned_data, semeval2018_pruned_data, senwave_pruned_data], ignore_index=True)

In [None]:
# Save the combined 3-class sentiment dataset.
# NOTE(review): no index=False is passed, so the row index is written as an
# extra leading column — confirm downstream readers expect it.
full_covid_sentiment_df.to_csv('twitter_data/custom_data/full_covid_sent.csv')

## Building the Neutral only Dataset

In [None]:
# Binary "neutral vs. rest" labels: the positive class (2) is folded into 0,
# so 0 now means non-neutral while 1 stays neutral. assign() returns a new
# frame, leaving full_covid_sentiment_df untouched.
full_covid_sentiment_neutral_df = full_covid_sentiment_df.assign(
    label=full_covid_sentiment_df['label'].replace(2, 0)
)

In [None]:
# Save the neutral-vs-rest dataset.
# NOTE(review): no index=False is passed, so the row index is written as an
# extra leading column — confirm downstream readers expect it.
full_covid_sentiment_neutral_df.to_csv('twitter_data/custom_data/full_covid_sentiment_neutral_dataset.csv')

## Creating the Individual Sentiment Datasets

In [None]:
# Paths to the three GoEmotions CSV parts.
goemotion_1_fp = 'twitter_data/custom_data/goemotions_1.csv'
goemotion_2_fp = 'twitter_data/custom_data/goemotions_2.csv'
goemotion_3_fp = 'twitter_data/custom_data/goemotions_3.csv'

# Load each part into its own frame; they are combined in a later cell.
goemotion_1_df = pd.read_csv(goemotion_1_fp)
goemotion_2_df = pd.read_csv(goemotion_2_fp)
goemotion_3_df = pd.read_csv(goemotion_3_fp)

In [None]:
# Stack the three parts under a fresh index, then keep only the rows whose
# 'example_very_unclear' flag equals False. The filtered frame keeps the
# index values from the concat (it is not renumbered).
goemotion_df = (
    pd.concat([goemotion_1_df, goemotion_2_df, goemotion_3_df], ignore_index=True)
    .loc[lambda frame: frame['example_very_unclear'] == False]
)

In [None]:
# All numeric column names except the two metadata columns. Later cells use
# what remains as the full set of emotion flag columns.
numeric_cols = (
    goemotion_df
    .select_dtypes(include='number')
    .columns
    .drop(['created_utc', 'rater_id'])
)

In [None]:
# Display the remaining column names for a visual check.
numeric_cols

In [None]:
## Groups of Emotions:
# - Love: Admiration, Gratitude, Caring, Love
# - Joyful: Amusement, Excitement, Joy, Optimism

# - Anger: Anger, Annoyance, Disgust
# - Sadness: Disappointment, Sadness, Grief
# - Fear: Fear, Nervousness

# Other: Approval, Relief, Curiosity, Desire, Pride, Embarrassment, Remorse

### Love Emotion Group

In [None]:
# Love group. A row is a positive example when at least one love-group emotion
# is flagged AND every other column in numeric_cols sums to zero. Rows with no
# love-group emotion at all form the negative pool. Rows mixing love-group and
# other emotions end up in neither frame.
# NOTE(review): a rater_id column exists upstream, so rows look per-rater and
# the same text might land in both frames — confirm whether that matters.
love_emotions = ['admiration', 'gratitude', 'caring', 'love']
has_love_emotion = goemotion_df[love_emotions].any(axis=1)
only_love_emotions = goemotion_df[numeric_cols].drop(columns=love_emotions).sum(axis=1) == 0

love_df = goemotion_df[has_love_emotion & only_love_emotions]
non_love_df = goemotion_df[~has_love_emotion]

In [None]:
# Draw twice as many negatives as there are love examples (1:2 class ratio);
# the fixed seed keeps the draw reproducible.
non_love_len = 2 * len(love_df)
non_love_df = non_love_df.sample(random_state=23, n=non_love_len)

In [None]:
# Keep only the text and tag every row as a positive example (label 1).
# assign() builds a new frame, which avoids the SettingWithCopyWarning raised
# when a column is set on a frame obtained by selecting from another one.
love_df = love_df[['text']].assign(label=1)

In [None]:
# Keep only the text and tag every row as a negative example (label 0).
# assign() builds a new frame, which avoids the SettingWithCopyWarning raised
# when a column is set on a frame obtained by selecting from another one.
non_love_df = non_love_df[['text']].assign(label=0)

In [None]:
# Stack the love examples and the non-love examples under a fresh index.
goemotion_love_df = pd.concat([love_df, non_love_df], ignore_index=True)

In [None]:
# Save the love-vs-rest dataset.
# NOTE(review): the filename is spelled "geoemotion" (not "goemotion") and no
# index=False is passed, so the index is written as a column — confirm
# downstream readers expect both.
goemotion_love_df.to_csv('twitter_data/custom_data/geoemotion_love_group.csv')

In [None]:
# Display the combined love dataset for a visual check.
goemotion_love_df

### Joyful Emotion Group

In [None]:
# Joyful group. A row is a positive example when at least one joyful-group
# emotion is flagged AND every other column in numeric_cols sums to zero. Rows
# with no joyful-group emotion at all form the negative pool. Rows mixing
# joyful-group and other emotions end up in neither frame.
joyful_emotions = ['amusement', 'excitement', 'joy', 'optimism']
has_joyful_emotion = goemotion_df[joyful_emotions].any(axis=1)
only_joyful_emotions = goemotion_df[numeric_cols].drop(columns=joyful_emotions).sum(axis=1) == 0

joyful_df = goemotion_df[has_joyful_emotion & only_joyful_emotions]
non_joyful_df = goemotion_df[~has_joyful_emotion]

In [None]:
# Draw twice as many negatives as there are joyful examples (1:2 class ratio);
# the fixed seed keeps the draw reproducible.
non_joyful_len = 2 * len(joyful_df)
non_joyful_df = non_joyful_df.sample(random_state=23, n=non_joyful_len)

In [None]:
# Keep only the text and tag every row as a positive example (label 1).
# assign() builds a new frame, which avoids the SettingWithCopyWarning raised
# when a column is set on a frame obtained by selecting from another one.
joyful_df = joyful_df[['text']].assign(label=1)

In [None]:
# Keep only the text and tag every row as a negative example (label 0).
# assign() builds a new frame, which avoids the SettingWithCopyWarning raised
# when a column is set on a frame obtained by selecting from another one.
non_joyful_df = non_joyful_df[['text']].assign(label=0)

In [None]:
# Stack the joyful examples and the non-joyful examples under a fresh index.
goemotion_joyful_df = pd.concat([joyful_df, non_joyful_df], ignore_index=True)

In [None]:
# Save the joyful-vs-rest dataset.
# NOTE(review): the filename is spelled "geoemotion" (not "goemotion") and no
# index=False is passed, so the index is written as a column — confirm
# downstream readers expect both.
goemotion_joyful_df.to_csv('twitter_data/custom_data/geoemotion_joyful_group.csv')

### Anger Emotion Group

In [None]:
# Anger group. A row is a positive example when at least one anger-group
# emotion is flagged AND every other column in numeric_cols sums to zero. Rows
# with no anger-group emotion at all form the negative pool. Rows mixing
# anger-group and other emotions end up in neither frame.
anger_emotions = ['anger', 'annoyance', 'disgust']
has_anger_emotion = goemotion_df[anger_emotions].any(axis=1)
only_anger_emotions = goemotion_df[numeric_cols].drop(columns=anger_emotions).sum(axis=1) == 0

anger_df = goemotion_df[has_anger_emotion & only_anger_emotions]
non_anger_df = goemotion_df[~has_anger_emotion]

In [None]:
# Draw twice as many negatives as there are anger examples (1:2 class ratio);
# the fixed seed keeps the draw reproducible.
non_anger_len = 2 * len(anger_df)
non_anger_df = non_anger_df.sample(random_state=23, n=non_anger_len)

In [None]:
# Keep only the text and tag every row as a positive example (label 1).
# assign() builds a new frame, which avoids the SettingWithCopyWarning raised
# when a column is set on a frame obtained by selecting from another one.
anger_df = anger_df[['text']].assign(label=1)

In [None]:
# Keep only the text and tag every row as a negative example (label 0).
# assign() builds a new frame, which avoids the SettingWithCopyWarning raised
# when a column is set on a frame obtained by selecting from another one.
non_anger_df = non_anger_df[['text']].assign(label=0)

In [None]:
# Stack the anger examples and the non-anger examples under a fresh index.
goemotion_anger_df = pd.concat([anger_df, non_anger_df], ignore_index=True)

In [None]:
# Save the anger-vs-rest dataset.
# NOTE(review): the filename is spelled "geoemotion" (not "goemotion") and no
# index=False is passed, so the index is written as a column — confirm
# downstream readers expect both.
goemotion_anger_df.to_csv('twitter_data/custom_data/geoemotion_anger_group.csv')

### Sadness Emotion Group

In [None]:
# Sadness group. A row is a positive example when at least one sadness-group
# emotion is flagged AND every other column in numeric_cols sums to zero. Rows
# with no sadness-group emotion at all form the negative pool. Rows mixing
# sadness-group and other emotions end up in neither frame.
sadness_emotions = ['disappointment', 'sadness', 'grief']
has_sadness_emotion = goemotion_df[sadness_emotions].any(axis=1)
only_sadness_emotions = goemotion_df[numeric_cols].drop(columns=sadness_emotions).sum(axis=1) == 0

sadness_df = goemotion_df[has_sadness_emotion & only_sadness_emotions]
non_sadness_df = goemotion_df[~has_sadness_emotion]

In [None]:
# Draw twice as many negatives as there are sadness examples (1:2 class
# ratio); the fixed seed keeps the draw reproducible.
non_sadness_len = 2 * len(sadness_df)
non_sadness_df = non_sadness_df.sample(random_state=23, n=non_sadness_len)

In [None]:
# Keep only the text and tag every row as a positive example (label 1).
# assign() builds a new frame, which avoids the SettingWithCopyWarning raised
# when a column is set on a frame obtained by selecting from another one.
sadness_df = sadness_df[['text']].assign(label=1)

In [None]:
# Keep only the text and tag every row as a negative example (label 0).
# assign() builds a new frame, which avoids the SettingWithCopyWarning raised
# when a column is set on a frame obtained by selecting from another one.
non_sadness_df = non_sadness_df[['text']].assign(label=0)

In [None]:
# Stack the sadness examples and the non-sadness examples under a fresh index.
goemotion_sadness_df = pd.concat([sadness_df, non_sadness_df], ignore_index=True)

In [None]:
# Save the sadness-vs-rest dataset.
# NOTE(review): the filename is spelled "geoemotion" (not "goemotion") and no
# index=False is passed, so the index is written as a column — confirm
# downstream readers expect both.
goemotion_sadness_df.to_csv('twitter_data/custom_data/geoemotion_sadness_group.csv')

### Fear Emotion Group

In [None]:
# Fear group. A row is a positive example when at least one fear-group emotion
# is flagged AND every other column in numeric_cols sums to zero. Rows with no
# fear-group emotion at all form the negative pool. Rows mixing fear-group and
# other emotions end up in neither frame.
fear_emotions = ['fear', 'nervousness']
has_fear_emotion = goemotion_df[fear_emotions].any(axis=1)
only_fear_emotions = goemotion_df[numeric_cols].drop(columns=fear_emotions).sum(axis=1) == 0

fear_df = goemotion_df[has_fear_emotion & only_fear_emotions]
non_fear_df = goemotion_df[~has_fear_emotion]

In [None]:
# Draw twice as many negatives as there are fear examples (1:2 class ratio);
# the fixed seed keeps the draw reproducible.
non_fear_len = 2 * len(fear_df)
non_fear_df = non_fear_df.sample(random_state=23, n=non_fear_len)

In [None]:
# Keep only the text and tag every row as a positive example (label 1).
# assign() builds a new frame, which avoids the SettingWithCopyWarning raised
# when a column is set on a frame obtained by selecting from another one.
fear_df = fear_df[['text']].assign(label=1)

In [None]:
# Keep only the text and tag every row as a negative example (label 0).
# assign() builds a new frame, which avoids the SettingWithCopyWarning raised
# when a column is set on a frame obtained by selecting from another one.
non_fear_df = non_fear_df[['text']].assign(label=0)

In [None]:
# Stack the fear examples and the non-fear examples under a fresh index.
goemotion_fear_df = pd.concat([fear_df, non_fear_df], ignore_index=True)

In [None]:
# Save the fear-vs-rest dataset.
# NOTE(review): the filename is spelled "geoemotion" (not "goemotion") and no
# index=False is passed, so the index is written as a column — confirm
# downstream readers expect both.
goemotion_fear_df.to_csv('twitter_data/custom_data/geoemotion_fear_group.csv')