In [49]:
import os
import pandas as pd

## SemEval 2017 Dataframe

In [50]:
def map_label(label):
    """Translate a textual sentiment label into its integer class code.

    Args:
        label: Sentiment name to translate.

    Returns:
        0 for "negative", 1 for "neutral", 2 for "positive", and None for
        any other value.
    """
    # The position of each name in this tuple is its class code.
    for code, name in enumerate(("negative", "neutral", "positive")):
        if label == name:
            return code
    return None

In [51]:
def process_file(file_path):
    """Parse one tab-separated tweet file into a DataFrame.

    Each line is stripped and split on tabs. A line is kept only when it
    yields exactly three fields (id, label, tweet text) and its label is
    recognised by ``map_label``; every other line is skipped silently.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A DataFrame with a 'Tweet' column (tweet text) and a 'label' column
        (integer code returned by ``map_label``).
    """
    kept_tweets, kept_labels = [], []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) != 3:
                continue
            # The first field (tweet id) is not needed downstream.
            _, sentiment, text = fields
            code = map_label(sentiment)
            if code is None:
                continue
            kept_tweets.append(text)
            kept_labels.append(code)
    return pd.DataFrame({'Tweet': kept_tweets, 'label': kept_labels})

In [52]:
# Directory (relative to the working directory) scanned for SemEval 2017 .txt files.
input_dir = 'twitter_data/custom_data/Semeval_2017/'

# Start from an empty frame; the loading cell below populates semeval2017_data.
semeval2017_data = pd.DataFrame()

In [53]:
# Parse every .txt file in input_dir and combine the results in one step.
# - The directory listing is sorted because os.listdir returns entries in
#   arbitrary order, which would make the row order vary between machines.
# - The frames are collected in a list and concatenated once instead of
#   growing a DataFrame inside the loop (repeated concat copies all rows
#   on every iteration).
# - semeval2017_data is rebuilt from scratch, so re-running this cell does
#   not append a second copy of every file.
semeval2017_frames = []
for file_name in sorted(os.listdir(input_dir)):
    if file_name.endswith('.txt'):
        file_path = os.path.join(input_dir, file_name)
        file_data = process_file(file_path)
        semeval2017_frames.append(file_data)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when the directory contains no .txt files.
semeval2017_data = (
    pd.concat(semeval2017_frames, ignore_index=True)
    if semeval2017_frames
    else pd.DataFrame()
)

## SemEval 2018 Dataframe

In [54]:
# Directory (relative to the working directory) scanned for SemEval 2018 .txt files.
# NOTE: this rebinds input_dir, which the SemEval 2017 loading loop also reads;
# re-running that loop after this cell would scan the 2018 directory instead.
input_dir = 'twitter_data/custom_data/Semeval_2018/'

In [55]:
# Start from an empty frame; the loading cell below populates semeval2018_data.
semeval2018_data = pd.DataFrame()

In [56]:
def map_emotion_to_sentiment(row):
    """Collapse a row of binary emotion flags into a three-way sentiment code.

    Args:
        row: Object indexable by emotion column name (for example a
            DataFrame row); a flag counts as set when its value equals 1.

    Returns:
        2 when at least one positive emotion is set and no negative one,
        0 when at least one negative emotion is set and no positive one,
        and 1 otherwise (mixed polarity, only 'surprise', or nothing set).
    """
    groups = {
        'positive': ['joy', 'love', 'optimism', 'trust', 'anticipation'],
        'negative': ['anger', 'disgust', 'fear', 'pessimism', 'sadness'],
        'neutral': ['surprise'],
    }

    def is_flagged(columns):
        # True as soon as one of the given columns equals 1.
        for column in columns:
            if row[column] == 1:
                return True
        return False

    # Groups are checked in insertion order: positive, negative, neutral.
    present = {name: is_flagged(columns) for name, columns in groups.items()}

    if present['positive'] and not present['negative']:
        return 2
    if present['negative'] and not present['positive']:
        return 0
    # Mixed polarity, surprise-only and unflagged rows all land on neutral,
    # so present['neutral'] never changes the outcome.
    return 1

In [57]:
# Read every tab-separated .txt file in input_dir and combine them in one step.
# - The directory listing is sorted because os.listdir returns entries in
#   arbitrary order, which would make the row order vary between machines.
# - The frames are collected in a list and concatenated once instead of
#   growing a DataFrame inside the loop (repeated concat copies all rows
#   on every iteration).
# - semeval2018_data is rebuilt from scratch, so re-running this cell does
#   not append a second copy of every file.
semeval2018_frames = []
for file_name in sorted(os.listdir(input_dir)):
    if file_name.endswith('.txt'):
        file_path = os.path.join(input_dir, file_name)

        file_data = pd.read_csv(file_path, sep='\t')

        semeval2018_frames.append(file_data)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when the directory contains no .txt files.
semeval2018_data = (
    pd.concat(semeval2018_frames, ignore_index=True)
    if semeval2018_frames
    else pd.DataFrame()
)

In [58]:
# Derive a 3-class sentiment code per row (0 = negative, 1 = neutral, 2 = positive)
# from the emotion indicator columns and store it in a new 'label' column.
semeval2018_data['label'] = semeval2018_data.apply(map_emotion_to_sentiment, axis=1)

In [59]:
# Number of SemEval 2018 rows whose derived label is 0 (negative).
int((semeval2018_data['label'] == 0).sum())

3614

In [60]:
# Display the combined SemEval 2018 frame, including the derived 'label' column.
semeval2018_data

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust,label
0,2017-En-21441,“Worry is a down payment on a problem you may ...,0,1,0,0,0,0,1,0,0,0,1,2
1,2017-En-31535,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0,2
2,2017-En-21068,@Max_Kellerman it also helps that the majorit...,1,0,1,0,1,0,1,0,0,0,0,1
3,2017-En-31436,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0,2
4,2017-En-22195,My roommate: it's okay that we can't spell bec...,1,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7719,2018-En-01993,@BadHombreNPS @SecretaryPerry If this didn't m...,1,0,1,0,0,0,0,0,0,0,0,0
7720,2018-En-01784,Excited to watch #stateoforigin tonight! Come ...,0,0,0,0,1,0,1,0,0,0,0,2
7721,2018-En-04047,"Blah blah blah Kyrie, IT, etc. @CJC9BOSS leavi...",1,0,1,0,0,0,0,0,1,0,0,0
7722,2018-En-03041,#ThingsIveLearned The wise #shepherd never tru...,0,0,0,0,0,0,0,0,0,0,0,1


## SenWave Dataset

In [61]:
# Import SenWave Dataset
# Convert the labels to positive, neutral, or negative
# Create new tweets by replacing keywords with synonyms (e.g. COVID-19 -> Corona Virus)

In [62]:
def map_emotion_to_sentiment_senwave(row):
    """Collapse a row of SenWave emotion flags into a three-way sentiment code.

    Args:
        row: Object indexable by SenWave emotion column name (for example a
            DataFrame row); a flag counts as set when its value equals 1.

    Returns:
        2 when at least one positive emotion is set and no negative one,
        0 when at least one negative emotion is set and no positive one,
        and 1 otherwise (mixed polarity, only 'Official report'/'Joking',
        or nothing set).
    """
    positive_cols = ('Optimistic', 'Thankful', 'Empathetic')
    negative_cols = ('Pessimistic', 'Anxious', 'Sad', 'Annoyed', 'Denial')
    neutral_cols = ('Official report', 'Joking')

    # Check the three groups in order; each check stops at its first set flag.
    has_positive, has_negative, _has_neutral = [
        any(row[col] == 1 for col in cols)
        for cols in (positive_cols, negative_cols, neutral_cols)
    ]

    if has_positive == has_negative:
        # Both polarities or neither: neutral, whatever the neutral flags say.
        return 1
    return 2 if has_positive else 0

In [63]:
# Path (relative to the working directory) of the SenWave CSV read in the next cell.
input_filepath = 'twitter_data/custom_data/SenWaveDataset/labeledEn.csv'

In [64]:
# Load the SenWave CSV with pandas' default comma-separated parsing.
senwave_df = pd.read_csv(input_filepath)

In [65]:
# Derive a 3-class sentiment code per row (0 = negative, 1 = neutral, 2 = positive)
# from the SenWave emotion columns and store it in a new 'label' column.
senwave_df['label'] = senwave_df.apply(map_emotion_to_sentiment_senwave, axis=1)

## Combining Datasets

In [66]:
# Reduce each source to the two shared columns (tweet text and sentiment code).
# .copy() makes the pruned frames independent of the full frames.
semeval2017_pruned_data = semeval2017_data.loc[:, ['Tweet', 'label']].copy()
semeval2018_pruned_data = semeval2018_data.loc[:, ['Tweet', 'label']].copy()
senwave_pruned_data = senwave_df.loc[:, ['Tweet', 'label']].copy()

In [71]:
# Number of SemEval 2017 rows labelled 2 (positive).
int((semeval2017_pruned_data['label'] == 2).sum())

19903

## Individual Emotion Dataset

In [None]:
# Combine the SenWave and SemEval 2018 datasets to create individual emotion datasets for optimism or sadness
# These datasets could also be padded with extra data using the synonym substitutions
# Labels in these datasets should be binary: 0 (false) or 1 (true) for each emotion