In [359]:
import os
import pandas as pd

# Needed for word swapping
import nltk
from nltk.corpus import wordnet
from gensim.models import KeyedVectors
import random
from sentence_transformers import SentenceTransformer, util

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')). History will not be written to the database.


## SemEval 2017 Dataframe

In [360]:
def map_label(label):
    """Translate a textual sentiment label into its integer class.

    Returns 0 for "negative", 1 for "neutral" and 2 for "positive".
    Any other value yields None so the caller can skip the row.
    """
    known_labels = (("negative", 0), ("neutral", 1), ("positive", 2))
    for name, code in known_labels:
        if label == name:
            return code
    return None

In [361]:
def process_file(file_path):
    """Read a tab-separated tweet file into a DataFrame.

    Each line is stripped and split on tabs. Only lines that yield exactly
    three fields (id, label, tweet) and whose label is recognised by
    `map_label` are kept; everything else is skipped silently.

    Returns a DataFrame with columns 'Tweet' (text) and 'label' (integer
    class from `map_label`), in file order.
    """
    kept = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) != 3:
                continue
            # The first field (tweet id) is not used.
            _, sentiment, text = fields
            code = map_label(sentiment)
            if code is None:
                continue
            kept.append((text, code))
    return pd.DataFrame({
        'Tweet': [text for text, _ in kept],
        'label': [code for _, code in kept],
    })

In [362]:
# Directory that holds the SemEval 2017 .txt files.
input_dir = 'twitter_data/custom_data/Semeval_2017/'

# Start from an empty frame; the loading cell that follows fills it from those files.
semeval2017_data = pd.DataFrame()

In [363]:
# Load every SemEval 2017 .txt file found in `input_dir` into one DataFrame.
#  * sorted(): os.listdir returns names in arbitrary order, so sorting keeps
#    the row order (and the index later written to CSV) stable between runs.
#  * Per-file frames are collected in a list and concatenated once instead of
#    re-copying the growing frame on every iteration.
#  * The result is rebuilt from scratch, so re-running this cell does not
#    append the same rows a second time.
# NOTE: `input_dir` is reassigned further down for the 2018 data, so this cell
# must run before that reassignment.
semeval2017_frames = []
for file_name in sorted(os.listdir(input_dir)):
    if file_name.endswith('.txt'):
        file_path = os.path.join(input_dir, file_name)
        semeval2017_frames.append(process_file(file_path))

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory contains no .txt files (same outcome as before in that case).
semeval2017_data = (
    pd.concat(semeval2017_frames, ignore_index=True)
    if semeval2017_frames
    else pd.DataFrame()
)

## SemEval 2018 Dataframe

In [364]:
# Re-point `input_dir` at the SemEval 2018 files (this overwrites the 2017 path set earlier).
input_dir = 'twitter_data/custom_data/Semeval_2018/'

In [365]:
# Start from an empty frame; the 2018 loading cell further down fills it.
semeval2018_data = pd.DataFrame()

In [366]:
def map_emotion_to_sentiment(row):
    """Collapse a row of 0/1 emotion flags into a 3-class sentiment code.

    `row` must support `row[name]` for every emotion listed below.

    Returns:
        2 when only positive emotions are flagged,
        0 when only negative emotions are flagged,
        1 otherwise (both polarities flagged, only 'surprise', or nothing).
    """
    emotion_groups = {
        'positive': ('joy', 'love', 'optimism', 'trust', 'anticipation'),
        'negative': ('anger', 'disgust', 'fear', 'pessimism', 'sadness'),
        'neutral': ('surprise',),
    }
    # Groups are evaluated in the order positive, negative, neutral. The
    # neutral flag does not change the result (every non-polar outcome is 1)
    # but its column is still read, as before.
    flagged = {
        group: any(row[emotion] == 1 for emotion in emotions)
        for group, emotions in emotion_groups.items()
    }

    has_positive = flagged['positive']
    has_negative = flagged['negative']
    if has_positive != has_negative:
        # Exactly one polarity is present.
        return 2 if has_positive else 0
    # Both or neither polarity present -> neutral.
    return 1

In [367]:
# Load every SemEval 2018 .txt file found in `input_dir` (tab-separated, read
# with pd.read_csv) into one DataFrame.
#  * sorted(): os.listdir returns names in arbitrary order, so sorting keeps
#    the row order stable between runs.
#  * Per-file frames are collected in a list and concatenated once instead of
#    re-copying the growing frame on every iteration.
#  * The result is rebuilt from scratch, so re-running this cell does not
#    append the same rows a second time.
semeval2018_frames = []
for file_name in sorted(os.listdir(input_dir)):
    if file_name.endswith('.txt'):
        file_path = os.path.join(input_dir, file_name)
        semeval2018_frames.append(pd.read_csv(file_path, sep='\t'))

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory contains no .txt files (same outcome as before in that case).
semeval2018_data = (
    pd.concat(semeval2018_frames, ignore_index=True)
    if semeval2018_frames
    else pd.DataFrame()
)

In [368]:
# Add a 'label' column by applying map_emotion_to_sentiment to every row (axis=1).
semeval2018_data['label'] = semeval2018_data.apply(map_emotion_to_sentiment, axis=1)

## SenWave Dataset

In [371]:
# Import the SenWave dataset.
# Convert its emotion labels to positive, neutral, or negative.
# Create new tweets by swapping keywords for synonyms (e.g. replacing "COVID-19" with "Corona Virus").

In [372]:
def map_emotion_to_sentiment_senwave(row):
    """Collapse a row of SenWave 0/1 emotion flags into a 3-class sentiment code.

    `row` must support `row[name]` for every emotion column named below.

    Returns:
        2 when only positive emotions are flagged,
        0 when only negative emotions are flagged,
        1 otherwise (both polarities flagged, only 'Official report' /
          'Joking', or nothing flagged).
    """
    def any_flagged(columns):
        # True as soon as one of the given columns equals 1.
        return any(row[column] == 1 for column in columns)

    has_positive = any_flagged(('Optimistic', 'Thankful', 'Empathetic'))
    has_negative = any_flagged(('Pessimistic', 'Anxious', 'Sad', 'Annoyed', 'Denial'))
    # The neutral columns are still read, in the same position as before, but
    # they cannot change the outcome: every non-polar case maps to 1.
    any_flagged(('Official report', 'Joking'))

    if has_positive and not has_negative:
        return 2
    if has_negative and not has_positive:
        return 0
    return 1

In [373]:
# Location of the SenWave CSV that is read into `senwave_df`.
input_filepath = 'twitter_data/custom_data/SenWaveDataset/labeledEn.csv'

In [374]:
# Read the SenWave CSV with pandas' default parsing options.
senwave_df = pd.read_csv(input_filepath)

In [375]:
# Add a 'label' column by applying map_emotion_to_sentiment_senwave to every row (axis=1).
senwave_df['label'] = senwave_df.apply(map_emotion_to_sentiment_senwave, axis=1)

## Combining Datasets

In [383]:
# Reduce each source to the two shared columns ('Tweet', 'label').
# .copy() gives every pruned frame its own data, independent of its source frame.
semeval2017_pruned_data = semeval2017_data.loc[:, ['Tweet', 'label']].copy()
semeval2018_pruned_data = semeval2018_data.loc[:, ['Tweet', 'label']].copy()
senwave_pruned_data = senwave_df.loc[:, ['Tweet', 'label']].copy()

In [384]:
# Save the SemEval 2017 subset on its own. to_csv is called with defaults, so
# the DataFrame index is written as well.
semeval2017_pruned_data.to_csv('twitter_data/custom_data/semeval_2017.csv')

In [385]:
# Stack the three pruned sources (2017, 2018, SenWave - in that order) into one
# frame; ignore_index=True renumbers the rows instead of keeping each source's index.
full_covid_sentiment_df = pd.concat([semeval2017_pruned_data, semeval2018_pruned_data, senwave_pruned_data], ignore_index=True)

In [386]:
# Save the combined 3-class dataset (index included, since to_csv defaults are used).
full_covid_sentiment_df.to_csv('twitter_data/custom_data/full_covid_sent.csv')

## Building the Neutral only Dataset

In [424]:
# Derive a new frame from the combined dataset in which label 2 is rewritten
# to 0; all other label values are kept. `assign` works on a copy, so
# full_covid_sentiment_df itself is left unchanged.
full_covid_sentiment_neutral_df = full_covid_sentiment_df.assign(
    label=full_covid_sentiment_df['label'].replace(2, 0)
)

In [427]:
# Save the relabelled dataset (index included, since to_csv defaults are used).
full_covid_sentiment_neutral_df.to_csv('twitter_data/custom_data/full_covid_sentiment_neutral_dataset.csv')

## Creating the Individual Sentiment Datasets

In [387]:
# Optimism dataset, step 1: split SenWave rows on the 'Optimistic' flag.
# Rows with the flag equal to 1 are the positive class; every other row is a
# candidate for the negative class.
senwave_optimism = senwave_df.loc[senwave_df['Optimistic'].eq(1)]
senwave_non_optimism = senwave_df.loc[senwave_df['Optimistic'].ne(1)]

In [388]:
# Down-sample the non-optimistic rows to twice the number of optimistic rows,
# with a fixed seed so the same rows are drawn on every run.
# The target is capped at the number of rows available: DataFrame.sample
# raises ValueError when asked for more rows than exist (sampling without
# replacement). When the pool is large enough the result is unchanged.
senwave_non_optimism_len = min(len(senwave_optimism) * 2, len(senwave_non_optimism))
senwave_non_optimism = senwave_non_optimism.sample(n=senwave_non_optimism_len, random_state=23)

In [389]:
# Positive rows first, then the sampled negatives; ignore_index=True renumbers the rows.
senwave_optimism_dataset = pd.concat([senwave_optimism, senwave_non_optimism], ignore_index=True)

In [390]:
# Keep only the text and the 'Optimistic' flag, exposing the flag as 'label'.
senwave_optimism_dataset = (
    senwave_optimism_dataset[['Tweet', 'Optimistic']]
    .rename(columns={'Optimistic': 'label'})
)

In [391]:
# Pessimistic dataset, step 1: split SenWave rows on the 'Pessimistic' flag.
# Rows with the flag equal to 1 are the positive class; every other row is a
# candidate for the negative class.
senwave_pessimistic = senwave_df.loc[senwave_df['Pessimistic'].eq(1)]
senwave_non_pessimistic = senwave_df.loc[senwave_df['Pessimistic'].ne(1)]

In [392]:
# Down-sample the non-pessimistic rows to twice the number of pessimistic
# rows, with a fixed seed so the same rows are drawn on every run.
# The target is capped at the number of rows available: DataFrame.sample
# raises ValueError when asked for more rows than exist (sampling without
# replacement). When the pool is large enough the result is unchanged.
senwave_non_pessimistic_len = min(len(senwave_pessimistic) * 2, len(senwave_non_pessimistic))
senwave_non_pessimistic = senwave_non_pessimistic.sample(n=senwave_non_pessimistic_len, random_state=23)

In [393]:
# Positive rows first, then the sampled negatives; ignore_index=True renumbers the rows.
senwave_pessimistic_dataset = pd.concat([senwave_pessimistic, senwave_non_pessimistic], ignore_index=True)

In [394]:
# Keep only the text and the 'Pessimistic' flag, exposing the flag as 'label'.
senwave_pessimistic_dataset = (
    senwave_pessimistic_dataset[['Tweet', 'Pessimistic']]
    .rename(columns={'Pessimistic': 'label'})
)

In [395]:
# Sad dataset, step 1: split SenWave rows on the 'Sad' flag.
# Rows with the flag equal to 1 are the positive class; every other row is a
# candidate for the negative class.
senwave_sad = senwave_df.loc[senwave_df['Sad'].eq(1)]
senwave_non_sad = senwave_df.loc[senwave_df['Sad'].ne(1)]

In [396]:
# Down-sample the non-sad rows to twice the number of sad rows, with a fixed
# seed so the same rows are drawn on every run.
# The target is capped at the number of rows available: DataFrame.sample
# raises ValueError when asked for more rows than exist (sampling without
# replacement). When the pool is large enough the result is unchanged.
# (len * 2 is already an int, so no rounding is needed.)
senwave_non_sad_len = min(len(senwave_sad) * 2, len(senwave_non_sad))
senwave_non_sad = senwave_non_sad.sample(n=senwave_non_sad_len, random_state=23)

In [397]:
# Positive rows first, then the sampled negatives; ignore_index=True renumbers the rows.
senwave_sad_dataset = pd.concat([senwave_sad, senwave_non_sad], ignore_index=True)

In [398]:
# Keep only the text and the 'Sad' flag, exposing the flag as 'label'.
senwave_sad_dataset = (
    senwave_sad_dataset[['Tweet', 'Sad']]
    .rename(columns={'Sad': 'label'})
)

In [399]:
# Annoyed dataset, step 1: split SenWave rows on the 'Annoyed' flag.
# Rows with the flag equal to 1 are the positive class; every other row is a
# candidate for the negative class.
senwave_annoyed = senwave_df.loc[senwave_df['Annoyed'].eq(1)]
senwave_non_annoyed = senwave_df.loc[senwave_df['Annoyed'].ne(1)]

In [400]:
# Down-sample the non-annoyed rows to 1.8x the number of annoyed rows
# (rounded to a whole row count), with a fixed seed so the same rows are
# drawn on every run.
# The target is capped at the number of rows available: DataFrame.sample
# raises ValueError when asked for more rows than exist (sampling without
# replacement). When the pool is large enough the result is unchanged.
senwave_non_annoyed_len = min(round(len(senwave_annoyed) * 1.8), len(senwave_non_annoyed))
senwave_non_annoyed = senwave_non_annoyed.sample(n=senwave_non_annoyed_len, random_state=23)

In [401]:
# Positive rows first, then the sampled negatives; ignore_index=True renumbers the rows.
senwave_annoyed_dataset = pd.concat([senwave_annoyed, senwave_non_annoyed], ignore_index=True)

In [402]:
# Keep only the text and the 'Annoyed' flag, exposing the flag as 'label'.
senwave_annoyed_dataset = (
    senwave_annoyed_dataset[['Tweet', 'Annoyed']]
    .rename(columns={'Annoyed': 'label'})
)

In [403]:
# Joking dataset, step 1: split SenWave rows on the 'Joking' flag.
# Rows with the flag equal to 1 are the positive class; every other row is a
# candidate for the negative class.
senwave_joking = senwave_df.loc[senwave_df['Joking'].eq(1)]
senwave_non_joking = senwave_df.loc[senwave_df['Joking'].ne(1)]

In [404]:
# Down-sample the non-joking rows to 1.1x the number of joking rows (rounded
# to a whole row count), with a fixed seed so the same rows are drawn on
# every run.
# The target is capped at the number of rows available: DataFrame.sample
# raises ValueError when asked for more rows than exist (sampling without
# replacement). When the pool is large enough the result is unchanged.
senwave_non_joking_len = min(round(len(senwave_joking) * 1.1), len(senwave_non_joking))
senwave_non_joking = senwave_non_joking.sample(n=senwave_non_joking_len, random_state=23)

In [405]:
# Positive rows first, then the sampled negatives; ignore_index=True renumbers the rows.
senwave_joking_dataset = pd.concat([senwave_joking, senwave_non_joking], ignore_index=True)

In [406]:
# Keep only the text and the 'Joking' flag, exposing the flag as 'label'.
senwave_joking_dataset = (
    senwave_joking_dataset[['Tweet', 'Joking']]
    .rename(columns={'Joking': 'label'})
)

In [407]:
# Write each per-emotion dataset to its own CSV, in the same order as before.
# to_csv is called with defaults, so the DataFrame index is written as well.
for _emotion_frame, _csv_path in (
    (senwave_optimism_dataset, 'twitter_data/custom_data/optimism_dataset.csv'),
    (senwave_pessimistic_dataset, 'twitter_data/custom_data/pessimistic_dataset.csv'),
    (senwave_sad_dataset, 'twitter_data/custom_data/sad_dataset.csv'),
    (senwave_annoyed_dataset, 'twitter_data/custom_data/annoyed_dataset.csv'),
    (senwave_joking_dataset, 'twitter_data/custom_data/joking_dataset.csv'),
):
    _emotion_frame.to_csv(_csv_path)