In [1]:
import pandas as pd
import numpy as np

from config import Config

from affective_text import prepare_affective_text
from CARER import load_carer_data
from daily_dialog import load_daily_dialog_data
from emotion_detection_master import load_emotion_detection_master_data
from goemotions import load_goemotion_data
from fairy_tails import load_fairy_tail_data
from meld_master import load_meld_master_data
from ChatGPT import load_gpt_data
from Survivor import load_survivor_data
from Functions.pre_process import extract_sentiment_nltk, remove_stopwords, apply_stemming, nltk_pos_tag, spacy_ner_tag
from tqdm import tqdm
import os
tqdm.pandas()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Name of this dataset variant: used as the output sub-folder name and
# reported as "Dataset name" in the generated README.
folder_name = "chatgpt_only"

In [3]:
# Output directory for this dataset variant (relative path).
folder = f"../processed_data/{folder_name}"

In [4]:
# Refuse to reuse an existing output folder: the run stops here if the
# target path is already present, otherwise the folder is created.
if os.path.exists(folder):
    print("Folder already exists")
    raise FileExistsError("Folder already exists")

os.makedirs(folder)

In [5]:
# Copy the run settings from Config into notebook-level names.
# Spellings such as `goeomotions`, `emotion_colm_name` and `priotize_happy`
# mirror the attribute names on Config and must stay in sync with it.

# Dataset switches: each flag decides whether the matching loader is called
# in the loading cell.
affectivetext = Config.affectivetext
carer = Config.carer
chatgpt_generated = Config.chatgpt_generated
daily_dialog = Config.daily_dialog
emotion_detection_master = Config.emotion_detection_master
fairy_tails = Config.fairy_tails
goeomotions = Config.goeomotions
meld_masters = Config.meld_masters
survivor = Config.survivor

# Name of the label column, plus label/sentiment options.
# `happiness_other` collapses every label other than 'happiness' to 'other';
# `get_positive` / `get_highest` are forwarded to extract_sentiment_nltk.
emotion_column_name = Config.emotion_colm_name
happiness_other = Config.happiness_other
extract_sentiment = Config.extract_sentiment
get_positive = Config.get_positive
get_highest = Config.get_highest

# Sentiment output column and backend (this notebook only handles 'nltk'),
# followed by the stop-word removal and stemming toggles.
sentiment_colmn_name = Config.sentiment_colmn_name
sentiment_library = Config.sentiment_library
keep_only_compound = Config.keep_only_compound
remove_stopword = Config.remove_stopwords
stemmer = Config.stemmer

# Named-entity tagging toggle and the column that receives the tags.
NER_extraction = Config.NER_extraction
NER_colmn_name = Config.NER_colmn_name

# Part-of-speech tagging toggle and the column that receives the tags.
POS_extraction = Config.POS_extraction
POS_colmn_name = Config.POS_colmn_name

# In this notebook the flag only selects the GoEmotions note written to the
# README; presumably the prioritisation itself happens in the loader - TODO confirm.
priotize_happy = Config.priotize_happy

# When truthy, every label is randomly down-sampled to the size of the rarest one.
evenly_distributed = Config.evenly_distributed

In [6]:
# Load every dataset enabled in Config and stack them into a single frame.
#
# Each entry is (enabled flag, loader function, bullet line for the README).
# Loaders are referenced here and only called when their flag is truthy, so a
# disabled dataset is never read. The order below is the row order of final_df.
dataset_sources = [
    (affectivetext, prepare_affective_text, "- Affective Text \n"),
    (carer, load_carer_data, "- Carer \n"),
    (daily_dialog, load_daily_dialog_data, "- Daily Dialog \n"),
    (emotion_detection_master, load_emotion_detection_master_data, "- Emotion Detection Master \n"),
    (goeomotions, load_goemotion_data, "- Goemotions \n"),
    (fairy_tails, load_fairy_tail_data, "- Fairy Tails \n"),
    (meld_masters, load_meld_master_data, "- Meld Masters \n"),
    (chatgpt_generated, load_gpt_data, "- ChatGPT \n"),
    (survivor, load_survivor_data, "- Survivor \n"),
]

loaded_frames = []
dataset_used = ""
for enabled, loader, readme_bullet in dataset_sources:
    if enabled:
        loaded_frames.append(loader())
        dataset_used += readme_bullet

# Concatenate once instead of growing final_df step by step: this avoids
# re-copying the accumulated rows for every dataset and avoids concatenating
# onto an empty DataFrame (deprecated behaviour in recent pandas).
# With no dataset enabled, final_df stays an empty frame as before.
final_df = pd.concat(loaded_frames, ignore_index=True) if loaded_frames else pd.DataFrame()

# README note that is only relevant when GoEmotions is part of the export.
text_prioritize_happy = ""
if goeomotions:
    if priotize_happy:
        text_prioritize_happy = """
### GoEmotions
Sentences with multiple emotions where one of them is happiness, are prioritized to have happy emotion. \n"""
    else:
        text_prioritize_happy = " "

# Turn the literal string 'nan' into a real missing value so that a later
# dropna() removes those rows.
final_df[emotion_column_name] = final_df[emotion_column_name].replace('nan', np.nan)

In [7]:
# Remove every row that has a missing value in any column.
final_df = final_df.dropna()

In [8]:
# Rewrite alternative label spellings to the label values defined in Config.
# The pairs are applied one after another, in this order.
label_aliases = (
    ('Sad', Config.sad),
    ('Happy', Config.happy),
    ('Mad', Config.mad),
    ('Surprised', Config.surpised),
    ('Disgusted', Config.disgusted),
    ('Disgust', Config.disgusted),
    ('fear', Config.scared),
    ('Scared', Config.scared),
)

final_df = final_df.copy()
emotion_labels = final_df[emotion_column_name]
for alias, config_label in label_aliases:
    emotion_labels = emotion_labels.replace(alias, config_label)
final_df[emotion_column_name] = emotion_labels

In [9]:
# List the distinct values present in the emotion column.
final_df[emotion_column_name].unique()

array(['happiness', 'sadness', 'anger', 'surprise', 'scared', 'disgust'],
      dtype=object)

In [10]:
# Rows per emotion label before balancing. Uses the configured column name
# instead of a hard-coded "emotion" so it follows Config like the other cells.
final_df[emotion_column_name].value_counts()

emotion
disgust      21089
anger        11823
happiness     1901
sadness       1901
surprise      1901
scared        1901
Name: count, dtype: int64

In [11]:
# Optionally balance the classes by down-sampling every label to the size of
# the rarest one; also records a sentence describing what was done.
if not evenly_distributed:
    text_evenly_distributed = "Emotions have not been evenly distributed \n"
else:
    minimum = final_df[emotion_column_name].value_counts().min()
    for label in final_df[emotion_column_name].unique():
        rows_for_label = final_df[final_df[emotion_column_name] == label]
        surplus = rows_for_label.shape[0] - minimum
        # Fixed seed: the same surplus rows are discarded on every run.
        surplus_index = rows_for_label.sample(surplus, random_state=1).index
        final_df = final_df.drop(surplus_index)
    text_evenly_distributed = "Emotions have been evenly distributed by randomly removing rows until all emotions have the same amount. \n"



In [12]:
# Rows per emotion label after the optional balancing step. Uses the
# configured column name instead of a hard-coded "emotion".
final_df[emotion_column_name].value_counts()

emotion
disgust      21089
anger        11823
happiness     1901
sadness       1901
surprise      1901
scared        1901
Name: count, dtype: int64

In [13]:
# README line describing the sentiment backend; stays a single space unless
# NLTK sentiment extraction runs in the preprocessing cell.
text_senti = " "

In [14]:
# Optional preprocessing steps, each switched on by its Config flag.
# Order matters: stop words are removed before the sentiment / NER / POS
# features are computed, and stemming runs last on the sentence text.

if happiness_other:
    print("Changing the emotion column to happiness and other")
    # Uses the configured label column rather than a hard-coded 'emotion'.
    final_df[emotion_column_name] = final_df[emotion_column_name].apply(
        lambda label: 'happiness' if label == 'happiness' else 'other'
    )

if remove_stopword:
    print("Removing the stopwords")
    final_df['sentence'] = final_df['sentence'].progress_apply(remove_stopwords)

if extract_sentiment:
    print("Extracting the sentiment")
    if sentiment_library == 'nltk':
        final_df[sentiment_colmn_name] = final_df['sentence'].progress_apply(
            lambda sentence: extract_sentiment_nltk(
                sentence,
                keep_only_compound=keep_only_compound,
                get_positive=get_positive,
                get_highest=get_highest,
            )
        )
        text_senti = "Sentiment is using NLTK"
    else:
        # Previously this case was skipped silently even though the README
        # reports sentiment extraction as enabled; make it visible.
        print(f"Sentiment library '{sentiment_library}' is not supported; no sentiment column was added")

if NER_extraction:
    print("Extracting the NER tags")
    final_df[NER_colmn_name] = final_df['sentence'].progress_apply(spacy_ner_tag)

if POS_extraction:
    print("Extracting the POS tags")
    final_df[POS_colmn_name] = final_df['sentence'].progress_apply(nltk_pos_tag)

if stemmer:
    print("Applying the stemming")
    final_df['sentence'] = final_df['sentence'].progress_apply(apply_stemming)


In [15]:
# Inspect the processed frame before it is described and exported.
final_df

Unnamed: 0,sentence,emotion
0,I feel overjoyed and filled with warmth knowin...,happiness
1,Her tears mingled with the rain as she stood a...,sadness
2,I can't believe you would betray my trust like...,anger
3,I can't believe it!,surprise
4,The sound of footsteps approaching in the dark...,scared
...,...,...
40511,The mere thought of their presence makes my sk...,disgust
40512,I find myself creating excuses to keep a dista...,disgust
40513,The carefully planned itinerary was suddenly w...,disgust
40514,The feeling weighs heavy like a dark cloud han...,disgust


In [16]:
# Build the README bullet list, one "- <label>(<count>)" line per emotion
# label, in value_counts() order. The counts are computed once instead of
# once per label.
emotion_counts = final_df[emotion_column_name].value_counts()
text_unique_values = "".join(
    f"- {label}({count})\n" for label, count in emotion_counts.items()
)

In [17]:
# Show the generated bullet list that will be embedded in the README.
text_unique_values

'- disgust(21089)\n- anger(11823)\n- happiness(1901)\n- sadness(1901)\n- surprise(1901)\n- scared(1901)\n'

In [18]:
# Markdown description of this export: source datasets, label column and
# label counts, the preprocessing flags that were applied, basic statistics
# and a five-row preview. Written to README.md in the output folder.
readme_text = f"""
### In this folder a dataset is present, with the following 'settings'/ filters / features applied:

## Dataset Information

Dataset name: {folder_name}

Datasets used:

{dataset_used}


Name of the emotion column: {emotion_column_name}


Which emotions does it contain: 

{text_unique_values}


## Preprocessing steps

Collapsed emotions into happiness / other: __{happiness_other}__

Removed stopwords: __{remove_stopword}__

Extracted sentiment: __{extract_sentiment}__

Extracted NER tags: __{NER_extraction}__

Extracted POS tags: __{POS_extraction}__

{text_senti}

Applied stemming: __{stemmer}__

{text_evenly_distributed}

## Dataset Statistics

Number of rows: __{final_df.shape[0]}__


Number of columns: __{final_df.shape[1]}__

Compressed: __Zip__




{final_df[emotion_column_name].value_counts()}



## Dataset Preview
{final_df.head(5).to_markdown()}


{text_prioritize_happy}


"""

In [19]:
# Write the README into the output folder. The encoding is set explicitly
# because the preview table contains dataset sentences, and the platform
# default encoding (e.g. cp1252 on Windows) can fail on non-ASCII characters.
with open(folder + '/README.md', 'w', encoding='utf-8') as readme_file:
    readme_file.write(readme_text)

In [20]:
# Export the processed frame into the output folder. `folder` already starts
# with "../processed_data/", so it is used as-is instead of being prefixed a
# second time.
# NOTE(review): the file is a zip archive despite its .csv name, so readers
# must pass compression='zip' explicitly when loading it.
final_df.to_csv(f"{folder}/dataset.csv", index=False, compression='zip')