In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

from config import Config

from affective_text import prepare_affective_text
from CARER import load_carer_data
from daily_dialog import load_daily_dialog_data
from emotion_detection_master import load_emotion_detection_master_data
from goemotions import load_goemotion_data
from fairy_tails import load_fairy_tail_data
from meld_master import load_meld_master_data
from ChatGPT import load_gpt_data
from Survivor import load_survivor_data
from Functions.pre_process import extract_sentiment_nltk, remove_stopwords, apply_stemming, nltk_pos_tag, spacy_ner_tag
tqdm.pandas()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wesle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# File name of the combined dataset; the last cell writes it under ../processed_data/.
final_dataset_name = "tes1.csv"

In [3]:
# Local aliases for the settings read from Config, grouped by the stage that uses them.

# --- Dataset switches: each truthy flag adds that corpus to the combined frame ---
affectivetext = Config.affectivetext
carer = Config.carer
daily_dialog = Config.daily_dialog
emotion_detection_master = Config.emotion_detection_master
goeomotions = Config.goeomotions
fairy_tails = Config.fairy_tails
meld_masters = Config.meld_masters
chatgpt_generated = Config.chatgpt_generated
survivor = Config.survivor

# --- Label handling ---
emotion_column_name = Config.emotion_colm_name
happiness_other = Config.happiness_other  # collapse labels to 'happiness' / 'other'

# --- Text clean-up applied to the sentence column ---
remove_stopword = Config.remove_stopwords
stemmer = Config.stemmer

# --- Sentiment feature (the three get_*/keep_* options are forwarded to extract_sentiment_nltk) ---
extract_sentiment = Config.extract_sentiment
sentiment_library = Config.sentiment_library
sentiment_colmn_name = Config.sentiment_colmn_name
keep_only_compound = Config.keep_only_compound
get_positive = Config.get_positive
get_highest = Config.get_highest

# --- Named-entity feature ---
NER_extraction = Config.NER_extraction
NER_colmn_name = Config.NER_colmn_name

# --- Part-of-speech feature ---
POS_extraction = Config.POS_extraction
POS_colmn_name = Config.POS_colmn_name

In [4]:
# Pair every dataset switch with its loader. The list order is the order in
# which the corpora are loaded and stacked in the combined frame.
dataset_sources = [
    (affectivetext, prepare_affective_text),
    (carer, load_carer_data),
    (daily_dialog, load_daily_dialog_data),
    (emotion_detection_master, load_emotion_detection_master_data),
    (goeomotions, load_goemotion_data),
    (fairy_tails, load_fairy_tail_data),
    (meld_masters, load_meld_master_data),
    (chatgpt_generated, load_gpt_data),
    (survivor, load_survivor_data),
]

# Load only the enabled corpora and concatenate them in a single call. This
# avoids seeding the result with an empty DataFrame and re-copying the
# accumulated rows once per dataset.
enabled_frames = [load_dataset() for is_enabled, load_dataset in dataset_sources if is_enabled]

if not enabled_frames:
    # Without this guard the column lookup below fails with an unhelpful KeyError.
    raise ValueError("No dataset is enabled in Config; there is nothing to combine.")

final_df = pd.concat(enabled_frames, ignore_index=True)

# Turn the literal string 'nan' into a real missing value so that a later
# dropna() removes those rows.
final_df[emotion_column_name] = final_df[emotion_column_name].replace('nan', np.nan)

In [5]:
# Discard every row that has a missing value in any column.
final_df = final_df.dropna()

In [6]:
# Work on an independent copy before assigning to the label column.
final_df = final_df.copy()

# Raw label spellings mapped to the canonical label names defined in Config.
# All rules are applied in one pass, so no rule can re-map the output of
# another and the column is scanned once instead of eight times.
label_aliases = {
    'Sad': Config.sad,
    'Happy': Config.happy,
    'Mad': Config.mad,
    'Surprised': Config.surpised,  # attribute is spelled this way in Config
    'Disgusted': Config.disgusted,
    'Disgust': Config.disgusted,
    'fear': Config.scared,
    'Scared': Config.scared,
}
final_df[emotion_column_name] = final_df[emotion_column_name].replace(label_aliases)

In [7]:
# Inspect the distinct emotion labels present after the label replacements.
final_df[emotion_column_name].unique()

array(['happiness', 'sadness', 'scared', 'surprise', 'disgust', 'anger'],
      dtype=object)

In [8]:
# Optional preprocessing / feature-extraction steps, each gated by a Config flag.
# Stemming is applied last, so the sentiment, NER and POS steps read the
# unstemmed sentence text.

if happiness_other:
    print("Changing the emotion column to happiness and other")
    # Binary relabelling: keep 'happiness', collapse every other label to 'other'.
    # Uses the configured label column rather than a hard-coded 'emotion'.
    is_happiness = final_df[emotion_column_name] == 'happiness'
    final_df[emotion_column_name] = final_df[emotion_column_name].where(is_happiness, 'other')

if remove_stopword:
    print("Removing the stopwords")
    # NOTE(review): this runs before sentiment/NER/POS extraction, so those
    # steps see stopword-stripped text — confirm that is intended.
    final_df['sentence'] = final_df['sentence'].progress_apply(remove_stopwords)

if extract_sentiment:
    print("Extracting the sentiment")
    if sentiment_library == 'nltk':
        final_df[sentiment_colmn_name] = final_df['sentence'].progress_apply(
            lambda text: extract_sentiment_nltk(
                text,
                keep_only_compound=keep_only_compound,
                get_positive=get_positive,
                get_highest=get_highest,
            )
        )
    else:
        # Only 'nltk' is handled here; say so instead of silently adding no column.
        print(f"Unsupported sentiment library {sentiment_library!r}; no sentiment column was added")

if NER_extraction:
    print("Extracting the NER tags")
    final_df[NER_colmn_name] = final_df['sentence'].progress_apply(spacy_ner_tag)

if POS_extraction:
    print("Extracting the POS tags")
    final_df[POS_colmn_name] = final_df['sentence'].progress_apply(nltk_pos_tag)

if stemmer:
    print("Applying the stemming")
    final_df['sentence'] = final_df['sentence'].progress_apply(apply_stemming)


Changing the emotion column to happiness and other


In [9]:
# Display the processed frame (pandas truncates the rendering of large frames).
final_df

Unnamed: 0,sentence,emotion
0,Test to predict breast cancer relapse is approved,happiness
1,"Two Hussein allies are hanged, Iraqi official ...",other
2,Sights and sounds from CES,happiness
3,Schuey sees Ferrari unveil new car,happiness
4,Closings and cancellations top advice on flu o...,other
...,...,...
634705,"I am extremely,extremely proud",happiness
634706,Survivor sometimes can be tough to understand,other
634707,"because it's playing games, but it's serious",other
634708,I am in immense gratitude to have been able,happiness


In [10]:
# Persist the processed dataset.
# NOTE: compression='zip' makes the file on disk a ZIP archive whatever its
# name is. When final_dataset_name does not end in ".zip", readers must pass
# compression='zip' to pd.read_csv because extension-based inference will not
# detect the archive.
output_path = Path("../processed_data") / final_dataset_name

# Create the target directory first so a finished preprocessing run is not
# lost to a missing folder.
output_path.parent.mkdir(parents=True, exist_ok=True)

final_df.to_csv(output_path, index=False, compression='zip')